In [1]:
import pandas as cd
import numpy as np

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

from cuml.linear_model import LogisticRegression

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

from matplotlib import pyplot as plt
import seaborn as sns
import re

In [2]:
# Competition input directory on Kaggle. Change this one constant to run the
# notebook against another copy of the data.
DATA_DIR = '/kaggle/input/amazon-pet-product-reviews-classification'

# Missing values in every column (not only the text) are replaced with a
# single space, so the string preprocessing later never receives NaN.
train_data = cd.read_csv(DATA_DIR + '/train.csv').fillna(' ')
valid_data = cd.read_csv(DATA_DIR + '/valid.csv').fillna(' ')
test_data = cd.read_csv(DATA_DIR + '/test.csv').fillna(' ')

In [3]:
# Print (rows, columns) for the train and test splits, then preview train.
for split_frame in (train_data, test_data):
    print(split_frame.shape)
train_data.head()

In [4]:
# Class balance of the training labels.
# The Series is passed as `x=`: seaborn >= 0.12 interprets the first
# positional argument as `data`, and 0.11 emits a FutureWarning for it.
sns.countplot(x=train_data['label'])
plt.title('Target distribution');

In [5]:
# Enable tqdm's pandas integration.
# NOTE(review): no cell visible in this notebook calls `progress_apply`, so
# this registration appears unused — confirm before removing.
from tqdm import tqdm
tqdm.pandas()

In [6]:
def features(df, text_col='text'):
    """Return a copy of ``df`` with cleaned, stemmed text columns added.

    Every value of ``df[text_col]`` is processed as follows: each character
    that is not an ASCII letter (``[^A-Za-z]``) is replaced by a space, the
    text is lowercased and tokenized with ``word_tokenize``, English
    stopwords are dropped, and the remaining tokens are Porter-stemmed.

    Parameters
    ----------
    df : DataFrame
        Must contain ``text_col`` holding strings; ``re`` raises on
        non-string values such as NaN.
    text_col : str, default 'text'
        Name of the column holding the raw text.

    Returns
    -------
    DataFrame
        A copy of ``df`` with two extra columns: ``prepared_text`` (list of
        stemmed tokens per row) and ``joined_prepared_text`` (those tokens
        joined with single spaces). The input frame is not modified.
    """
    df = df.copy()
    # A set gives O(1) membership checks; with the raw list every token
    # triggered a linear scan over all stopwords.
    stops = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    non_letters = re.compile('[^A-Za-z]')

    def prepare(text):
        # remove special characters, lowercase, then tokenize
        tokens = word_tokenize(non_letters.sub(' ', text).lower())
        # stopwords are filtered before stemming, i.e. on their surface form
        return [stemmer.stem(token) for token in tokens if token not in stops]

    # One pass over the column instead of one pass per preprocessing step.
    df['prepared_text'] = df[text_col].apply(prepare)
    # join prepared_text to use as corpus
    df['joined_prepared_text'] = df['prepared_text'].apply(lambda words: " ".join(words))

    return df

In [7]:
# Preprocess the training split; `features` returns a copy, so `train_data`
# keeps only its original columns.
train=features(train_data)
train.head()

In [8]:
# Apply the same preprocessing to the test split.
test=features(test_data)
test.head()

In [9]:
# Training corpus: one space-joined string of stemmed tokens per row.
corpus = train['joined_prepared_text'].values
# Show the first two documents as a sanity check.
corpus[:2]

In [10]:
# TF-IDF vectorizer with the vocabulary capped at 10000 features.
td = TfidfVectorizer(max_features=10000)

In [11]:
# Fit the vectorizer on the training corpus only and transform it; the
# validation and test corpora are later passed through `td.transform`.
x = td.fit_transform(corpus)
x.shape

In [12]:
# Build both directions of the label <-> integer id mapping. Ids are assigned
# in sorted order of the distinct training labels.
sorted_labels = sorted(train_data['label'].unique())
labels_to_ids = {name: idx for idx, name in enumerate(sorted_labels)}
ids_to_labels = dict(enumerate(sorted_labels))

labels_to_ids, ids_to_labels

In [13]:
# Encode the training labels as integer ids using `labels_to_ids`.
y = train['label'].map(labels_to_ids).values
y

In [14]:
# Fit cuML's LogisticRegression on the TF-IDF training matrix.
# NOTE(review): C=5e1 (i.e. 50), solver='qn' and max_iter=400 are hard-coded;
# the notebook does not show how they were chosen — confirm or document.
clf=LogisticRegression(C=5e1, solver='qn',max_iter=400).fit(x,y)

#### Training Metrics

In [15]:
# Predict on the same matrix the model was fit on (training-set predictions).
y_pred = clf.predict(x)

In [16]:
# Class names in id order: `ids_to_labels` was filled in ascending id order
# and dicts preserve insertion order.
labels = list(ids_to_labels.values())

In [17]:
# Per-class metrics on the training data. `y_pred` here holds predictions for
# the rows the model was fit on, so these numbers do not measure generalization.
print(classification_report(y, y_pred, target_names=labels))

#### Validation Metrics

In [19]:
# Preprocess the validation split with the same pipeline used for train/test.
val = features(valid_data)

In [20]:
# Validation corpus: one space-joined string of stemmed tokens per row.
val_corpus = val['joined_prepared_text'].values
val_corpus[:2]

In [21]:
# Transform only (no refit): reuses the vectorizer fitted on the training corpus.
x_val = td.transform(val_corpus)
x_val.shape

In [22]:
# Encode validation labels with the mapping built from the training labels.
# NOTE(review): a validation label absent from `labels_to_ids` would map to
# NaN — confirm every validation label also occurs in train.
y_true = val['label'].map(labels_to_ids).values
# Overwrites `y_pred`, which previously held the training-set predictions.
y_pred = clf.predict(x_val)

In [23]:
# Per-class metrics on the validation split.
# `labels=` pins the report to every known class id, so the rows stay aligned
# with `target_names` even when a class is missing from the validation labels
# and predictions (otherwise the names would misalign or sklearn would raise).
print(classification_report(y_true, y_pred, labels=list(ids_to_labels), target_names=labels))

#### Submission

In [24]:
# Load the sample submission file; its 'label' column is overwritten with the
# model's predictions in the cells below.
sample_submission = cd.read_csv('/kaggle/input/amazon-pet-product-reviews-classification/sample_submission.csv')

In [25]:
# Test corpus: one space-joined string of stemmed tokens per row.
test_corpus = test['joined_prepared_text'].values
test_corpus[:2]

In [26]:
# Transform only (no refit): reuses the vectorizer fitted on the training corpus.
x_test = td.transform(test_corpus)
x_test.shape

In [27]:
# Predicted class ids for the test rows (still integer-encoded at this point).
y_predicted = clf.predict(x_test)

In [28]:
# Sanity check: the number of predictions should equal the number of test rows.
y_predicted.shape

In [30]:
# Store the predicted class ids in the 'label' column.
# Assumes sample_submission rows are in the same order as test.csv — TODO confirm.
sample_submission['label'] = y_predicted

In [31]:
# Convert the class ids back to the original label values.
# Not idempotent: running this cell a second time without re-running the
# assignment above maps label values through `ids_to_labels` and yields NaN.
sample_submission['label'] = sample_submission['label'].map(ids_to_labels)

In [36]:
# Write the submission file. `index=False` is the documented way to omit the
# row index column (the previous `index=None` only worked by being falsy).
sample_submission.to_csv('submission.csv', index=False, header=True)