#### Importing Libraries

In [None]:
import spacy
import pandas as pd

#### Loading the dataset

In [None]:
# Read the raw review data from a CSV file located relative to the working directory.
df = pd.read_csv('amazon.csv')
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,Text,label
0,This is the best apps acording to a bunch of ...,1
1,This is a pretty good version of the game for ...,1
2,this is a really . there are a bunch of levels...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1


In [None]:
# Number of rows per class label (reveals the class imbalance).
df['label'].value_counts()

label
1    15230
0     4766
Name: count, dtype: int64

In [None]:
# Count missing values in every column.
df.isna().sum()

Text     0
label    0
dtype: int64

#### Sampling

In [None]:
# Balance the two classes by down-sampling each one to the size of the rarer class.
# The size is derived from the data instead of being hard-coded, so the cell keeps
# working if the CSV changes.
min_samples = int(df.label.value_counts().min())

# Same seed for both draws so the balanced subset is reproducible.
df_positive = df[df.label == 1].sample(min_samples, random_state=2022)
df_negative = df[df.label == 0].sample(min_samples, random_state=2022)

In [None]:
# Stack the two equally sized class samples row-wise and rebind `df` to the result.
balanced_parts = [df_positive, df_negative]
df = pd.concat(balanced_parts, axis=0)

# Confirm both labels now have the same number of rows.
df['label'].value_counts()

label
1    4766
0    4766
Name: count, dtype: int64

In [None]:
df.shape  # (rows, columns) of the dataset after class balancing

(9532, 2)

In [None]:
!spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
# Load the full 'en_core_web_lg' pipeline; `nlp` is the module-level object that
# `preprocess` calls on every review.
# NOTE(review): preprocess only reads is_punct / is_stop / lemma_; components that
# are not needed for those could presumably be disabled for speed - confirm first.
nlp = spacy.load('en_core_web_lg')

#### Preprocessing the Data

In [None]:
import re
# Matches one HTML-like tag: '<', one or more characters that are not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag deleted.

    Each match of ``TAG_RE`` is replaced by the empty string (not by a space),
    so two words separated only by a tag end up joined together.
    """
    without_tags = re.sub(TAG_RE, '', text)
    return without_tags

def preprocess(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text and strip HTML-like tags via ``remove_tags``;
      2. replace every character that is not an ASCII letter with a space;
      3. drop every stand-alone single letter (e.g. the "s" left after an
         apostrophe was turned into a space);
      4. collapse runs of whitespace and trim both ends;
      5. run the module-level spaCy pipeline ``nlp`` and keep the lemma of each
         token that is not punctuation, a stop word, or whitespace.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (empty string if none).
    """
    text = remove_tags(text.lower())

    # Remove punctuation and numbers: anything that is not a letter becomes a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Single-character removal. Word boundaries are used instead of requiring
    # whitespace on both sides: the old pattern consumed the surrounding spaces,
    # so in "x a b y" only "a" was removed, and single letters at the very start
    # or end of the text were never matched at all.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Collapse multiple spaces and trim, so no leading blank is handed to spaCy
    # (a leading space can otherwise come back as a whitespace token).
    text = re.sub(r'\s+', ' ', text).strip()

    doc = nlp(text)
    # Keep lemmas of content tokens only; whitespace tokens are skipped as a safeguard.
    filtered_tokens = [
        token.lemma_
        for token in doc
        if not (token.is_punct or token.is_stop or token.is_space)
    ]
    return " ".join(filtered_tokens)

In [None]:
# Clean every review element-wise and store the result in a new column.
raw_reviews = df['Text']
df['preprocessed_txt'] = raw_reviews.map(preprocess)

In [None]:
# Compare a few raw reviews with their preprocessed counterparts.
df.head()

Unnamed: 0,Text,label,preprocessed_txt
5716,the recent updates have made this app amazing ...,1,recent update app amazing great work new widge...
15653,"This is, by far, the best newspaper app I have...",1,far good newspaper app see paper want include ...
2718,I chose this rating because I enjoy reading ab...,1,choose rating enjoy read news tablet read news...
9489,For doing multiple file operations particularl...,1,multiple file operation particularly useful ja...
4783,"A wonderful aid for anyone, young or old prepa...",1,wonderful aid young old prepare confession esp...


In [None]:
# Row count is unchanged; one extra column ('preprocessed_txt') was added.
df.shape

(9532, 3)

#### Train-Test splitting of the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class ratio the same in both parts, and the fixed seed makes the split repeatable.
features = df['preprocessed_txt']
targets = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=2022,
    stratify=targets,
)

In [None]:
# Report how many samples ended up in each part of the split.
train_shape, test_shape = X_train.shape, X_test.shape
print("Shape of X_train: ", train_shape)
print("Shape of X_test: ", test_shape)

Shape of X_train:  (7625,)
Shape of X_test:  (1907,)


#### Model Training and Evaluation

##### Random Forest

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object: tri-gram-only counts (ngram_range=(3, 3)) -> random forest.
#   random_state pins the forest's internal randomness so the report below is
#   reproducible from run to run (same seed value as the sampling and split cells).
clf = Pipeline([
    ('vectorizer_tri_grams', CountVectorizer(ngram_range=(3, 3))),
    ('random_forest', RandomForestClassifier(random_state=2022)),
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)

#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.98      0.70       953
           1       0.91      0.19      0.31       954

    accuracy                           0.58      1907
   macro avg       0.73      0.58      0.51      1907
weighted avg       0.73      0.58      0.51      1907



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


#1. create a pipeline object: TF-IDF features -> random forest.
#   random_state pins the forest's internal randomness so the report below is
#   reproducible from run to run.
clf = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('Random Forest', RandomForestClassifier(random_state=2022)),
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)

#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       953
           1       0.85      0.81      0.83       954

    accuracy                           0.84      1907
   macro avg       0.84      0.84      0.84      1907
weighted avg       0.84      0.84      0.84      1907



##### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Step 1 - assemble the pipeline: counts of unigrams and bigrams
# (ngram_range=(1, 2)) feeding a multinomial Naive Bayes classifier.
nb_count_steps = [
    ('vectorizer_bigrams', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
]
clf = Pipeline(steps=nb_count_steps)

# Step 2 - learn vocabulary and class statistics from the training split.
clf.fit(X_train, y_train)

# Step 3 - predict labels for the held-out reviews.
y_pred = clf.predict(X_test)

# Step 4 - per-class precision / recall / F1 on the test split.
nb_count_report = classification_report(y_test, y_pred)
print(nb_count_report)

              precision    recall  f1-score   support

           0       0.87      0.85      0.86       953
           1       0.85      0.87      0.86       954

    accuracy                           0.86      1907
   macro avg       0.86      0.86      0.86      1907
weighted avg       0.86      0.86      0.86      1907



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1 - assemble the pipeline: TF-IDF weights feeding multinomial Naive Bayes.
nb_tfidf_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB()),
]
clf = Pipeline(steps=nb_tfidf_steps)

# Step 2 - train on the training split.
clf.fit(X_train, y_train)

# Step 3 - predict labels for the held-out reviews.
y_pred = clf.predict(X_test)

# Step 4 - per-class precision / recall / F1 on the test split.
nb_tfidf_report = classification_report(y_test, y_pred)
print(nb_tfidf_report)

              precision    recall  f1-score   support

           0       0.86      0.85      0.86       953
           1       0.85      0.86      0.86       954

    accuracy                           0.86      1907
   macro avg       0.86      0.86      0.86      1907
weighted avg       0.86      0.86      0.86      1907



##### Support Vector Classifier

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Step 1 - TF-IDF features feeding a linear-kernel support vector classifier.
# The pipeline is bound to `clf1` (not `clf`), so later cells that rebind `clf`
# leave it intact; it is the object pickled at the end of the notebook.
svc_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('svm', SVC(kernel='linear')),
]
clf1 = Pipeline(steps=svc_steps)

# Step 2 - train on the training split.
clf1.fit(X_train, y_train)

# Step 3 - predict labels for the held-out reviews.
y_pred = clf1.predict(X_test)

# Step 4 - per-class precision / recall / F1 on the test split.
svc_report = classification_report(y_test, y_pred)
print(svc_report)

              precision    recall  f1-score   support

           0       0.85      0.87      0.86       953
           1       0.87      0.84      0.86       954

    accuracy                           0.86      1907
   macro avg       0.86      0.86      0.86      1907
weighted avg       0.86      0.86      0.86      1907



##### K Nearest Neighbour

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Step 1 - TF-IDF features feeding a k-nearest-neighbours classifier
# (10 neighbours, Euclidean distance).
knn_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=10, metric='euclidean')),
]
clf = Pipeline(steps=knn_steps)

# Step 2 - fit on the training split.
clf.fit(X_train, y_train)

# Step 3 - predict labels for the held-out reviews.
y_pred = clf.predict(X_test)

# Step 4 - per-class precision / recall / F1 on the test split.
knn_report = classification_report(y_test, y_pred)
print(knn_report)

              precision    recall  f1-score   support

           0       0.62      0.96      0.75       953
           1       0.91      0.41      0.56       954

    accuracy                           0.68      1907
   macro avg       0.76      0.68      0.66      1907
weighted avg       0.76      0.68      0.66      1907



##### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# 1. Create a pipeline object with TfidfVectorizer and Gradient Boosting classifier.
#    random_state fixes the estimator's internal randomness so the report below
#    is reproducible from run to run.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier(random_state=2022)),
])

# 2. Fit the pipeline with X_train and y_train
clf.fit(X_train, y_train)

# 3. Get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

# 4. Print the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.76      0.88      0.81       953
           1       0.85      0.72      0.78       954

    accuracy                           0.80      1907
   macro avg       0.80      0.80      0.79      1907
weighted avg       0.80      0.80      0.79      1907



##### Trial with LSTM (*needs improvement*)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Activation, Flatten, GlobalMaxPooling1D, Conv1D

In [None]:
import numpy as np

# Define parameters
vocab_size = 10000       # keep only the most frequent words in the tokenizer
embedding_dim = 16       # size of each learned word vector
max_length = 100         # every sequence is padded / truncated to this many tokens
trunc_type = 'post'      # cut overly long reviews at the end
padding_type = 'post'    # pad short reviews at the end
oov_tok = "<OOV>"        # placeholder for words unseen when the tokenizer was fitted

# Split the *texts* first, so the tokenizer vocabulary is learned from the
# training reviews only. Fitting it on the whole corpus would leak test-set
# vocabulary into the model's input encoding.
# NOTE: this rebinds X_train / X_test / y_train / y_test (previously the text
# split used by the scikit-learn pipelines) to the padded integer sequences.
train_texts, test_texts, y_train, y_test = train_test_split(
    df['preprocessed_txt'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Tokenize: build the word index from the training texts only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index


def _texts_to_padded(texts):
    """Encode texts as integer sequences and pad/truncate them to ``max_length``."""
    encoded = tokenizer.texts_to_sequences(texts)
    return pad_sequences(encoded, maxlen=max_length, padding=padding_type, truncating=trunc_type)


X_train = _texts_to_padded(train_texts)
X_test = _texts_to_padded(test_texts)

# Convert labels to numpy array
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
# Architecture: embedding -> bidirectional LSTM -> small dense layer -> sigmoid output.
lstm_layers = [
    Embedding(vocab_size, embedding_dim, input_length=max_length),  # token id -> dense vector
    Bidirectional(LSTM(64)),                                        # reads the sequence in both directions
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid'),                                 # single probability for the binary label
]
model = Sequential(lstm_layers)

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked per epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 16)           160000    
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               41472     
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 24)                3096      
                                                                 
 dense_3 (Dense)             (None, 1)                 25        
                                                                 
Total params: 204593 (799.19 KB)
Trainable params: 204593 (799.19 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train the model
from tensorflow.keras.callbacks import EarlyStopping

# Train the model.
# Validation uses a slice of the *training* data rather than the test set, so
# the test set stays untouched for the final report. Early stopping halts
# training once val_loss stops improving and restores the best weights; with a
# fixed 10 epochs the validation loss kept rising while training loss fell.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history = model.fit(
    X_train,
    y_train,
    epochs=10,                # upper bound; early stopping usually ends sooner
    validation_split=0.1,     # last 10% of the training arrays is held out for validation
    callbacks=[early_stop],
    verbose=2,
)

Epoch 1/10
239/239 - 30s - loss: 0.4628 - accuracy: 0.7709 - val_loss: 0.3594 - val_accuracy: 0.8406 - 30s/epoch - 125ms/step
Epoch 2/10
239/239 - 27s - loss: 0.2451 - accuracy: 0.9014 - val_loss: 0.3653 - val_accuracy: 0.8374 - 27s/epoch - 115ms/step
Epoch 3/10
239/239 - 25s - loss: 0.1685 - accuracy: 0.9390 - val_loss: 0.4299 - val_accuracy: 0.8327 - 25s/epoch - 105ms/step
Epoch 4/10
239/239 - 25s - loss: 0.1222 - accuracy: 0.9576 - val_loss: 0.4685 - val_accuracy: 0.8217 - 25s/epoch - 103ms/step
Epoch 5/10
239/239 - 23s - loss: 0.0927 - accuracy: 0.9679 - val_loss: 0.5448 - val_accuracy: 0.8128 - 23s/epoch - 96ms/step
Epoch 6/10
239/239 - 25s - loss: 0.0929 - accuracy: 0.9666 - val_loss: 0.6908 - val_accuracy: 0.8144 - 25s/epoch - 103ms/step
Epoch 7/10
239/239 - 25s - loss: 0.0706 - accuracy: 0.9748 - val_loss: 0.6929 - val_accuracy: 0.8128 - 25s/epoch - 104ms/step
Epoch 8/10
239/239 - 24s - loss: 0.0566 - accuracy: 0.9826 - val_loss: 0.7294 - val_accuracy: 0.8076 - 24s/epoch - 101m

In [None]:
# Get predictions for the test set
# Predicted probabilities for the held-out reviews (output of the sigmoid unit).
y_pred = model.predict(X_test)

# Turn probabilities into hard 0/1 labels: strictly above the threshold -> 1.
decision_threshold = 0.5
y_pred_classes = (y_pred > decision_threshold).astype("int32")

# Per-class precision / recall / F1 on the test split.
lstm_report = classification_report(y_test, y_pred_classes)
print(lstm_report)

              precision    recall  f1-score   support

           0       0.82      0.79      0.81       954
           1       0.80      0.82      0.81       953

    accuracy                           0.81      1907
   macro avg       0.81      0.81      0.81      1907
weighted avg       0.81      0.81      0.81      1907



##### Saving one of the better-performing models (the TF-IDF + linear SVC pipeline, `clf1`)

In [None]:
import pickle

# Serialise the fitted TF-IDF + linear SVC pipeline (`clf1`) to disk.
# The pipeline was trained on the 'preprocessed_txt' column, so it expects text
# that has already gone through `preprocess`.
# Only unpickle this file from a trusted source: loading a pickle can execute code.
with open('sentiment_model.pkl', 'wb') as model_file:
    pickle.dump(clf1, model_file)

In [None]:
# Persist the balanced, preprocessed dataset; index=False omits the row index column.
df.to_csv('amazon_clean_data.csv',index=False)