In [36]:
import numpy as np
import pandas as pd
import re 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from imblearn import pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings(action='ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
data_df = pd.read_csv('amazon_alexa.tsv', delimiter='\t')
data_df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


# Data Preprocessing

In [38]:
corpus = []
for i in range(len(data_df)):
  review = re.sub('[^a-zA-Z]', ' ', data_df['verified_reviews'][i])
  review = review.lower().split()
  ps = PorterStemmer()
  all_stop_words = stopwords.words('english')
  all_stop_words.remove('not')
  review = [ps.stem(word) for word in review if not word  in set(all_stop_words)]
  review = ' '.join(review)
  corpus.append(review)

In [39]:
y = data_df['feedback']

## Split the dataset into the Training set and Test set

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=1)

# Resampling Approach Using Smote
When classes are imbalanced, standard classifiers are usually biased towards the majority class as we have observed in our case using Logistic regression, SVM, and Random Forest.

one way to overcome this issue is to adjust confidence threshold  for predicting testing instances which is usually 0.5 for a binary classifier, However, we will focus in this project on another approach which is resampling our dataset using SMOTE (Synthetic Minority Over-sampling Technique), then we will compare the performance before and after resampling.

Although it is recommended to use a combination of oversampling and undersampling to manage skewed class distribution, we will stick to a focused oversampling approach due to dataset size limitation

In [41]:
Vectorizer = TfidfVectorizer(ngram_range=(1,2)) 
X_train_tf = Vectorizer.fit_transform(X_train)
X_test_tf = Vectorizer.transform(X_test)

In [42]:
X_test_tf.shape

(630, 19726)

# Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression
pipe_log = pipeline.Pipeline([
                        ('dimension', TruncatedSVD(n_components=1000, random_state=1)), # to decrease dimensionality of the sparse matrix
                        ('over', SMOTE(k_neighbors=7, random_state=1)),
                        ('clf', LogisticRegression())
                        ])

In [44]:
params = {'dimension__n_components':[100,500,1000], 
              'over__k_neighbors':[3,5,7]}
grid_search = GridSearchCV(estimator=pipe_log, 
                           param_grid= params, 
                           scoring='balanced_accuracy', 
                           cv=5)
grid_search.fit(X_train_tf, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print('Best Accuracy: {:.2f} %'.format(best_accuracy*100))
print('Best Parameters:', best_parameters)

Best Accuracy: 82.36 %
Best Parameters: {'dimension__n_components': 1000, 'over__k_neighbors': 7}


In [45]:
# train the model
pipe_log.fit(X_train_tf, y_train)
# evaluate on the test set
yhat_log = pipe_log.predict(X_test_tf)
confusion_matrix(y_test, yhat_log)

array([[ 30,  16],
       [ 54, 530]])

In [46]:
print(classification_report(y_test, yhat_log))

              precision    recall  f1-score   support

           0       0.36      0.65      0.46        46
           1       0.97      0.91      0.94       584

    accuracy                           0.89       630
   macro avg       0.66      0.78      0.70       630
weighted avg       0.93      0.89      0.90       630



we can see a major improvement in logistic regression performance after resampling our data set

# Linear Support Vector Machine

In [47]:
from sklearn.svm import SVC
pipe_sv = pipeline.Pipeline([
                        ('dimension', TruncatedSVD(n_components=100, random_state=1)),
                        ('over', SMOTE(k_neighbors=5, random_state=1)),
                        ('clf', SVC(kernel='linear',random_state=1))
                        ])

In [48]:
params = {'dimension__n_components':[100,500,1000], 
              'over__k_neighbors':[3,5,7]}
grid_search = GridSearchCV(estimator=pipe_sv, 
                           param_grid= params, 
                           scoring='balanced_accuracy', 
                           cv=5)
grid_search.fit(X_train_tf, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print('Best Accuracy: {:.2f} %'.format(best_accuracy*100))
print('Best Parameters:', best_parameters)

Best Accuracy: 81.80 %
Best Parameters: {'dimension__n_components': 500, 'over__k_neighbors': 7}


In [49]:
# train the model
pipe_sv.fit(X_train_tf, y_train)
# evaluate on the test set
yhat_sv = pipe_sv.predict(X_test_tf)
confusion_matrix(y_test, yhat_sv)

array([[ 30,  16],
       [ 94, 490]])

In [50]:
print(classification_report(y_test, yhat_sv))

              precision    recall  f1-score   support

           0       0.24      0.65      0.35        46
           1       0.97      0.84      0.90       584

    accuracy                           0.83       630
   macro avg       0.61      0.75      0.63       630
weighted avg       0.92      0.83      0.86       630



# Random Forest Classifier

In [51]:
from sklearn.ensemble import RandomForestClassifier
pipe_rn = pipeline.Pipeline([
                        ('dimension', TruncatedSVD(n_components=100, random_state=1)),
                        ('over', SMOTE(k_neighbors=7, random_state=1)),
                        ('clf', RandomForestClassifier(random_state=1))
                        ])

In [52]:
params = {'dimension__n_components':[100,500,1000], 
              'over__k_neighbors':[3,5,7]}
grid_search = GridSearchCV(estimator=pipe_rn, 
                           param_grid= params, 
                           scoring='balanced_accuracy', 
                           cv=5)
grid_search.fit(X_train_tf, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print('Best Accuracy: {:.2f} %'.format(best_accuracy*100))
print('Best Parameters:', best_parameters)

Best Accuracy: 81.03 %
Best Parameters: {'dimension__n_components': 100, 'over__k_neighbors': 5}


In [53]:
# train the model
pipe_rn.fit(X_train_tf, y_train)
# evaluate on the test set
yhat_rn = pipe_rn.predict(X_test_tf)
print(classification_report(y_test, yhat_rn))

              precision    recall  f1-score   support

           0       0.47      0.43      0.45        46
           1       0.96      0.96      0.96       584

    accuracy                           0.92       630
   macro avg       0.71      0.70      0.70       630
weighted avg       0.92      0.92      0.92       630



In [55]:
header = pd.MultiIndex.from_product([['Original','Oversampling'],
                                     ['Sensitivity %','Specificity %', 'Balanced Accuracy %']],
                                    names=['Case', 'Metrics'])
df = pd.DataFrame([[100, 4, 52, 84, 72, 78],
                   [99, 37, 68, 84, 65, 75],
                   [100, 30, 65, 96, 43, 70]], 
                  index=['Logistic Regression','Linear SVM','Random Forest'], 
                  columns=header)
df

Case,Original,Original,Original,Oversampling,Oversampling,Oversampling
Metrics,Sensitivity %,Specificity %,Balanced Accuracy %,Sensitivity %,Specificity %,Balanced Accuracy %
Logistic Regression,100,4,52,84,72,78
Linear SVM,99,37,68,84,65,75
Random Forest,100,30,65,96,43,70
