In [1]:
import pandas as pd
import numpy as np

import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from boruta import BorutaPy as boruta
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
data = pd.read_csv('sample30.csv', index_col=None)
data.shape

In [None]:
print(data['manufacturer'].nunique())
print(data['name'].nunique())
print(data['categories'].nunique())
print(data['brand'].nunique())

In [None]:
df = data[(data['reviews_username'].isnull()==False) & (data['user_sentiment'].isnull()==False)]
df.info()

In [None]:
df['user_sentiment'].value_counts()

In [None]:
df = df[['id', 'brand', 'categories', 'manufacturer', 'name', 'reviews_text', 'reviews_title', 'user_sentiment']]
df.shape

In [None]:
df.isnull().sum()

In [None]:
df['reviews_title'] = df['reviews_title'].astype('O')
cnt = [i for i in df['reviews_title'].to_list() if isinstance(i, float)]
print(cnt)
cnt = [i for i in df['reviews_text'].to_list() if i.isnumeric()]
print(cnt)

In [None]:
df['reviews_title'] = df['reviews_title'].fillna('NF')
df = df.reset_index(drop=True)

In [None]:
for i in range(len(df)):
  if df.loc[i,'reviews_title'] == 'NF':
    if df.loc[i, 'user_sentiment'] == 'Positive':
      df.loc[i,'reviews_title']='Good'
    if df.loc[i, 'user_sentiment'] == 'Negative':
      df.loc[i,'reviews_title']='Bad'

df['manufacturer'].fillna(df['brand'], inplace=True)

In [None]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# add stemming and lemmatisation in the preprocess function
def preprocess(document):
    'changes document to lower case and removes stopwords'
    document = document.lower()
    words = word_tokenize(document)
    words = [word for word in words if word not in stopwords.words("english")]
    words = [stemmer.stem(word) for word in words]
    document = " ".join(words)
    return document

In [None]:
df['reviews_text'] = df['reviews_title']+' '+df['reviews_text']
df['reviews_text'][0]

In [None]:
df['proc_reviews_text'] = df['reviews_text'].apply(preprocess)
df.shape

In [None]:
# df.to_csv('final_data.csv', index=None)

In [3]:
df = pd.read_csv('../Details/dataset/SentimentbasedRecoEngine/final_data.csv', index_col=None)
df.head(2)

Unnamed: 0,id,brand,categories,manufacturer,name,reviews_text,reviews_title,user_sentiment,proc_reviews_text
0,AV13O1A8GV-KLJ3akUyj,Universal Music,"Movies, Music & Books,Music,R&b,Movies & TV,Mo...",Universal Music Group / Cash Money,Pink Friday: Roman Reloaded Re-Up (w/dvd),Just Awesome i love this album. it's very good...,Just Awesome,Positive,awesom love album . 's good . hip hop side cur...
1,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,Good Good flavor. This review was collected as...,Good,Positive,good good flavor . review collect part promot .


In [4]:
df['user_sentiment'] = df['user_sentiment'].map({'Positive':1, 'Negative':0})
df['user_sentiment'].value_counts()

1    26579
0     3357
Name: user_sentiment, dtype: int64

In [5]:
X, y = df['proc_reviews_text'], df['user_sentiment']
xtrain, xtest, ytrain, ytest = train_test_split(df['proc_reviews_text'], df['user_sentiment'], test_size=0.3)

## Feature Creation
1. TF-IDF Vectorizer 
2. Bag-of-Words 

In [5]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_model = tfidf_vectorizer.fit(xtrain)

xtrain_tfi = tfidf_model.transform(xtrain)
xtest_tfi  = tfidf_model.transform(xtest)

xdf = pd.DataFrame(xtrain_tfi.toarray(), columns=tfidf_vectorizer.get_feature_names())
print(xdf.shape)
xdf.head(2)

(20955, 11253)


Unnamed: 0,00,000,0000,007,04,079,09,10,100,1000,...,zojirushi,zombi,zombie,zone,zoo,zorba,zow,zucchetta,zucchini,zyrtec
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
bow_vectorizer = CountVectorizer(stop_words='english', max_features=9000)
bow_model = bow_vectorizer.fit(xtrain)

xtrain_bow = bow_model.transform(xtrain)
xtest_bow  = bow_model.transform(xtest)

xdf_bow = pd.DataFrame(xtrain_bow.toarray(), columns=bow_vectorizer.get_feature_names())
print(xdf_bow.shape)
xdf_bow.head(2)

(20955, 9000)


Unnamed: 0,00,000,0000,007,04,079,09,10,100,1000,...,zipper,zit,zojirushi,zombi,zombie,zone,zoo,zorba,zow,zucchetta
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
xtrain_tfi.shape, xtest_tfi.shape, xtrain_bow.shape, xtest_bow.shape

((20955, 11253), (8981, 11253), (20955, 9000), (8981, 9000))

## Class Imbalance Fix
1. Fix class imbalance for only xtrain after fitting on xtrain and transforming train, test separately
2. Fix class imbalance for whole data after applying bag-of-words fit_transform on entire data

In [13]:
oversample = SMOTE()
xtrain_bow2, ytrain2 = oversample.fit_resample(xtrain_bow, ytrain)

In [14]:
ytrain2.value_counts()

0    18637
1    18637
Name: user_sentiment, dtype: int64

In [15]:
xtrain_bow2.shape

(37274, 9000)

In [6]:
bow_vectorizer2 = CountVectorizer(stop_words='english', max_features=9000)
X_trans = bow_vectorizer2.fit_transform(X)
Xdf = pd.DataFrame(X_trans.toarray(), columns=bow_vectorizer2.get_feature_names())
Xdf.shape

(29936, 9000)

In [7]:
oversample = SMOTE()
X2, y2 = oversample.fit_resample(Xdf, y)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(X2, y2, test_size=0.3)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((37210, 9000), (15948, 9000), (37210,), (15948,))

## Modelling
1. Model Building with raw processed data without class imbalance
2. Model Building with fixed Class Imbalance

### Logistic Regression Model

In [19]:
logreg1 = LogisticRegression(penalty="l2", random_state=42, C=3.5)
logreg1.fit(xtrain_tfi, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=3.5, random_state=42)

In [20]:
pred_train = logreg1.predict(xtrain_tfi)
pred_test  = logreg1.predict(xtest_tfi)
print("Train accuracy: {} . Test accuracy: {}".format(accuracy_score(pred_train, ytrain), accuracy_score(pred_test, ytest)))
print("Train AUC: {}. Test AUC: {}".format(roc_auc_score(pred_train, ytrain), roc_auc_score(pred_test, ytest)))

Train accuracy: 0.8899546647578144 . Test accuracy: 0.8838659392049883
Train AUC: 0.9449458052810008. Test AUC: 0.5255060352831941


In [22]:
logreg2 = LogisticRegression(penalty="l2", random_state=42, C=3.5)
logreg2.fit(xtrain_bow, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=3.5, random_state=42)

In [23]:
pred_train2 = logreg2.predict(xtrain_bow)
pred_test2  = logreg2.predict(xtest_bow)
print("Train accuracy (BOW): {} . Test accuracy (BOW): {}".format(accuracy_score(pred_train2, ytrain), accuracy_score(pred_test2, ytest)))
print("Train AUC (BOW): {}. Test AUC (BOW): {}".format(roc_auc_score(pred_train2, ytrain), roc_auc_score(pred_test2, ytest)))

Train accuracy (BOW): 0.9137675972321642 . Test accuracy (BOW): 0.8636009353078722
Train AUC (BOW): 0.9220649250256694. Test AUC (BOW): 0.4949901154170422


In [24]:
logreg3 = LogisticRegression(penalty="l2", random_state=42, C=3.5)
logreg3.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=3.5, random_state=42)

In [26]:
pred_train3 = logreg3.predict(x_train)
pred_test3  = logreg3.predict(x_test)
print("Train accuracy : {} . Test accuracy : {}".format(accuracy_score(pred_train3, y_train), accuracy_score(pred_test3, y_test)))
print("Train AUC : {}. Test AUC : {}".format(roc_auc_score(pred_train3, y_train), roc_auc_score(pred_test3, y_test)))

Train accuracy : 0.9661381349099705 . Test accuracy : 0.935415099071984
Train AUC : 0.9661590160836603. Test AUC : 0.9355695514113789


### Naive Bayes Model

In [27]:
bnb = BernoulliNB()
bnb.fit(xtrain_bow, ytrain)

BernoulliNB()

In [28]:
pred_train_nb1 = bnb.predict(xtrain_bow)
pred_test_nb1  = bnb.predict(xtest_bow)
print("Train accuracy : {} . Test accuracy : {}".format(accuracy_score(pred_train_nb1, ytrain), accuracy_score(pred_test_nb1, ytest)))
print("Train AUC : {}. Test AUC : {}".format(roc_auc_score(pred_train_nb1, ytrain), roc_auc_score(pred_test_nb1, ytest)))

Train accuracy : 0.868766404199475 . Test accuracy : 0.8549159336376796
Train AUC : 0.5962447251629714. Test AUC : 0.49310561705719863


In [29]:
bnb = BernoulliNB()
bnb.fit(x_train, y_train)

BernoulliNB()

In [31]:
pred_train_nb2 = bnb.predict(x_train)
pred_test_nb2  = bnb.predict(x_test)
print("Train accuracy : {} . Test accuracy : {}".format(accuracy_score(pred_train_nb2, y_train), accuracy_score(pred_test_nb2, y_test)))
print("Train AUC : {}. Test AUC : {}".format(roc_auc_score(pred_train_nb2, y_train), roc_auc_score(pred_test_nb2, y_test)))

Train accuracy : 0.8478903520558989 . Test accuracy : 0.8422999749184851
Train AUC : 0.8549001554334769. Test AUC : 0.8489595210626223


### XGBoost Classifier

In [32]:
xgb = XGBClassifier()
xgb.fit(xtrain_bow, ytrain)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [33]:
pred_train_xg1 = xgb.predict(xtrain_bow)
pred_test_xg1  = xgb.predict(xtest_bow)
print("Train accuracy : {} . Test accuracy : {}".format(accuracy_score(pred_train_xg1, ytrain), accuracy_score(pred_test_xg1, ytest)))
print("Train AUC : {}. Test AUC : {}".format(roc_auc_score(pred_train_xg1, ytrain), roc_auc_score(pred_test_xg1, ytest)))

Train accuracy : 0.8928656645192078 . Test accuracy : 0.8836432468544706
Train AUC : 0.9332961558988956. Test AUC : 0.5851455336232854


In [35]:
xgb = XGBClassifier()
xgb.fit(x_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [37]:
pred_train_xg2 = xgb.predict(x_train)
pred_test_xg2  = xgb.predict(x_test)
print("Train accuracy : {} . Test accuracy : {}".format(accuracy_score(pred_train_xg2, y_train), accuracy_score(pred_test_xg2, y_test)))
print("Train AUC : {}. Test AUC : {}".format(roc_auc_score(pred_train_xg2, y_train), roc_auc_score(pred_test_xg2, y_test)))

Train accuracy : 0.9414404729911314 . Test accuracy : 0.9245046400802609
Train AUC : 0.9420472616162203. Test AUC : 0.9252430234248417


### Random Forest Model

In [39]:
rf1 = RandomForestClassifier()
rf1.fit(xtrain_bow, ytrain)

pred_train_rf1 = rf1.predict(xtrain_bow)
pred_test_rf1  = rf1.predict(xtest_bow)

print("Train accuracy : {} . Test accuracy : {}".format(accuracy_score(pred_train_rf1, ytrain), accuracy_score(pred_test_rf1, ytest)))
print("Train AUC : {}. Test AUC : {}".format(roc_auc_score(pred_train_rf1, ytrain), roc_auc_score(pred_test_rf1, ytest)))

Train accuracy : 0.9876879026485326 . Test accuracy : 0.872954014029618
Train AUC : 0.9887192855657293. Test AUC : 0.5074881139097985


In [9]:
rf2 = RandomForestClassifier()
rf2.fit(x_train, y_train)

pred_train_rf2 = rf2.predict(x_train)
pred_test_rf2  = rf2.predict(x_test)

print("Train accuracy : {} . Test accuracy : {}".format(accuracy_score(pred_train_rf2, y_train), accuracy_score(pred_test_rf2, y_test)))
print("Train AUC : {}. Test AUC : {}".format(roc_auc_score(pred_train_rf2, y_train), roc_auc_score(pred_test_rf2, y_test)))

Train accuracy : 0.9994893845740392 . Test accuracy : 0.9398670679709055
Train AUC : 0.9994906166219839. Test AUC : 0.9400650333175474


In [10]:
rf2.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Hyper-parameter Optimization:
- Random Forest (Best Performing Model)
- To Prevent Overfitting
- Using Random Search with Cross Validation

In [9]:
from sklearn.model_selection import RandomizedSearchCV


n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, num=10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [10]:
rf = RandomForestClassifier(random_state=42)

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                          n_iter=30, scoring='accuracy', 
                          cv=3, verbose=2, random_state=42, n_jobs=-1,
                          return_train_score=True)

# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


MemoryError: Unable to allocate 1.66 GiB for an array with shape (9000, 24806) and data type int64

In [None]:
rf_random.best_params_