In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import nltk.corpus

# NLTK Libraries
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.base import BaseEstimator, TransformerMixin


from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

%autosave 300

Autosaving every 300 seconds


In [2]:

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(train.shape)
print(test.shape)
train.head()


(5959, 3)
(2553, 2)


Unnamed: 0,Review Text,Review Title,topic
0,"Did nothing for me, didn't help lost even with...",Useless,Shipment and delivery
1,"Did nothing for me, didn't help lost even with...",Useless,Not Effective
2,I have bought these bags and immediately open...,TRASH!!! Do not buy these bags it’s a waist of...,Customer Service
3,Gave me an allergic reaction on my face :(,Do not recommend,Allergic
4,These don't compare to the name brand wipes. F...,Can't tackle big messes,Texture


In [3]:
train.describe()

Unnamed: 0,Review Text,Review Title,topic
count,5959,5959,5959
unique,4196,3727,21
top,Their website would have you believe that only...,Gross,Bad Taste/Flavor
freq,6,19,1194


In [4]:
train["topic"].value_counts()

Bad Taste/Flavor           1194
Quality/Contaminated        715
Not Effective               611
Allergic                    567
Packaging                   467
Texture                     410
Shipment and delivery       390
Customer Service            239
Color and texture           234
Too big to swallow          228
Ingredients                 216
Expiry                      136
Smells Bad                  123
Pricing                     107
Wrong Product received       99
Too Sweet                    97
Inferior to competitors      44
False Advertisement          37
Didn't Like                  31
Customer Issues               8
Hard to Chew                  6
Name: topic, dtype: int64

In [5]:
topics = train.topic.unique().tolist()

In [6]:
print(topics)

['Shipment and delivery', 'Not Effective', 'Customer Service', 'Allergic', 'Texture', 'Quality/Contaminated', 'Color and texture', 'Bad Taste/Flavor', 'Too big to swallow', 'Smells Bad', 'Too Sweet', 'Ingredients', 'Expiry', 'Packaging', 'Wrong Product received', 'Pricing', 'False Advertisement', 'Inferior to competitors', "Didn't Like", 'Customer Issues', 'Hard to Chew']


In [7]:
l = list(range(21))
labels = {i : topics[i] for i in range(0, len(topics))}
d = dict(zip(topics, l))
print(labels)
print(d)

{0: 'Shipment and delivery', 1: 'Not Effective', 2: 'Customer Service', 3: 'Allergic', 4: 'Texture', 5: 'Quality/Contaminated', 6: 'Color and texture', 7: 'Bad Taste/Flavor', 8: 'Too big to swallow', 9: 'Smells Bad', 10: 'Too Sweet', 11: 'Ingredients', 12: 'Expiry', 13: 'Packaging', 14: 'Wrong Product received', 15: 'Pricing', 16: 'False Advertisement', 17: 'Inferior to competitors', 18: "Didn't Like", 19: 'Customer Issues', 20: 'Hard to Chew'}
{'Shipment and delivery': 0, 'Not Effective': 1, 'Customer Service': 2, 'Allergic': 3, 'Texture': 4, 'Quality/Contaminated': 5, 'Color and texture': 6, 'Bad Taste/Flavor': 7, 'Too big to swallow': 8, 'Smells Bad': 9, 'Too Sweet': 10, 'Ingredients': 11, 'Expiry': 12, 'Packaging': 13, 'Wrong Product received': 14, 'Pricing': 15, 'False Advertisement': 16, 'Inferior to competitors': 17, "Didn't Like": 18, 'Customer Issues': 19, 'Hard to Chew': 20}


In [8]:
train['labels'] = train["topic"].apply(lambda x: d[x])
train.head()

Unnamed: 0,Review Text,Review Title,topic,labels
0,"Did nothing for me, didn't help lost even with...",Useless,Shipment and delivery,0
1,"Did nothing for me, didn't help lost even with...",Useless,Not Effective,1
2,I have bought these bags and immediately open...,TRASH!!! Do not buy these bags it’s a waist of...,Customer Service,2
3,Gave me an allergic reaction on my face :(,Do not recommend,Allergic,3
4,These don't compare to the name brand wipes. F...,Can't tackle big messes,Texture,4


In [9]:
all_data = pd.concat([train, test], ignore_index=False)
print(all_data.shape)
# print(all_data[5965][0][0])

(8512, 4)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [10]:
titles = []
reviews= []

for i in all_data['Review Title']:
    titles.append(i)

for line in all_data['Review Text']:
    reviews.append(line)

print(len(titles))
print(len(reviews))

8512
8512


In [11]:
import re

REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

titles_clean = preprocess_reviews(titles)
reviews_clean = preprocess_reviews(reviews)

In [12]:
X_train = titles_clean[:train.shape[0]]
X_test = titles_clean[train.shape[0]:]
print(len(X_train), len(X_test))

5959 2553


In [13]:
Y = []

for i in train['labels']:
    Y+=[i]
    
print(len(Y))

5959


In [14]:
stop_words = ['in', 'of', 'at', 'a', 'the']

# Not using stop words

v = CountVectorizer()
v.fit(X_train)

X_train_bow = v.transform(X_train)
# X_val_bow = v.transform(X_val)
X_test_bow = v.transform(X_test)

print(X_train_bow.shape)
print(X_test_bow.shape)

(5959, 2485)
(2553, 2485)


In [15]:
c = LogisticRegression(C=0.05)
c.fit(X_train_bow, Y)

score = c.score(X_train_bow, Y)
print("Val: LR:S: ", score)

Val: LR:S:  0.4321194831347542




In [16]:
test_predict = c.predict(X_test_bow)
print(len(test_predict))

2553


In [17]:
print(test_predict.shape)
t = test_predict.tolist()
# print(t)
# new_labels = t.apply(lambda x: labels[x])
# print(new_labels)
# print(labels.keys())
# print(labels.values())
final = []
for i in t:
    final.append(labels[i])
#     if i in labels.keys():
#         final.append()
# print(final)

(2553,)


In [18]:
submission = pd.DataFrame({
    'Review Text': test['Review Text'],
    'Review Title':test['Review Title'],
    'topic': final
})

In [19]:
submission.head()

Unnamed: 0,Review Text,Review Title,topic
0,I use chia seed in my protein shakes. These ta...,Bad tast,Quality/Contaminated
1,I use chia seed in my protein shakes. These ta...,Bad tast,Quality/Contaminated
2,Don’t waste your money.,No change. No results.,Not Effective
3,I use the book 'Fortify Your Life' by Tieraona...,"Good Vegan Choice, Poor Non Vegan Choice",Quality/Contaminated
4,I use the book 'Fortify Your Life' by Tieraona...,"Good Vegan Choice, Poor Non Vegan Choice",Quality/Contaminated


In [20]:
# submission.to_csv('submit.csv', index=False)

In [21]:
X_trainr = reviews_clean[:train.shape[0]]
# X_val = justifications_clean[train.shape[0]:train.shape[0]+val.shape[0]]
X_testr = reviews_clean[train.shape[0]:]
# print(len(X_train), len(X_val), len(X_test))

stop_words = ['in', 'of', 'at', 'a', 'the']

# Not using stop words

v = CountVectorizer()
v.fit(X_trainr)

X_train_bowr = v.transform(X_trainr)
# X_val_bow = v.transform(X_val)
X_test_bowr = v.transform(X_testr)

print(X_train_bowr.shape)
print(X_test_bowr.shape)

(5959, 8509)
(2553, 8509)


In [22]:
c = LogisticRegression(C=0.05)
c.fit(X_train_bowr, Y)

score = c.score(X_train_bowr, Y)
print("Val: LR:S: ", score)
test_predict = c.predict(X_test_bowr)
print(len(test_predict))



Val: LR:S:  0.6301392851149522
2553


In [23]:
t = test_predict.tolist()
final = []
for i in t:
    final.append(labels[i])

In [24]:
submission = pd.DataFrame({
    'Review Text': test['Review Text'],
    'Review Title':test['Review Title'],
    'topic': final
})

In [25]:
# submission.to_csv('submit1.csv', index=False)

In [26]:
rt_clean = []

for i in range(len(titles_clean)):
    rt_clean += [titles_clean[i] + SPACE +  reviews_clean[i]]

print(len(rt_clean))

8512


In [27]:
X_train = rt_clean[:train.shape[0]]
# X_val = s_j_clean[train.shape[0]:train.shape[0]+val.shape[0]]
X_test = rt_clean[train.shape[0]:]
print(len(X_train), len(X_test))

5959 2553


In [39]:
# v = CountVectorizer()
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")

bow.fit(X_train)

X_train_bow = bow.transform(X_train)
# X_val_bow = v.transform(X_val)
X_test_bow = bow.transform(X_test)

print(X_train_bow.shape)
print(X_test_bow.shape)

(5959, 1000)
(2553, 1000)


In [40]:
c = LogisticRegression(C=0.05)
c.fit(X_train_bow, Y)

score = c.score(X_train_bow, Y)
print("Val: LR:S: ", score)
test_predict = c.predict(X_test_bow)
print(len(test_predict))

t = test_predict.tolist()
final = []
for i in t:
    final.append(labels[i])



Val: LR:S:  0.6554791072327572
2553


In [41]:
submission = pd.DataFrame({
    'Review Text': test['Review Text'],
    'Review Title':test['Review Title'],
    'topic': final
})

# submission.to_csv('submit2.csv', index=False)

In [54]:
# import LogisticRegression model in python. 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, accuracy_score

## call on the model object
logreg = LogisticRegression(solver='liblinear')

## fit the model with "train_x" and "train_y"
logreg.fit(X_train_bow,Y)

## Once the model is trained we want to find out how well the model is performing, so we test the model. 
## we use "test_x" portion of the data(this data was not used to fit the model) to predict model outcome. 
y_pred = logreg.predict(X_test_bow)
y = logreg.predict(X_train_bow)
## Once predicted we save that outcome in "y_pred" variable.
## Then we compare the predicted value( "y_pred") and actual value("test_y") to see how well our model is performing. 

print ("So, Our accuracy Score is: {}".format(round(accuracy_score(y, Y),4)))



So, Our accuracy Score is: 0.701


In [57]:
print(len(y_pred))
final = []
for i in t:
    final.append(labels[i])

2553


In [58]:
submission = pd.DataFrame({
    'Review Text': test['Review Text'],
    'Review Title':test['Review Title'],
    'topic': final
})

submission.to_csv('submit4.csv', index=False)

In [47]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
## C_vals is the alpla value of lasso and ridge regression(as alpha increases the model complexity decreases,)
## remember effective alpha scores are 0<alpha<infinity 
C_vals = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,16.5,17,17.5,18]

## Choosing penalties(Lasso(l1) or Ridge(l2))
penalties = ['l1','l2']
## Choose a cross validation strategy. 
cv = StratifiedShuffleSplit(n_splits = 5, test_size = .25)

## setting param for param_grid in GridSearchCV. 
# param = {'penalty': penalties, 'C': C_vals}
param = {'C': C_vals}
logreg = LogisticRegression(solver='liblinear')
## Calling on GridSearchCV object. 
grid = GridSearchCV(estimator=LogisticRegression(), 
                           param_grid = param,
                           scoring = 'accuracy',
                            n_jobs =-1,
                           cv = cv
                          )
## Fitting the model
grid.fit(X_train_bow, Y)



GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=None, test_size=0.25,
            train_size=None),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3,
                               4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 16.5,
                               17, 17.5, 18]},
             pre_dispatch='2*

In [48]:
## Getting the best of everything. 
print (grid.best_score_)
print (grid.best_params_)
print(grid.best_estimator_)

0.5005369127516779
{'C': 0.2}
LogisticRegression(C=0.2, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [49]:
logreg_grid = grid.best_estimator_
logreg_grid.score(X_train_bow,Y)

0.6856855177043129

In [52]:
# kNN with GridSearch CV
## Importing the model. 
from sklearn.neighbors import KNeighborsClassifier
## calling on the model oject. 
knn = KNeighborsClassifier(metric='minkowski', p=2)
## knn classifier works by doing euclidian distance 


## doing 10 fold staratified-shuffle-split cross validation 
cv = StratifiedShuffleSplit(n_splits=10, test_size=.25, random_state=2)

accuracies = cross_val_score(knn, X_train_bow,Y, cv = cv, scoring='accuracy')
print ("Cross-Validation accuracy scores:{}".format(accuracies))
print ("Mean Cross-Validation accuracy score: {}".format(round(accuracies.mean(),3)))

Cross-Validation accuracy scores:[0.32348993 0.33758389 0.34697987 0.34563758 0.33221477 0.3557047
 0.33355705 0.35167785 0.3590604  0.35100671]
Mean Cross-Validation accuracy score: 0.344


In [53]:
## Search for an optimal value of k for KNN. MANUAL
k_range = range(1,31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train_bow,Y, cv = cv, scoring = 'accuracy')
    k_scores.append(scores.mean())
print("Accuracy scores are: {}\n".format(k_scores))
print ("Mean accuracy score: {}".format(np.mean(k_scores)))

Accuracy scores are: [0.25724832214765103, 0.27818791946308724, 0.3093959731543624, 0.32932885906040266, 0.3436912751677852, 0.35583892617449664, 0.35946308724832216, 0.363758389261745, 0.36979865771812076, 0.3710067114093959, 0.3704697986577181, 0.376510067114094, 0.37852348993288587, 0.37926174496644294, 0.37993288590604024, 0.37932885906040265, 0.380738255033557, 0.38120805369127525, 0.38093959731543625, 0.3822147651006712, 0.3818791946308725, 0.38167785234899326, 0.38140939597315443, 0.3824832214765101, 0.38288590604026845, 0.3808724832214765, 0.38060402684563754, 0.3805369127516779, 0.3789261744966443, 0.37751677852348997]

Mean accuracy score: 0.3645212527964205


In [60]:
# K Neighbours Classifier with gridSearch CV

from sklearn.model_selection import GridSearchCV
## trying out multiple values for k
k_range = range(1,31)
## 
weights_options=['uniform','distance']
# 
param = {'n_neighbors':k_range, 'weights':weights_options}
## Using startifiedShufflesplit. 
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=15)
# estimator = knn, param_grid = param, n_jobs = -1 to instruct scikit learn to use all available processors. 
grid = GridSearchCV(KNeighborsClassifier(), param,cv=cv,verbose = False, n_jobs=-1)
## Fitting the model. 
grid.fit(X_train_bow,Y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15, test_size=0.3,
            train_size=None),
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'n_neighbors': range(1, 31),
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=False)

In [61]:
print (grid.best_score_)
print (grid.best_params_)
print(grid.best_estimator_)

0.3851230425055928
{'n_neighbors': 25, 'weights': 'uniform'}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=25, p=2,
                     weights='uniform')


In [62]:
knn_grid= grid.best_estimator_
knn_grid.score(X_train_bow,Y)

0.4494042624601443

In [63]:
X = X_train_bow
y =Y

In [None]:
# SVM Kernel

from sklearn.svm import SVC
Cs = [0.001, 0.01, 0.1, 1,1.5,2,2.5,3,4,5, 10] ## penalty parameter C for the error term. 
gammas = [0.0001,0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=15)
grid_search = GridSearchCV(SVC(kernel = 'rbf', probability=True), param_grid, cv=cv) ## 'rbf' stands for gaussian kernel
grid_search.fit(X,y)

In [None]:
print(grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

In [None]:
svm_grid = grid_search.best_estimator_
svm_grid.score(X,y)

In [None]:
# from xgboost.sklearn import XGBClassifier
# from sklearn.model_selection import train_test_split

# model = XGBClassifier(learning_rate=0.001,n_estimators=2500,
#                                 max_depth=4, min_child_weight=0,
#                                 gamma=0, subsample=0.7,
#                                 colsample_bytree=0.7,
#                                 scale_pos_weight=1, seed=27,
#                                 reg_alpha=0.00006)
# model.fit(X_train_bow, Y)
# score = model.score(X_train_bow, Y)
# print("Val: LR:S: ", score)
# test_predict = model.predict(X_test_bow)
# print(len(test_predict))

In [None]:
# Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier
max_depth = range(1,30)
max_feature = [21,22,23,24,25,26,28,29,30,'auto']
criterion=["entropy", "gini"]

param = {'max_depth':max_depth, 
         'max_features':max_feature, 
         'criterion': criterion}
grid = GridSearchCV(DecisionTreeClassifier(), 
                                param_grid = param, 
                                 verbose=False, 
                                 cv=StratifiedKFold(n_splits=20, random_state=15, shuffle=True),
                                n_jobs = -1)
grid.fit(X, y) 