In [232]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC

## NLP with scikit-learn

In [233]:
# Read the raw labelled reviews, one "<comment>\t<label>" record per line.
# The encoding is given explicitly so the result does not depend on the platform's
# default locale encoding.
# NOTE(review): this is an absolute, machine-specific path - the notebook will only run
# on a machine with this exact folder layout; consider a path relative to the notebook.
with open(
    "C:/Users/as/Desktop/NLP project/NLP-Project/amazon_cells_labelled.txt",
    encoding="utf-8",
) as file:
    amazon_reviews = file.readlines()

In [234]:
# Drop the trailing newline (and any surrounding whitespace) from every raw line.
amazon_reviews = list(map(str.strip, amazon_reviews))


In [235]:
# Split each line at its LAST tab only, so a tab inside the comment text stays in the
# comment: every record becomes [comment, label].
amazon_reviews = [line.rsplit("\t", maxsplit=1) for line in amazon_reviews]


In [236]:
# Build the review table from the [comment, label] records, naming both columns up front.
amazon_review_df = pd.DataFrame.from_records(
    amazon_reviews,
    columns=['comment', 'sentiment'],
)

In [237]:
# The labels were parsed from text, so cast them to integers before modelling.
amazon_review_df['sentiment'] = amazon_review_df['sentiment'].astype(int)

In [238]:
# Check the column dtypes after the cast.
amazon_review_df.dtypes

comment      object
sentiment     int32
dtype: object

In [239]:
# (rows, columns) of the review table.
amazon_review_df.shape

(1000, 2)

In [240]:
# Preview the parsed comments and their sentiment labels.
amazon_review_df

Unnamed: 0,comment,sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
...,...,...
995,The screen does get smudged easily because it ...,0
996,What a piece of junk.. I lose more calls on th...,0
997,Item Does Not Match Picture.,0
998,The only thing that disappoint me is the infra...,0


In [241]:
# Model input is the raw comment text; the target is the integer sentiment label.
attributes = amazon_review_df['comment']
labels = amazon_review_df['sentiment']

In [242]:
# Hold out 200 reviews for testing. The split is stratified on the label so both parts
# keep the same class balance, and seeded so it is repeatable.
split_parts = train_test_split(
    attributes,
    labels,
    test_size=200,
    stratify=labels,
    random_state=33,
)
attributes_train, attributes_test, labels_train, labels_test = split_parts

In [243]:
# Bag-of-words counter that drops scikit-learn's built-in English stop words.
count_vectorizer = CountVectorizer(stop_words="english")

In [244]:
# Learn the vocabulary from the training comments only and build their
# document-term count matrix.
count_matrix = count_vectorizer.fit_transform(attributes_train)

In [245]:
# (documents, vocabulary terms). The sparse matrix exposes .shape directly, so there is
# no need to materialise a dense copy just to read its dimensions.
count_matrix.shape

(800, 1429)

In [246]:
# Mapping from each vocabulary term to its column index in the count matrix.
count_vectorizer.vocabulary_

{'disappointment': 361,
 'hate': 590,
 'goes': 560,
 'ear': 400,
 'working': 1412,
 'great': 569,
 'good': 563,
 'quality': 998,
 'bargain': 103,
 'bought': 142,
 'cheapy': 216,
 'big': 122,
 'lots': 760,
 'sounded': 1178,
 'awful': 96,
 'people': 912,
 'end': 428,
 'couldn': 292,
 'hear': 599,
 'phone': 920,
 'nice': 843,
 'sound': 1177,
 'doesn': 377,
 'hold': 611,
 'charge': 207,
 'don': 379,
 'think': 1260,
 'securly': 1105,
 'belt': 117,
 'look': 750,
 'sharp': 1125,
 'screen': 1097,
 'clear': 226,
 'graphics': 568,
 'didn': 349,
 'want': 1368,
 'clip': 231,
 'going': 561,
 'causing': 197,
 'discomfort': 363,
 'falls': 477,
 'easily': 415,
 'completely': 259,
 'secure': 1103,
 'holding': 613,
 'keeping': 696,
 'iphone': 676,
 'inside': 662,
 'earpiece': 409,
 'large': 716,
 'heavy': 600,
 'keeps': 697,
 'falling': 476,
 'arrived': 79,
 'quickly': 1000,
 'expensive': 461,
 'sold': 1169,
 'joy': 691,
 'use': 1334,
 'reaching': 1013,
 'row': 1078,
 'uncomfortable': 1314,
 'send': 110

In [247]:
# Export the dense count matrix as real comma-separated text.
# np.save would write NumPy's binary .npy format and append ".npy" to the name
# (producing "count_matrix.csv.npy"), which is not a CSV file.
np.savetxt("count_matrix.csv", count_matrix.toarray(), fmt="%d", delimiter=",")

## TF-IDF


In [248]:
# TF-IDF weighting, again dropping scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(stop_words="english")

In [249]:
# Fit the TF-IDF vocabulary and weights on the training comments and transform them.
tfidf_matrix = tfidf.fit_transform(attributes_train)

In [250]:
# Term -> column index mapping learned by the TF-IDF vectorizer.
tfidf.vocabulary_

{'disappointment': 361,
 'hate': 590,
 'goes': 560,
 'ear': 400,
 'working': 1412,
 'great': 569,
 'good': 563,
 'quality': 998,
 'bargain': 103,
 'bought': 142,
 'cheapy': 216,
 'big': 122,
 'lots': 760,
 'sounded': 1178,
 'awful': 96,
 'people': 912,
 'end': 428,
 'couldn': 292,
 'hear': 599,
 'phone': 920,
 'nice': 843,
 'sound': 1177,
 'doesn': 377,
 'hold': 611,
 'charge': 207,
 'don': 379,
 'think': 1260,
 'securly': 1105,
 'belt': 117,
 'look': 750,
 'sharp': 1125,
 'screen': 1097,
 'clear': 226,
 'graphics': 568,
 'didn': 349,
 'want': 1368,
 'clip': 231,
 'going': 561,
 'causing': 197,
 'discomfort': 363,
 'falls': 477,
 'easily': 415,
 'completely': 259,
 'secure': 1103,
 'holding': 613,
 'keeping': 696,
 'iphone': 676,
 'inside': 662,
 'earpiece': 409,
 'large': 716,
 'heavy': 600,
 'keeps': 697,
 'falling': 476,
 'arrived': 79,
 'quickly': 1000,
 'expensive': 461,
 'sold': 1169,
 'joy': 691,
 'use': 1334,
 'reaching': 1013,
 'row': 1078,
 'uncomfortable': 1314,
 'send': 110

# Naive Bayes
Naive Bayes can work directly with a sparse matrix, so the TF-IDF output does not need to be converted to a dense array.

In [251]:
# Fit a multinomial Naive Bayes classifier on the sparse TF-IDF training features.
bayes = MultinomialNB()
bayes.fit(tfidf_matrix, labels_train)

MultinomialNB()

In [252]:
# Mean accuracy on the training set.
bayes.score(tfidf.transform(attributes_train),labels_train)

0.96875

In [253]:
# Mean accuracy on the held-out test set.
bayes.score(tfidf.transform(attributes_test),labels_test)

0.795

In [254]:
# Predict labels for the training and the test comments (in that order), vectorizing
# each set with the already-fitted TF-IDF vectorizer.
train_pred, test_pred = (
    bayes.predict(tfidf.transform(texts))
    for texts in (attributes_train, attributes_test)
)

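The model overfits (about 0.97 accuracy on the training set vs. 0.80 on the test set), likely because there are more features (words) than training records (lines).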

In [255]:
# Per-class precision / recall / F1 on the training set.
print(classification_report(labels_train,train_pred))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97       400
           1       0.95      0.99      0.97       400

    accuracy                           0.97       800
   macro avg       0.97      0.97      0.97       800
weighted avg       0.97      0.97      0.97       800



In [256]:
# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(labels_test,test_pred))

              precision    recall  f1-score   support

           0       0.86      0.71      0.78       100
           1       0.75      0.88      0.81       100

    accuracy                           0.80       200
   macro avg       0.80      0.79      0.79       200
weighted avg       0.80      0.80      0.79       200



## TF-IDF, round 2

In [257]:
# Second TF-IDF configuration with a pruned vocabulary:
#   min_df=2   -> ignore terms that appear in fewer than 2 documents
#   max_df=0.9 -> ignore terms that appear in more than 90% of the documents
tfidf = TfidfVectorizer(stop_words="english", min_df=2, max_df=0.9)


In [258]:
# Fit the pruned-vocabulary vectorizer on the training comments. The returned matrix is
# only displayed here, not stored; later cells call tfidf.transform again.
tfidf.fit_transform(attributes_train)

<800x512 sparse matrix of type '<class 'numpy.float64'>'
	with 2846 stored elements in Compressed Sparse Row format>

In [259]:
# Refit the same Naive Bayes estimator on the current TF-IDF features; this replaces
# its earlier fit.
bayes.fit(tfidf.transform(attributes_train), labels_train)

MultinomialNB()

In [260]:
# Predictions of the refitted Naive Bayes model for the training and test comments
# (in that order).
train_pred, test_pred = (
    bayes.predict(tfidf.transform(texts))
    for texts in (attributes_train, attributes_test)
)

In [261]:
# Per-class precision / recall / F1 on the training set.
print(classification_report(labels_train,train_pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       400
           1       0.91      0.93      0.92       400

    accuracy                           0.92       800
   macro avg       0.92      0.92      0.92       800
weighted avg       0.92      0.92      0.92       800



In [262]:
# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(labels_test,test_pred))

              precision    recall  f1-score   support

           0       0.82      0.76      0.79       100
           1       0.78      0.83      0.80       100

    accuracy                           0.80       200
   macro avg       0.80      0.79      0.79       200
weighted avg       0.80      0.80      0.79       200



In [263]:
# Linear support vector classifier with default settings, trained on the TF-IDF
# features of the training comments.
linear_svc = LinearSVC()
linear_svc.fit(tfidf.transform(attributes_train), labels_train)

LinearSVC()

In [264]:
# Predict on the training and test comments (in that order), then print one
# classification report per split: training first, test second.
train_pred, test_pred = (
    linear_svc.predict(tfidf.transform(texts))
    for texts in (attributes_train, attributes_test)
)
for truth, predicted in ((labels_train, train_pred), (labels_test, test_pred)):
    print(classification_report(truth, predicted))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       400
           1       0.97      0.97      0.97       400

    accuracy                           0.97       800
   macro avg       0.97      0.97      0.97       800
weighted avg       0.97      0.97      0.97       800

              precision    recall  f1-score   support

           0       0.81      0.84      0.82       100
           1       0.83      0.80      0.82       100

    accuracy                           0.82       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.82      0.82      0.82       200



# SGDClassifier

In [265]:
# Linear classifier trained with stochastic gradient descent.
# early_stopping=True sets aside part of the training data as a validation set and stops
# training once the validation score stops improving. That split and the per-epoch
# shuffling are random, so random_state is fixed to make the reported scores repeatable.
sgd = SGDClassifier(
    learning_rate='optimal',
    early_stopping=True,
    random_state=33,
)
# SGDClassifier also supports incremental learning through partial_fit.

In [266]:
# Full (non-incremental) fit on the TF-IDF features of the training comments.
sgd.fit(tfidf.transform(attributes_train), labels_train)

SGDClassifier(early_stopping=True)

In [267]:
# Predict on the training and test comments (in that order), then print one
# classification report per split: training first, test second.
train_pred, test_pred = (
    sgd.predict(tfidf.transform(texts))
    for texts in (attributes_train, attributes_test)
)
for truth, predicted in ((labels_train, train_pred), (labels_test, test_pred)):
    print(classification_report(truth, predicted))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       400
           1       0.93      0.96      0.95       400

    accuracy                           0.94       800
   macro avg       0.95      0.95      0.94       800
weighted avg       0.95      0.94      0.94       800

              precision    recall  f1-score   support

           0       0.81      0.73      0.77       100
           1       0.75      0.83      0.79       100

    accuracy                           0.78       200
   macro avg       0.78      0.78      0.78       200
weighted avg       0.78      0.78      0.78       200



In [268]:
# Second SGD classifier with default settings, used for the partial_fit
# (incremental training) experiment.
sgd2 = SGDClassifier()

In [269]:
# One incremental update on the first 20 training rows only. The full list of classes is
# passed explicitly because a single batch is not guaranteed to contain every label.
sgd2.partial_fit(tfidf.transform(attributes_train)[:20], labels_train[:20],classes = [0,1])

SGDClassifier()

In [270]:
# Inspect the weights learned so far (one weight per TF-IDF feature).
sgd2.coef_

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  7.00824155,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  6.83273967,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  4.21325842,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  5.22728081,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        , -6.71187308,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        , -5.54825039,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -4.85861915,  0.        ,  0.        ,  0. 

In [271]:
# Split the training comments into consecutive mini-batches of roughly BATCH_SIZE rows.
# Integer division gives np.array_split a whole number of sections (the original passed a
# float), and max(1, ...) keeps a training set smaller than one batch from requesting
# zero sections, which np.array_split rejects.
BATCH_SIZE = 16
n_batches = max(1, len(attributes_train) // BATCH_SIZE)
mini_batches = np.array_split(attributes_train, n_batches)

In [272]:
# Inspect the resulting list of batches (each one is a slice of the training comments).
mini_batches

[472    Disappointment.. I hate anything that goes in ...
 306                              Has been working great.
 797    A good quality bargain.. I bought this after I...
 792                                         Great Phone.
 666                                          Nice Sound.
 29                                  Doesn't hold charge.
 784    I don't think it would hold it too securly on ...
 167    The look of it is very sharp and the screen is...
 886    I didn't want the clip going over the top of m...
 987                              Phone falls out easily.
 879    It seems completely secure, both holding on to...
 652    The earpiece on this is too large or too heavy...
 733    Arrived quickly and much less expensive than o...
 950                                  It is a joy to use.
 838    Reaching for the bottom row is uncomfortable, ...
 842     Terrible.. My car will not accept this cassette.
 Name: comment, dtype: object,
 188                                   Wa

In [274]:
# TODO: incremental training over the mini-batches is still disabled. The draft below is
# incorrect as written: it passes the raw text batch as both X and y. partial_fit needs
# the vectorized batch and the matching labels, e.g.
#     sgd2.partial_fit(tfidf.transform(mini_batch), labels_train.loc[mini_batch.index])
# for mini_batch in mini_batches:
#     sgd2.partial_fit(mini_batch,mini_batch)