In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files loaded later are read
# from beneath this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Create a blank English spaCy pipeline.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook —
# confirm it is still needed before keeping the spaCy dependency.
nlp = spacy.blank('en')

In [4]:
# Folder on the mounted Drive that holds the three CSV splits. Kept in one
# place so the notebook can be pointed at another copy of the data by editing
# a single line.
DATA_DIR = '/content/drive/MyDrive/sentiment'

# Each split is read into its own DataFrame (columns: text, label).
test = pd.read_csv(f'{DATA_DIR}/Test.csv')
train = pd.read_csv(f'{DATA_DIR}/Train.csv')
valid = pd.read_csv(f'{DATA_DIR}/Valid.csv')

In [5]:
# Preview the first rows of the test split to check its columns (text, label).
test.head()

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


In [6]:
# Stack the three splits into one frame (test rows first, then train, then
# valid) and renumber the index 0..n-1.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and yields the same row order and index.
# Note: despite its name, `y` is the full combined DataFrame (text + label),
# not just the target column. The dataset's original train/valid/test
# partition is discarded here and a new split is drawn later.
y = pd.concat([test, train, valid], ignore_index=True)

In [7]:
# Preview the combined frame; its first rows are the test rows because `test`
# was placed first when the splits were combined.
y.head()

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


In [8]:
# Sanity check on the combined size: 5000 + 40000 + 5000 = 50000 rows expected.
y.shape

(50000, 2)

In [9]:
# (rows, columns) of each individual split, for comparison with the combined frame.
test.shape, train.shape, valid.shape

((5000, 2), (40000, 2), (5000, 2))

In [10]:
# Separate the model input (raw review text) from the target (0/1 label).
features, labels = y['text'], y['label']

In [11]:
# Display the label Series (pandas abbreviates the middle of long output).
labels

0        0
1        0
2        0
3        1
4        0
        ..
49995    1
49996    1
49997    1
49998    1
49999    1
Name: label, Length: 50000, dtype: int64

In [12]:
# Display the review-text Series (pandas abbreviates long output and long strings).
features

0        I always wrote this series off as being a comp...
1        1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...
2        This movie was so poorly written and directed ...
3        The most interesting thing about Miryang (Secr...
4        when i first read about "berlin am meer" i did...
                               ...                        
49995    Man, I loved this movie! This really takes me ...
49996    Recovery is an incredibly moving piece of work...
49997    You can take the crook out of the joint, but i...
49998    FUTZ is the only show preserved from the exper...
49999    "The Mother" tells of a recently widowed mid-6...
Name: text, Length: 50000, dtype: object

In [13]:
# Read one complete review (index label 0) to see what the raw text looks like.
features[0]

'I always wrote this series off as being a complete stink-fest because Jim Belushi was involved in it, and heavily. But then one day a tragic happenstance occurred. After a White Sox game ended I realized that the remote was all the way on the other side of the room somehow. Now I could have just gotten up and walked across the room to get the remote, or even to the TV to turn the channel. But then why not just get up and walk across the country to watch TV in another state? "Nuts to that", I said. So I decided to just hang tight on the couch and take whatever Fate had in store for me. What Fate had in store was an episode of this show, an episode about which I remember very little except that I had once again made a very broad, general sweeping blanket judgment based on zero objective or experiential evidence with nothing whatsoever to back my opinions up with, and once again I was completely right! This show is a total crud-pie! Belushi has all the comedic delivery of a hairy lightho

In [14]:
# Count how often each distinct review text occurs, most frequent first;
# any count above 1 indicates a duplicated review.
features.value_counts()

Loved today's show!!! It was a variety and not solely cooking (which would have been great too). Very stimulating and captivating, always keeping the viewer peeking around the corner to see what was coming up next. She is as down to earth and as personable as you get, like one of us which made the show all the more enjoyable. Special guests, who are friends as well made for a nice surprise too. Loved the 'first' theme and that the audience was invited to play along too. I must admit I was shocked to see her come in under her time limits on a few things, but she did it and by golly I'll be writing those recipes down. Saving time in the kitchen means more time with family. Those who haven't tuned in yet, find out what channel and the time, I assure you that you won't be disappointed.                                                                                                                                                                                                                

In [15]:
# Class-balance check: number of reviews per label value.
labels.value_counts()

0    25000
1    25000
Name: label, dtype: int64

In [16]:
# Number of missing values in each column of the combined frame.
y.isnull().sum()

text     0
label    0
dtype: int64

In [22]:
# Logistic-regression classifier with scikit-learn's default hyperparameters.
logreg = LogisticRegression()

In [23]:
# TF-IDF text vectoriser; scikit-learn's built-in English stop-word list is
# removed from the vocabulary.
feature_extraction = TfidfVectorizer(stop_words= 'english')

In [24]:
# Hold out 20% of the reviews for evaluation; the fixed seed makes the split
# reproducible across runs.
# NOTE: the held-out texts are bound to `X_text` (not `X_test`); the name is
# left unchanged because the vectoriser's transform cell reads it.
X_train, X_text, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Learn the vocabulary and IDF weights from the training texts only, and
# convert those texts to a TF-IDF matrix in the same step.
X_train_features = feature_extraction.fit_transform(X_train)

In [26]:
# Vectorise the held-out texts with the already-fitted vocabulary/IDF weights
# (transform only — nothing is re-learned from the held-out data).
X_test_features = feature_extraction.transform(X_text)

In [27]:
# Train the classifier on the TF-IDF training matrix and its labels.
logreg.fit(X_train_features, y_train)

LogisticRegression()

In [28]:
# Predicted labels for the training texts (despite the `X_` prefix, this holds
# model predictions, not features).
X_train_predict = logreg.predict(X_train_features)

In [29]:
# Fraction of training reviews whose predicted label matches the true label.
training_accuracy = accuracy_score(y_train, X_train_predict)

In [30]:
# Show the training accuracy, for comparison with the held-out accuracy.
training_accuracy

0.93395

In [31]:
# Predicted labels for the held-out texts (again predictions, not features).
X_test_predict = logreg.predict(X_test_features)

In [32]:
# Fraction of held-out reviews whose predicted label matches the true label.
testing_accuracy = accuracy_score(y_test, X_test_predict)

In [33]:
# Show the held-out accuracy.
testing_accuracy

0.8944

In [34]:
# Per-class precision, recall, F1 and support on the held-out set. The report
# is a pre-formatted text table, so it is printed to keep its layout.
print(classification_report(y_test, X_test_predict))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4958
           1       0.89      0.90      0.90      5042

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [35]:
# Confusion matrix on the held-out set: rows are the true labels and columns
# the predicted labels (each row sums to that class's support in the report).
print(confusion_matrix(y_test, X_test_predict))

[[4386  572]
 [ 484 4558]]
