In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [2]:
imdb = pd.read_csv('imdb_labelled.txt', sep='\t',header=None)
amazon = pd.read_csv('amazon_cells_labelled.txt', sep='\t',header=None)
yelp = pd.read_csv('yelp_labelled.txt', sep = '\t', header = None)

In [3]:
imdb.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
imdb.shape

(748, 2)

In [5]:
amazon.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [6]:
amazon.shape

(1000, 2)

In [7]:
yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [8]:
yelp.shape

(1000, 2)

In [9]:
df = pd.DataFrame()

In [10]:
df = df.append(imdb)

In [11]:
df = df.append(amazon)

In [12]:
df = df.append(yelp)

In [13]:
df.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [14]:
df.shape

(2748, 2)

In [15]:
df.columns = ['Review', 'Sentiments']

In [16]:
df.head()

Unnamed: 0,Review,Sentiments
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [17]:
df_tokens  = []
for i in range (len(df)):
    df_tokens.append(word_tokenize(df['Review'].iloc[i].lower()))

In [18]:
print(df_tokens[0])

['a', 'very', ',', 'very', ',', 'very', 'slow-moving', ',', 'aimless', 'movie', 'about', 'a', 'distressed', ',', 'drifting', 'young', 'man', '.']


In [19]:
eng_stopwords = stopwords.words('english')

In [20]:
eng_stopwords.extend(['.','"',':','!','?'])

In [21]:
words = []
for list_1 in df_tokens:
    main_words = []
    for word in list_1:
        if word not in eng_stopwords:
            main_words.append(word)
    words.append(main_words)

In [22]:
print(words[0])

[',', ',', 'slow-moving', ',', 'aimless', 'movie', 'distressed', ',', 'drifting', 'young', 'man']


In [23]:
wnet = WordNetLemmatizer()

In [24]:
for i in range(len(words)):
    for j in range(len(words[i])):
        lemm = wnet.lemmatize(words[i][j], pos='v')
        words[i][j] = lemm

In [25]:
print(words[0])

[',', ',', 'slow-moving', ',', 'aimless', 'movie', 'distress', ',', 'drift', 'young', 'man']


In [26]:
for i in range(len(words)):
    words[i] = ' '.join(words[i])

In [27]:
tfidf = TfidfVectorizer()

In [28]:
vect = tfidf.fit_transform(words)

In [29]:
vect.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [54]:
x_train,x_test,y_train,y_test = train_test_split(vect,df['Sentiments'],test_size=0.10)

In [55]:
x_train.shape

(2473, 4372)

In [56]:
y_train.shape

(2473,)

In [57]:
reg = LogisticRegression()

In [58]:
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [59]:
y_pred = reg.predict(x_test)

In [60]:
accuracy_score(y_test, y_pred)

0.8072727272727273

In [61]:
confusion_matrix(y_test, y_pred)

array([[121,  17],
       [ 36, 101]], dtype=int64)