In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
imdb = pd.read_csv("SentimentAnalysis/imdb_labelled.txt",sep='\t',header=None)
amazon = pd.read_csv("SentimentAnalysis/amazon_cells_labelled.txt",sep='\t',header=None)
yelp = pd.read_csv("SentimentAnalysis/yelp_labelled.txt",sep='\t',header=None)

In [3]:
print(imdb.head())

                                                   0  1
0  A very, very, very slow-moving, aimless movie ...  0
1  Not sure who was more lost - the flat characte...  0
2  Attempting artiness with black & white and cle...  0
3       Very little music or anything to speak of.    0
4  The best scene in the movie was when Gerardo i...  1


In [4]:
print(imdb.shape)

(748, 2)


In [5]:
print(amazon.shape)

(1000, 2)


In [6]:
print(yelp.shape)

(1000, 2)


In [7]:
df = pd.DataFrame()

In [8]:
df = pd.concat([imdb,amazon,yelp],ignore_index=True)

In [9]:
df.columns = ['Review','Sentiment']

In [10]:
print(df.head())

                                              Review  Sentiment
0  A very, very, very slow-moving, aimless movie ...          0
1  Not sure who was more lost - the flat characte...          0
2  Attempting artiness with black & white and cle...          0
3       Very little music or anything to speak of.            0
4  The best scene in the movie was when Gerardo i...          1


In [11]:
print(df.shape)

(2748, 2)


In [12]:
print(df.iloc[0])

Review       A very, very, very slow-moving, aimless movie ...
Sentiment                                                    0
Name: 0, dtype: object


In [13]:
print(df.loc[0])

Review       A very, very, very slow-moving, aimless movie ...
Sentiment                                                    0
Name: 0, dtype: object


# Tokenization

In [15]:
tokens = []
for i in range(df.shape[0]):
    tokens.append(word_tokenize(df['Review'].iloc[i]))

In [16]:
print(tokens[:5])

[['A', 'very', ',', 'very', ',', 'very', 'slow-moving', ',', 'aimless', 'movie', 'about', 'a', 'distressed', ',', 'drifting', 'young', 'man', '.'], ['Not', 'sure', 'who', 'was', 'more', 'lost', '-', 'the', 'flat', 'characters', 'or', 'the', 'audience', ',', 'nearly', 'half', 'of', 'whom', 'walked', 'out', '.'], ['Attempting', 'artiness', 'with', 'black', '&', 'white', 'and', 'clever', 'camera', 'angles', ',', 'the', 'movie', 'disappointed', '-', 'became', 'even', 'more', 'ridiculous', '-', 'as', 'the', 'acting', 'was', 'poor', 'and', 'the', 'plot', 'and', 'lines', 'almost', 'non-existent', '.'], ['Very', 'little', 'music', 'or', 'anything', 'to', 'speak', 'of', '.'], ['The', 'best', 'scene', 'in', 'the', 'movie', 'was', 'when', 'Gerardo', 'is', 'trying', 'to', 'find', 'a', 'song', 'that', 'keeps', 'running', 'through', 'his', 'head', '.']]


# Removal of Stopwords

In [17]:
stopwords_list = stopwords.words('english')
stopwords_list.extend(['.',',','-','!'])

In [18]:
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [19]:
wordsList = []
for words in tokens:
    words_temp = []
    for word in words:
        if word.lower() not in stopwords_list:
            words_temp.append(word.lower())
    wordsList.append(words_temp)
del words_temp

In [20]:
print(wordsList[:5])

[['slow-moving', 'aimless', 'movie', 'distressed', 'drifting', 'young', 'man'], ['sure', 'lost', 'flat', 'characters', 'audience', 'nearly', 'half', 'walked'], ['attempting', 'artiness', 'black', '&', 'white', 'clever', 'camera', 'angles', 'movie', 'disappointed', 'became', 'even', 'ridiculous', 'acting', 'poor', 'plot', 'lines', 'almost', 'non-existent'], ['little', 'music', 'anything', 'speak'], ['best', 'scene', 'movie', 'gerardo', 'trying', 'find', 'song', 'keeps', 'running', 'head']]


# Stemming and Lemmatization

In [21]:
wnet = WordNetLemmatizer()

In [22]:
for i in range(len(wordsList)):
    wordsLem = []
    for j in range(len(wordsList[i])):
        wordsLem.append(wnet.lemmatize(wordsList[i][j],pos='v'))
    wordsList[i] = wordsLem

In [23]:
print(wordsList[:5])

[['slow-moving', 'aimless', 'movie', 'distress', 'drift', 'young', 'man'], ['sure', 'lose', 'flat', 'character', 'audience', 'nearly', 'half', 'walk'], ['attempt', 'artiness', 'black', '&', 'white', 'clever', 'camera', 'angle', 'movie', 'disappoint', 'become', 'even', 'ridiculous', 'act', 'poor', 'plot', 'line', 'almost', 'non-existent'], ['little', 'music', 'anything', 'speak'], ['best', 'scene', 'movie', 'gerardo', 'try', 'find', 'song', 'keep', 'run', 'head']]


# Vectorization 

In [24]:
cv = CountVectorizer()

In [25]:
wordsList = np.asarray(wordsList)

In [26]:
for i in range(len(wordsList)):
    wordsList[i] = ' '.join(wordsList[i])

In [27]:
print(wordsList[1])

sure lose flat character audience nearly half walk


In [28]:
vect = cv.fit_transform(wordsList)

In [29]:
print(vect[:2,:100].toarray())

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


# Train Test split

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
y = df['Sentiment'].values

In [32]:
xtrain,xtest,ytrain,ytest = train_test_split(vect,y,test_size=0.25)

# Logistic Regression

In [33]:
reg = LogisticRegression()

In [34]:
reg.fit(xtrain,ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [35]:
ypred = reg.predict(xtest)

In [36]:
print(ypred[:9])

[1 0 1 0 0 1 0 1 0]


In [37]:
confusion_matrix(ypred,ytest)

array([[283,  70],
       [ 58, 276]], dtype=int64)

In [38]:
accuracy_score(ypred,ytest)

0.8136826783114993

# Testing A Review

In [80]:
rev = """Friendly atmosphere all over
Great growth opportunities ,u grow with the company.
Great managers and team members.
Always supportive in all phase of works"""

In [81]:
tokens = word_tokenize(rev)

In [82]:
print(tokens)

['Friendly', 'atmosphere', 'all', 'over', 'Great', 'growth', 'opportunities', ',', 'u', 'grow', 'with', 'the', 'company', '.', 'Great', 'managers', 'and', 'team', 'members', '.', 'Always', 'supportive', 'in', 'all', 'phase', 'of', 'works']


In [83]:
wordsList = []
for word in tokens:
    if word.lower() not in stopwords_list:
        wordsList.append(word.lower())
        

In [84]:
print(wordsList)

['friendly', 'atmosphere', 'great', 'growth', 'opportunities', 'u', 'grow', 'company', 'great', 'managers', 'team', 'members', 'always', 'supportive', 'phase', 'works']


In [85]:
for i in range(len(wordsList)):
    wordsList[i] = wnet.lemmatize(wordsList[i],pos='v')

In [86]:
print(wordsList)

['friendly', 'atmosphere', 'great', 'growth', 'opportunities', 'u', 'grow', 'company', 'great', 'managers', 'team', 'members', 'always', 'supportive', 'phase', 'work']


In [87]:
sent = ' '.join(wordsList)

In [88]:
vect = cv.transform([sent])

In [89]:
senti = reg.predict(vect)

In [90]:
print(senti)

[1]
