In [1]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
imdb = pd.read_csv("SentimentAnalysis/imdb_labelled.txt",sep='\t',header=None)
amazon = pd.read_csv("SentimentAnalysis/amazon_cells_labelled.txt",sep='\t',header=None)
yelp = pd.read_csv("SentimentAnalysis/yelp_labelled.txt",sep='\t',header=None)

In [3]:
df = pd.DataFrame()

In [4]:
df = pd.concat([imdb,amazon,yelp],ignore_index=True)

In [5]:
df.columns = ['Review','Sentiment']

In [6]:
print(df.head())

                                              Review  Sentiment
0  A very, very, very slow-moving, aimless movie ...          0
1  Not sure who was more lost - the flat characte...          0
2  Attempting artiness with black & white and cle...          0
3       Very little music or anything to speak of.            0
4  The best scene in the movie was when Gerardo i...          1


In [7]:
tokenList = []
for i in range(df.shape[0]):
    tokenList.append(word_tokenize(df['Review'].iloc[i]))

In [8]:
print(tokenList[:2])

[['A', 'very', ',', 'very', ',', 'very', 'slow-moving', ',', 'aimless', 'movie', 'about', 'a', 'distressed', ',', 'drifting', 'young', 'man', '.'], ['Not', 'sure', 'who', 'was', 'more', 'lost', '-', 'the', 'flat', 'characters', 'or', 'the', 'audience', ',', 'nearly', 'half', 'of', 'whom', 'walked', 'out', '.']]


In [9]:
stopwords_list = stopwords.words('english')
stopwords_list.extend([',','.','-','!',"(",")"])

In [10]:
wordsList = []
for tokens in tokenList:
    word_temp = []
    for word in tokens:
        if word.lower() not in stopwords_list:
            word_temp.append(word.lower())
    wordsList.append(word_temp)

In [11]:
print(wordsList[:2])

[['slow-moving', 'aimless', 'movie', 'distressed', 'drifting', 'young', 'man'], ['sure', 'lost', 'flat', 'characters', 'audience', 'nearly', 'half', 'walked']]


In [12]:
wnet = WordNetLemmatizer()

In [13]:
for i in range(len(wordsList)):
    for j in range(len(wordsList[i])):
        wordsList[i][j] = wnet.lemmatize(wordsList[i][j],pos='v')

In [14]:
print(wordsList[:2])

[['slow-moving', 'aimless', 'movie', 'distress', 'drift', 'young', 'man'], ['sure', 'lose', 'flat', 'character', 'audience', 'nearly', 'half', 'walk']]


In [15]:
wordsList = np.asarray(wordsList)

In [16]:
for i in range(len(wordsList)):
    wordsList[i] = ' '.join(wordsList[i])

In [17]:
print(wordsList[:2])

['slow-moving aimless movie distress drift young man'
 'sure lose flat character audience nearly half walk']


In [18]:
cv = CountVectorizer()       

In [19]:
vector = cv.fit_transform(wordsList)

In [20]:
reg = LogisticRegression()

In [21]:
y = df['Sentiment'].values

In [22]:
xtrain,xtest,ytrain,ytest = train_test_split(vector,y,test_size=0.25)

In [23]:
reg.fit(xtrain,ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [24]:
ypred = reg.predict(xtest)

In [25]:
print(accuracy_score(ypred,ytest))

0.7918486171761281


In [26]:
print(confusion_matrix(ypred,ytest))

[[279  72]
 [ 71 265]]


In [27]:
rev = """Friendly atmosphere all over
Great growth opportunities ,u grow with the company.
Great managers and team members.
Always supportive in all phase of works"""

In [28]:
tokens = word_tokenize(rev)

In [31]:
wordsL = []
for word in tokens:
    if word.lower() not in stopwords_list:
        wordsL.append(word.lower())

In [32]:
for i in range(len(wordsL)):
    wordsL[i] = wnet.lemmatize(wordsL[i],pos='v')

In [33]:
print(wordsL)

['friendly', 'atmosphere', 'great', 'growth', 'opportunities', 'u', 'grow', 'company', 'great', 'managers', 'team', 'members', 'always', 'supportive', 'phase', 'work']


In [34]:
sent = []

In [40]:
sent = ' '.join(wordsL)

In [46]:
print([sent])

['friendly atmosphere great growth opportunities u grow company great managers team members always supportive phase work']


In [42]:
vect = cv.transform([sent])

In [44]:
senti = reg.predict(vect)

In [47]:
print(senti)

[1]
