In [1]:
import numpy as np
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer

In [2]:
pos_review = pd.read_csv('pos.txt',encoding='latin-1',header=None,sep='\n')
pos_review['mood']=1
pos_review.rename(columns={0:'review'},inplace=True)
neg_review = pd.read_csv('negative.txt',encoding='latin-1',header=None,sep='\n')
neg_review['mood']=0
neg_review.rename(columns={0:'review'},inplace=True)

In [3]:
pos_review

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [4]:
neg_review



Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


In [5]:
pos_review['review'] = pos_review['review'].apply(lambda x:x.lower())
sw = stopwords.words('english')
pos_review['review'] = pos_review['review'].apply(lambda x:"".join([word for word in x.split() if word not in sw]))
pos_review['review'] = pos_review['review'].apply(lambda x:"".join([word for word in x.split() if word not in string.punctuation]))


In [6]:
neg_review['review'] = neg_review['review'].apply(lambda x:x.lower())
sw = stopwords.words('english')
neg_review['review'] = neg_review['review'].apply(lambda x:"".join([word for word in x.split() if word not in sw]))
neg_review['review'] = neg_review['review'].apply(lambda x:"".join([word for word in x.split() if word not in string.punctuation]))



In [7]:
all_data = pd.concat([pos_review,neg_review],axis=0).reset_index(drop=True)


In [29]:
X_train,X_test,y_train,y_test = train_test_split(all_data['review'].values,all_data['mood'].values,test_size=0.2,random_state=99)

In [30]:
train_data = pd.DataFrame({'review':X_train , 'mood':y_train})
test_data = pd.DataFrame({'review':X_test , 'mood':y_test})

In [31]:
train_data

Unnamed: 0,review,mood
0,deuceswildencyclopediaclichesshopliftsshameles...,0
1,"everentertainednotiontitlefilmimplies,sexstran...",0
2,"oedekerkwrotepatchadams,forgiven.givenfreereig...",0
3,"forgetmisleadingtitle,what'sunexplainedbaboonc...",0
4,[crystaldeniro]managesqueezegoodlaughsenoughma...,0
...,...,...
8524,director'stwitchysketchbookstyleadroitperspect...,0
8525,storyintelligenthighschoolstudentsdealsfirstlo...,1
8526,"insightsdreamworldteenlife,electronicexpressio...",0
8527,swallowabsurditiescruditieslagaanreallyenormou...,1


In [32]:
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data['review'])
test_vectors = vectorizer.transform(test_data['review'])


In [33]:
test_vectors

<2133x22102 sparse matrix of type '<class 'numpy.float64'>'
	with 1039 stored elements in Compressed Sparse Row format>

In [34]:
from sklearn import svm
from sklearn.metrics import classification_report

classifier = svm.SVC()
classifier.fit(train_vectors,train_data['mood'])

SVC()

In [35]:
pred = classifier.predict(test_vectors)


In [36]:
report = classification_report(test_data['mood'],pred,output_dict=True)

In [37]:
report

{'0': {'precision': 0.6532258064516129,
  'recall': 0.2288135593220339,
  'f1-score': 0.3389121338912134,
  'support': 1062},
 '1': {'precision': 0.534923339011925,
  'recall': 0.8795518207282913,
  'f1-score': 0.6652542372881356,
  'support': 1071},
 'accuracy': 0.5555555555555556,
 'macro avg': {'precision': 0.5940745727317689,
  'recall': 0.5541826900251626,
  'f1-score': 0.5020831855896745,
  'support': 2133},
 'weighted avg': {'precision': 0.5938249894671283,
  'recall': 0.5555555555555556,
  'f1-score': 0.502771671039879,
  'support': 2133}}

In [38]:
print(f"Positive {report['1']['recall']}")
print(f"Negative {report['0']['recall']}")

Positive 0.8795518207282913
Negative 0.2288135593220339
