In [4]:
### Step 1: Loading the data
###---------------------------

import os

def init_dict(a_dir):
    a_dict = {}
    file_list = os.listdir(a_dir)
    for a_file in file_list:
        f = open(a_dir + a_file, 'r')
        a_dict[a_file] = f.read()
        f.close()
    return a_dict

def print_dict(a_dict, maxn=5):
    for key in sorted(a_dict.keys()):
        print (key, ":\n" , a_dict.get(key))

pos = init_dict("data/review_polarity_tar/txt_sentoken/pos/")
neg = init_dict("data/review_polarity_tar/txt_sentoken/neg/")
#print_dict(pos)
#pos.items()

In [5]:
all_reviews = [(text, "pos") for (review, text) in pos.items()]
all_reviews += [(text, "neg") for (review, text) in neg.items()]
#all_reviews
#print(len(all_reviews)) #### 1000 pos and 1000 neg!!!!!!!!!!!
type(all_reviews)
#all_reviews

list

In [6]:
print(len(all_reviews)) #### 2000 

2000


In [7]:
#### TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(all_reviews, test_size=0.2)

In [10]:
### Step 2: Preprocessing the data
###--------------------------------

from spacy.lang.en.stop_words import STOP_WORDS as stopwords
import spacy
import string
punctuations = string.punctuation
nlp = spacy.load('en_core_web_sm')

# Modify the code to return only words of certain PoS
def spacy_tokenizer(text):
    tokens = nlp(text)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in stopwords and tok not in punctuations)]
    return (' '.join(tokens))

In [9]:
train_targets = []
test_targets = []

train_data = [spacy_tokenizer(text) for (text, _) in train_set]
test_data  = [spacy_tokenizer(text) for (text, _) in test_set]

In [11]:
for (text, label) in train_set:
    if label=="pos": train_targets.append(1)
    else: train_targets.append(0)

for (text, label) in test_set:
    if label=="pos": test_targets.append(1)
    else: test_targets.append(0)

In [12]:
### Step 3: Extracting the features &
### Step 4: Training a classifier
###---------------------------------
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())
                    ])

text_clf.fit(train_data, train_targets)
predicted = text_clf.predict(test_data)

In [13]:
# now you can look at the result and compare with the ground truth
for (text, pred, truth) in list(zip(test_data, predicted, test_targets))[:10]:
    print("{} => pred: {} <+> truth: {}".format(text[:75], pred, truth))

reviewer ignorant hand responsible deliver original version abyss 1989 medd => pred: 1 <+> truth: 1
surprise know joel ethan coen bring unabated lunacy movie screen raise ariz => pred: 1 <+> truth: 1
brother 's favorite movie h b halicki 's 1974 cult flick second good produc => pred: 0 <+> truth: 0
follow review encompass version dune dune theatrical version 1984 runtime 1 => pred: 0 <+> truth: 0
bad mimic definitly scar mimic continue frightening hollywood trend foreign => pred: 0 <+> truth: 0
sadly like movie year good bad omega code opening credit good feeling bad g => pred: 0 <+> truth: 0
oh behave felicity shagwell shagadellic babe horny female fembot breast req => pred: 0 <+> truth: 1
plot bunch kid haunt house play parody horror non horror movie oh yeah ghos => pred: 0 <+> truth: 0
disillusioned try find spice life richard leonardo dicaprio set thailand me => pred: 0 <+> truth: 0
movie deep religious spiritual undertone surprising find messenger story jo => pred: 1 <+> truth: 0


In [14]:
### Step 5: Evaluating the classifier
###---------------------------------
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

print(confusion_matrix(test_targets, predicted))
print(accuracy_score(test_targets, predicted))

[[171  19]
 [ 71 139]]
0.775


In [15]:
from sklearn.metrics import classification_report
print(classification_report(test_targets, predicted))

              precision    recall  f1-score   support

           0       0.71      0.90      0.79       190
           1       0.88      0.66      0.76       210

    accuracy                           0.78       400
   macro avg       0.79      0.78      0.77       400
weighted avg       0.80      0.78      0.77       400



In [16]:
#### Linear Support Vector Classification   
from sklearn.svm import LinearSVC

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LinearSVC())])

# Train the classifier and predict
text_clf.fit(train_data, train_targets)  
predicted = text_clf.predict(test_data)

# Evaluate the performance
print(accuracy_score(test_targets, predicted))
print(confusion_matrix(test_targets, predicted))
print(classification_report(test_targets, predicted))

0.8275
[[166  24]
 [ 45 165]]
              precision    recall  f1-score   support

           0       0.79      0.87      0.83       190
           1       0.87      0.79      0.83       210

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400



In [17]:
####  linear classifiers with Stochastic Gradient Descent training. 

from sklearn.linear_model import SGDClassifier

# Initialise a pipeline with a classifier of your choice
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, max_iter=50, tol=1e-3,
                                           random_state=42,)),
])

# Train the classifier and predict
text_clf.fit(train_data, train_targets)  
predicted = text_clf.predict(test_data)

# Evaluate the performance
print(accuracy_score(test_targets, predicted))
print(confusion_matrix(test_targets, predicted))
print(classification_report(test_targets, predicted))

0.8175
[[161  29]
 [ 44 166]]
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       190
           1       0.85      0.79      0.82       210

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [18]:
#### grid search cv

from sklearn.model_selection import GridSearchCV

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
              'clf__penalty': ('l1', 'l2')}

# apply a GridSearchCV on the pipeline
gs_clf = GridSearchCV(text_clf, parameters)
gs_clf = gs_clf.fit(train_data, train_targets)

predicted = gs_clf.predict(test_data)

# Evaluate the performance
print(accuracy_score(test_targets, predicted))
print(confusion_matrix(test_targets, predicted))
print(classification_report(test_targets, predicted))

for param_name in sorted(parameters.keys()):
    print("{}: {}".format(param_name, gs_clf.best_params_[param_name]))

0.805
[[152  38]
 [ 40 170]]
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       190
           1       0.82      0.81      0.81       210

    accuracy                           0.81       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.81      0.81      0.81       400

clf__alpha: 0.001
clf__penalty: l2
tfidf__use_idf: False
vect__ngram_range: (1, 1)
