In [22]:
import pandas as pd
import numpy as np

# Data loading

In [2]:
fn='./data/rt-polarity.neg'

with open(fn, "r",encoding='utf-8', errors='ignore') as f: # some invalid symbols encountered
    content = f.read()
texts_neg =  content.splitlines()

print ('len of texts_neg = {:,}'.format (len(texts_neg)))
for review in texts_neg[:5]:
    print ( '\n', review)

len of texts_neg = 5,331

 simplistic , silly and tedious . 

 it's so laddish and juvenile , only teenage boys could possibly find it funny . 

 exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 

 [garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . 

 a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . 


In [7]:
fn='./data/rt-polarity.pos'

with open(fn, "r",encoding='utf-8', errors='ignore') as f:
    content = f.read()
texts_pos =  content.splitlines()

print ('len of texts_pos = {:,}'.format (len(texts_pos)))
for review in texts_pos[:5]:
    print ('\n', review)

len of texts_pos = 5,331

 the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

 the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth . 

 effective but too-tepid biopic

 if you sometimes like to go to the movies to have fun , wasabi is a good place to start . 

 emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . 


# Creating dataframe

In [10]:
df_neg = pd.DataFrame(texts_neg, columns=['reviews'])
df_neg['rating'] = 0
df_neg.head()

Unnamed: 0,reviews,rating
0,"simplistic , silly and tedious .",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


In [12]:
df_pos = pd.DataFrame(texts_pos, columns=['reviews'])
df_pos['rating'] = 1
df_pos.head()

Unnamed: 0,reviews,rating
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [14]:
df = pd.concat([df_pos, df_neg])
df[::1000]

Unnamed: 0,reviews,rating
0,the rock is destined to be the 21st century's ...,1
1000,the weakest of the four harry potter books has...,1
2000,a captivatingly quirky hybrid of character por...,1
3000,"expect no major discoveries , nor any stylish ...",1
4000,directed with purpose and finesse by england's...,1
5000,"affable if not timeless , like mike raises som...",1
669,"as the movie dragged on , i thought i heard a ...",0
1669,"at its best , queen is campy fun like the vinc...",0
2669,don't let your festive spirit go this far .,0
3669,"every visual joke is milked , every set-up obv...",0


# Split to train and test sets

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['reviews'],df['rating'],random_state=42)

# Logistic Regression

In [56]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(stop_words='english').fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

clf = LogisticRegression().fit(X_train_vectorized, y_train)

In [63]:
predictions = clf.predict(X_test_vectorized) 
print('f1: ', f1_score(y_test, predictions))

scores = clf.decision_function(X_test_vectorized)
print('AUC: ', roc_auc_score(y_test, scores))

f1:  0.7479182437547313
AUC:  0.8240452299063085


# Logistic regression with Word2Vec

In [34]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hello\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Hello\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [48]:
X_train_tokenized = [word_tokenize(text) for text in X_train]
X_test_tokenized = [word_tokenize(text) for text in X_test]

w2v_model = Word2Vec(X_train_tokenized,
                    vector_size=50,     # Dimensionality of word vectors
                    window=5,            # Context window size
                    min_count=2,         # Ignore words with frequency below this
                    workers=4)           # Number of threads

In [49]:
def document_vector(doc, model):
    # Remove out-of-vocabulary words
    vector = [model.wv[word] for word in doc if word in model.wv]
    if len(vector) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vector, axis=0)

X_train_w2v = np.array([document_vector(doc, w2v_model) for doc in X_train_tokenized])
X_test_w2v = np.array([document_vector(doc, w2v_model) for doc in X_test_tokenized])

In [50]:
clf = LogisticRegression().fit(X_train_w2v, y_train)

predictions = clf.predict(X_test_w2v) 
print('f1: ', f1_score(y_test, predictions))

scores = clf.decision_function(X_test_w2v)
print('AUC: ', roc_auc_score(y_test, scores))

f1:  0.6136849607982894
AUC:  0.6263878265792955


# Zero-Shot Classification

In [19]:
from transformers import pipeline

zero_shot_classifier = pipeline("zero-shot-classification",
                                model="facebook/bart-large-mnli",
                                batch_size=50)

candidate_labels = ["positive", "negative"]
predictions = []
scores = []

for review in X_test:
    result = zero_shot_classifier(review, candidate_labels)
    predictions.append(result['labels'][0])
    scores.append(result['scores'][result['labels'].index("positive")])

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [24]:
label_map = {"positive": 1, "negative": 0}
num_predictions = [label_map[pred] for pred in predictions]

print('f1: ', f1_score(y_test, num_predictions))
print('AUC: ', roc_auc_score(y_test, scores))

f1:  0.7913786091907279
AUC:  0.8936649202578358


# Results
So in conclusion, zero_shot_classifier gave relatively the best results and Logistic regression with Word2Vec gave the worst. But zero_shot_classifier took a lot of time and its results not much grater than from standart logistic regression. Word2Vec worked pretty bad, maybe because logistic regression can't properly separate dense vectors, so perhaps I shoud have choose a different classifying method