In [2]:
import numpy as np
import pandas as pd
import scipy as sp
from tqdm import tqdm_notebook

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score

In [28]:
from nltk.corpus import movie_reviews
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Aleksey.Ilin\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aleksey.Ilin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aleksey.Ilin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

In [6]:
negfeats = [movie_reviews.words(fileids=[f]) for f in negids]
posfeats = [movie_reviews.words(fileids=[f]) for f in posids]

In [7]:
all_classes = [1]*len(posfeats) + [0]*len(negfeats)
all_reviews = posfeats + negfeats
reviews_better = [' '.join(l) for l in all_reviews]

# 1 q

In [8]:
pipe_co = Pipeline([('vectorizer', CountVectorizer()), ('lr', LogisticRegression())])
pipe_tf = Pipeline([('vectorizer', TfidfVectorizer()), ('lr', LogisticRegression())])

In [9]:
%%time
acc_cv_co = cross_val_score(X=reviews_better, y=all_classes, estimator=pipe_co, n_jobs=-1, scoring='accuracy', cv=5)
acc_cv_tf = cross_val_score(X=reviews_better, y=all_classes, estimator=pipe_tf, n_jobs=-1, scoring='accuracy', cv=5)

Wall time: 9.35 s


In [18]:
with open('1ans.txt', 'w') as f:
    f.write(' '.join(np.array([acc_cv_co.mean(), acc_cv_co.std(), acc_cv_tf.mean(), acc_cv_tf.std()]).astype(str)))

# 2 q

In [19]:
pipe_co_10 = Pipeline([('vectorizer', CountVectorizer(min_df=10)), ('lr', LogisticRegression())])
acc_cv_co_10 = cross_val_score(X=reviews_better, y=all_classes, estimator=pipe_co_10, n_jobs=-1, scoring='accuracy', cv=5)

pipe_co_50 = Pipeline([('vectorizer', CountVectorizer(min_df=50)), ('lr', LogisticRegression())])
acc_cv_co_50 = cross_val_score(X=reviews_better, y=all_classes, estimator=pipe_co_50, n_jobs=-1, scoring='accuracy', cv=5)

In [20]:
with open('2ans.txt', 'w') as f:
    f.write(' '.join(np.array([acc_cv_co_10.mean(), acc_cv_co_50.mean()]).astype(str)))

# 3 q

In [22]:
pipe_sgd = Pipeline([('vectorizer', CountVectorizer()), ('est', SGDClassifier(random_state=42))])
acc_cv_sgd = cross_val_score(X=reviews_better, y=all_classes, estimator=pipe_sgd, n_jobs=-1, scoring='accuracy', cv=5)

pipe_svm = Pipeline([('vectorizer', CountVectorizer(min_df=50)), ('est', LinearSVC())])
acc_cv_svm = cross_val_score(X=reviews_better, y=all_classes, estimator=pipe_svm, n_jobs=-1, scoring='accuracy', cv=5)

In [25]:
with open('3ans.txt', 'w') as f:
    f.write(str(min([acc_cv_sgd.mean(), acc_cv_svm.mean(), acc_cv_co.mean()])))

In [26]:
print([acc_cv_sgd.mean(), acc_cv_svm.mean(), acc_cv_co.mean()])

[0.756, 0.796, 0.841]


# 4 q

In [30]:
stop_nltk = nltk.corpus.stopwords.words('english')

In [35]:
pipe_co_nltk = Pipeline([('vectorizer', CountVectorizer(stop_words=stop_nltk)), ('lr', LogisticRegression())])
acc_cv_co_nltk = cross_val_score(X=reviews_better, y=all_classes, estimator=pipe_co_nltk, n_jobs=-1, 
                                 scoring='accuracy', cv=5)

pipe_co_built = Pipeline([('vectorizer', CountVectorizer(stop_words='english')), ('lr', LogisticRegression())])
acc_cv_co_built = cross_val_score(X=reviews_better, y=all_classes, estimator=pipe_co_built, n_jobs=-1, 
                                 scoring='accuracy', cv=5)

In [32]:
with open('4ans.txt', 'w') as f:
    f.write(' '.join(np.array([acc_cv_co_nltk.mean(), acc_cv_co_built.mean()]).astype(str)))

# 5 q

In [40]:
pipe_co_2gr = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))), ('lr', LogisticRegression())])
acc_cv_co_2gr = cross_val_score(X=reviews_better, y=all_classes, estimator=pipe_co_2gr, n_jobs=-1, 
                                 scoring='accuracy', cv=5)

pipe_co_35gr = Pipeline([('vectorizer', CountVectorizer(ngram_range=(3,5), analyzer='char_wb')), ('lr', LogisticRegression())])
acc_cv_co_35gr = cross_val_score(X=reviews_better, y=all_classes, estimator=pipe_co_35gr, n_jobs=-1, 
                                 scoring='accuracy', cv=5)

In [42]:
with open('5ans.txt', 'w') as f:
    f.write(' '.join(np.array([acc_cv_co_2gr.mean(), acc_cv_co_35gr.mean()]).astype(str)))