In [1]:
from nltk.corpus import movie_reviews
import nltk

In [None]:
nltk.download('stopwords')

In [2]:
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

In [3]:
negfeats = [" ".join(movie_reviews.words(fileids=[f])) for f in negids]
posfeats = [" ".join(movie_reviews.words(fileids=[f])) for f in posids]

In [4]:
texts = negfeats + posfeats
labels = [0] * len(negfeats) + [1] * len(posfeats)

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.pipeline import Pipeline

import numpy as np

In [6]:
FOLDS = 5

In [7]:
def get_pipeline(vectorizer, classifier):
    return Pipeline(
            [("vectorizer", vectorizer),
            ("classifier", classifier)]
        )

In [8]:
mean_cnt_vec = cross_val_score(
    get_pipeline(CountVectorizer(), LogisticRegression()),
    texts, labels, cv = FOLDS).mean()

std_cnt_vec = cross_val_score(
    get_pipeline(CountVectorizer(), LogisticRegression()),
    texts, labels, cv = FOLDS).std()

In [9]:
mean_tfidf_vec = cross_val_score(
    get_pipeline(TfidfVectorizer(), LogisticRegression()),
    texts, labels, cv = FOLDS).mean()

std_tfidf_vec = cross_val_score(
    get_pipeline(TfidfVectorizer(), LogisticRegression()),
    texts, labels, cv = FOLDS).std()

In [10]:
l = [mean_cnt_vec, std_cnt_vec, mean_tfidf_vec, std_tfidf_vec]

print (l)

with open("output/conf_model_answer1.txt", "w") as f:
    f.write(" ".join(map(str, l)))

[0.841, 0.01677796173556255, 0.8210000000000001, 0.004062019202317978]


In [11]:
mean_mindf_10 = cross_val_score(
    get_pipeline(CountVectorizer(min_df=10), LogisticRegression()),
    texts, labels, cv = FOLDS).mean()

mean_mindf_50 = cross_val_score(
    get_pipeline(CountVectorizer(min_df=50), LogisticRegression()),
    texts, labels, cv = FOLDS).mean()

In [12]:
l = [mean_mindf_10, mean_mindf_50]

print (l)

with open("output/conf_model_answer2.txt", "w") as f:
    f.write(" ".join(map(str, l)))

[0.8390000000000001, 0.813]


In [13]:
diff_classifiers = [LogisticRegression, LinearSVC, SGDClassifier] 

diff_classifiers_estim = [cross_val_score(get_pipeline(CountVectorizer(), clf()), texts, labels, cv = FOLDS).mean() for clf in diff_classifiers]



In [14]:
print (diff_classifiers_estim)
print (min(diff_classifiers_estim))

with open("output/conf_model_answer3.txt", "w") as f:
    f.write(str(min(diff_classifiers_estim)))

[0.841, 0.8325000000000001, 0.766]
0.766


In [15]:
nltk_stop_words = nltk.corpus.stopwords.words('english')

In [16]:
mean_nltk_sw = cross_val_score(
    get_pipeline(CountVectorizer(stop_words=nltk_stop_words), LogisticRegression()),
    texts, labels, cv = FOLDS).mean()

mean_sklearn_sw = cross_val_score(
    get_pipeline(CountVectorizer(stop_words="english"), LogisticRegression()),
    texts, labels, cv = FOLDS).mean()

In [17]:
l = [mean_nltk_sw, mean_sklearn_sw]

print (l)

with open("output/conf_model_answer4.txt", "w") as f:
    f.write(" ".join(map(str, l)))

[0.841, 0.8390000000000001]


In [18]:
mean_bigram = cross_val_score(
    get_pipeline(CountVectorizer(ngram_range=(1,2)), LogisticRegression()),
    texts, labels, cv = FOLDS).mean()

mean_35wb = cross_val_score(
    get_pipeline(CountVectorizer(ngram_range=(3,5), analyzer="char_wb"), LogisticRegression()),
    texts, labels, cv = FOLDS).mean()

In [19]:
l = [mean_bigram, mean_35wb]

print (l)

with open("output/conf_model_answer5.txt", "w") as f:
    f.write(" ".join(map(str, l)))

[0.8525, 0.8205]
