In [9]:
import nltk
from nltk.corpus import stopwords, movie_reviews
import sklearn
import random


documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

stop_words = stopwords.words()
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
all_words = [item for item in all_words if (len(item)>2 or item not in stop_words)]
word_features = list(all_words)[:2000] [1]

def document_features(document):
    words = [item for item in document if (len(item)>2 or item not in stop_words)]
    document_words = set(words)

    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set)) 

0.65


In [10]:

raws = [(movie_reviews.raw(fileid), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
random.shuffle(raws)

print(raws[0][1])



neg


In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
import re
from nltk.stem import WordNetLemmatizer

def decode_cat(cat):
    if 'pos' in cat:
        return 1
    else:
        return -1
x = [row[0] for row in raws]
y = [decode_cat(row[1]) for row in raws]

stop_words = stopwords.words('english')
stop_words.extend('film movie just time story character life'.split())
lemmatiser = WordNetLemmatizer()

def preprocess_doc(doc):
    clean = re.sub('[^0-9a-zA-Z]+', ' ', doc)
    words = clean.lower().split()
    result = []
    for word in words:
        afterl = lemmatiser.lemmatize(word)
        if (len(afterl)>3 and afterl not in stop_words):
            result.append(afterl)
    return result

    
clf = MultinomialNB()
countvec = CountVectorizer(analyzer=preprocess_doc, max_df=0.9, min_df=4)
MNB_clf = Pipeline([('vect', countvec),  ('tfidf', TfidfTransformer()), ('clf', clf), ])

scores = cross_val_score(MNB_clf, x, y)
print("Acc= %f" % scores.mean())

Acc= 0.818970


In [8]:
from sklearn.decomposition import LatentDirichletAllocation

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
    
countvec = CountVectorizer(analyzer=preprocess_doc, max_df=0.9, min_df=4)
x_after_vec = countvec.fit_transform(x)

lda = LatentDirichletAllocation(n_components=2)
lda.fit(x_after_vec)

print("\nTopics in LDA model:")
feature_names = countvec.get_feature_names()
print_top_words(lda, feature_names, 20)




Topics in LDA model:
Topic #0: like even scene good make would much first well action also plot really know thing year could little take star
Topic #1: like make scene even love good much performance also year would well take work play comedy come best role people



In [17]:
clf = SVC()
SVC_clf = Pipeline([('vect', CountVectorizer()),  ('tfidf', TfidfTransformer()), ('clf', clf), ])

scores = cross_val_score(SVC_clf, x, y)
print("Acc= %f" % scores.mean())

Acc= 0.742004


In [18]:
clf = RandomForestClassifier()
random_clf = Pipeline([('vect', CountVectorizer()),  ('tfidf', TfidfTransformer()), ('clf', clf), ])

scores = cross_val_score(random_clf, x, y)
print("Acc= %f" % scores.mean())

Acc= 0.647484


In [9]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier()
countvec = CountVectorizer(stop_words='english')
nn_clf = Pipeline([('vect', countvec),  ('tfidf', TfidfTransformer()), ('clf', clf), ])

scores = cross_val_score(nn_clf, x, y)
print("Acc= %f" % scores.mean())


Acc= 0.819510


# Movies Reviews
## CountVectorizer
- Using CountVectorizer and TF-IDF improve performance than using simple nltk example method and classifier
- But using n-grams make decrease accuracy
- Removing Stop words barely improve accuracy
## Classifiers
- Random Forest: Least accuracy but fast though => 0.65
- SVM classifier: Slow but better than random forest => 0.74
- AdaBoostClassifier: Slow and not very good accuracy => 0.75
- GradientBoostingClassifier: Slow but good accuracy => 0.78
- MultinomialNB and LogisticRegression: Best performance with high accuracy => 0.82
- DNN MLPClassifier: Slow but slighty higher accuracy => 0.82

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100)
countvec = CountVectorizer(stop_words='english')
gb_clf = Pipeline([('vect', countvec),  ('tfidf', TfidfTransformer()), ('clf', clf), ])

scores = cross_val_score(gb_clf, x, y)
print("Acc= %f" % scores.mean())


Acc= 0.781995


In [13]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100)
countvec = CountVectorizer(stop_words='english')
ad_clf = Pipeline([('vect', countvec),  ('tfidf', TfidfTransformer()), ('clf', clf), ])

scores = cross_val_score(ad_clf, x, y)
print("Acc= %f" % scores.mean())

Acc= 0.753000


In [24]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
countvec = CountVectorizer(analyzer=preprocess_doc, max_df=0.9, min_df=4)
ad_clf = Pipeline([('vect', countvec),  ('tfidf', TfidfTransformer()), ('clf', clf), ])

scores = cross_val_score(ad_clf, x, y)
print("Acc= %f" % scores.mean())

Acc= 0.844004


In [14]:
import numpy as np
import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

count_vect = CountVectorizer(analyzer=preprocess_doc, max_df=0.9)
x_train_counts = count_vect.fit_transform(x).toarray()

input_dim = np.max(x_train_counts) + 1
embedding_dims = 20
model = Sequential()
model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
model.add(GlobalAveragePooling1D())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

x_train, x_test, y_train, y_test = train_test_split(x_train_counts, y, test_size=0.2)
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=25)
score = model.evaluate(x_test, y_test)
print(score)

Using TensorFlow backend.
  return f(*args, **kwds)


Train on 1600 samples, validate on 400 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
[0.69296298265457157, 0.5]
