In [3]:
import pandas as pd
import re
from bs4 import BeautifulSoup

In [None]:
# Load the labeled training reviews and the test reviews from tab-separated files.
# NOTE(review): the unlabeled set further down is read with quoting=3 but these two
# are not — confirm the difference in quote handling is intended.
train = pd.read_csv('labeledTrainData.tsv/labeledTrainData.tsv', delimiter='\t')
test = pd.read_csv('testData.tsv/testData.tsv', delimiter='\t')

In [None]:
# A cell only renders its last expression, so the training preview has to be
# displayed explicitly or it is silently discarded.
display(train.head())
test.head()

In [None]:
import nltk

In [None]:
# Fetch the NLTK stop-word corpus that review_to_text reads via stopwords.words('english').
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

**Preprocessing**

In [None]:
def review_to_text(review, remove_stopwords):
    """Convert one raw review into a list of lowercase word tokens.

    HTML markup is stripped with BeautifulSoup, every character outside
    ``a-zA-Z`` is replaced by a space, and the result is lowercased and
    split on whitespace.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.
    remove_stopwords : bool
        When true, tokens found in NLTK's English stop-word list are dropped.

    Returns
    -------
    list of str
        The cleaned tokens in their original order.
    """
    # NOTE(review): 'html' leaves the choice of concrete parser to bs4 —
    # confirm the same parser is installed wherever this notebook is run.
    raw = BeautifulSoup(review, 'html').get_text()
    letters = re.sub('[^a-zA-Z]', ' ', raw)
    words = letters.lower().split()
    if remove_stopwords:
        # Build the stop-word set once and cache it on the function, instead of
        # rebuilding it for every single review.
        stop_words = getattr(review_to_text, '_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            review_to_text._stop_words = stop_words
        words = [w for w in words if w not in stop_words]
    return words

In [None]:
# One cleaned, stop-word-free, space-joined string per training review.
X_train = [' '.join(review_to_text(text, True)) for text in train['review']]

In [None]:
# One cleaned, stop-word-free, space-joined string per test review.
X_test = [' '.join(review_to_text(text, True)) for text in test['review']]


In [None]:
# Training target: the 'sentiment' column of the labeled training frame.
y_train = train['sentiment']

**Training using CountVectorizer and TfidfVectorizer**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Word-level term counts feeding a multinomial naive Bayes classifier.
pip_count = Pipeline([
    ('count_vec', CountVectorizer(analyzer='word')),
    ('mnb', MultinomialNB()),
])

# Same classifier, fed by word-level TF-IDF features instead.
pip_tf = Pipeline([
    ('tf_vec', TfidfVectorizer(analyzer='word')),
    ('mnb', MultinomialNB()),
])

In [None]:
# Search grid for the count-vectorizer pipeline (keys are <step name>__<parameter>).
params_count = {
    'count_vec__binary': [True, False],
    'count_vec__ngram_range': [(1, 1), (1, 2)],
    'mnb__alpha': [0.1, 1.0, 10.0],
}

# Matching grid for the TF-IDF pipeline.
params_tf = {
    'tf_vec__binary': [True, False],
    'tf_vec__ngram_range': [(1, 1), (1, 2)],
    'mnb__alpha': [0.1, 1.0, 10.0],
}

In [None]:
# 4-fold grid search over the count-vectorizer pipeline.
gs_count = GridSearchCV(
    pip_count,
    params_count,
    cv=4,
    n_jobs=-1,
    verbose=1,
)
gs_count.fit(X_train, y_train)

In [None]:
# Best cross-validation score found by the count-vectorizer grid search.
gs_count.best_score_

In [None]:
# 4-fold grid search over the TF-IDF pipeline.
gs_tfidf = GridSearchCV(
    pip_tf,
    params_tf,
    cv=4,
    n_jobs=-1,
    verbose=1,
)
gs_tfidf.fit(X_train, y_train)

In [None]:
# Best cross-validation score found by the TF-IDF grid search.
gs_tfidf.best_score_

In [None]:
# Predict test-set sentiment with each of the two fitted grid searches.
y_pred_count = gs_count.predict(X_test)
y_pred_tfidf = gs_tfidf.predict(X_test)

In [None]:
# Pair each test review id with its predicted sentiment, one frame per model.
count_submission = pd.DataFrame({'id':test['id'], 'sentiment': y_pred_count})
tfidf_submission = pd.DataFrame({'id':test['id'], 'sentiment': y_pred_tfidf})

In [None]:
# Write both submissions to CSV in the working directory, without the index column.
count_submission.to_csv('count_submission.csv', index=False)
tfidf_submission.to_csv('tfidf_submission.csv', index=False)

**Training using Word2Vec (gensim) features, with NLTK sentence tokenization**

In [None]:
# Unlabeled reviews, used only to build the word2vec training corpus.
# quoting=3 makes pandas treat quote characters as ordinary text.
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv/unlabeledTrainData.tsv", delimiter='\t', quoting=3)

In [None]:
import nltk.data
# Fetch the punkt sentence-tokenizer data loaded in the next cell.
nltk.download('punkt')

In [None]:
# English sentence tokenizer passed to split_to_sentences.
# NOTE(review): this loads a pickled resource from the NLTK data directory; newer
# NLTK releases may package punkt differently — confirm against the installed version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
def split_to_sentences(review, tokenizer):
    """Split a review into sentences, each returned as a list of cleaned words.

    The stripped review is split by ``tokenizer.tokenize``; every non-empty
    sentence is passed through ``review_to_text`` with stop words kept.

    Parameters
    ----------
    review : str
        Raw review text.
    tokenizer : object
        Sentence tokenizer exposing ``tokenize(text)``.

    Returns
    -------
    list of list of str
        One token list per sentence.
    """
    return [
        review_to_text(sentence, False)
        for sentence in tokenizer.tokenize(review.strip())
        if len(sentence) > 0
    ]

In [None]:
# Flat list of tokenized sentences drawn from every unlabeled review.
corpora = []
for text in unlabeled_train['review']:
    corpora.extend(split_to_sentences(text, tokenizer))

In [None]:
"""
Hyperparameters
"""
# Values handed to word2vec.Word2Vec when the model is built.
num_features = 300  # passed as `size`; also the length of the per-review feature vectors
min_word_count = 20  # passed as `min_count`
num_workers = 3  # passed as `workers`
context = 10  # passed as `window`
downsampling = 1e-3  # passed as `sample`

In [None]:
from gensim.models import word2vec

In [None]:
# Train word2vec on the tokenized sentences using the hyperparameters defined above.
model = word2vec.Word2Vec(
    corpora,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

In [None]:
# NOTE(review): presumably normalizes the vectors in place to save memory,
# after which the model cannot be trained further — confirm against the installed gensim.
model.init_sims(replace=True)

In [None]:
# Sanity check: list the nearest neighbours of a sample word.
# Go through model.wv, as makeFeatureVec does, rather than the deprecated
# model-level accessor.
model.wv.most_similar('pee')

In [None]:
import numpy as np

In [None]:
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one review.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single review.
    model : object exposing ``wv``
        Trained word2vec model; ``model.wv`` must support ``word in model.wv``
        and ``model.wv[word]``.
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        Mean vector of shape ``(num_features,)``. All zeros when none of
        ``words`` is in the vocabulary.
    """
    featureVec = np.zeros((num_features,), dtype='float32')
    nwords = 0.
    # Test membership directly on the keyed vectors instead of rebuilding a
    # set of the whole vocabulary on every call.
    vectors = model.wv
    for word in words:
        if word in vectors:
            nwords = nwords + 1
            featureVec = np.add(featureVec, vectors[word])
    if nwords == 0:
        # No known word: dividing would give 0/0 = NaN in every component,
        # which downstream estimators reject. Return the zero vector instead.
        return featureVec
    return np.divide(featureVec, nwords)

In [None]:
def getFeatureVecs(reviews, model, num_features):
    """Convert tokenized reviews to a matrix of averaged word2vec vectors.

    Parameters
    ----------
    reviews : sequence of list of str
        Tokenized reviews; must support ``len``.
    model : object
        Trained word2vec model, forwarded to ``makeFeatureVec``.
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(reviews), num_features)``, one row per review.
    """
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype='float32')
    for row, tokens in enumerate(reviews):
        reviewFeatureVecs[row] = makeFeatureVec(tokens, model, num_features)
    return reviewFeatureVecs

In [None]:
# Token lists (stop words removed) for every training review.
clean_train_reviews = [
    review_to_text(text, remove_stopwords=True) for text in train['review']
]

In [None]:
# One averaged word2vec vector per training review.
trainDataVecs = getFeatureVecs(clean_train_reviews, model, num_features)

In [None]:
# Token lists (stop words removed) for every test review.
clean_test_reviews = [
    review_to_text(text, remove_stopwords=True) for text in test['review']
]

In [None]:
# One averaged word2vec vector per test review.
testDataVecs = getFeatureVecs(clean_test_reviews, model, num_features)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Gradient boosting classifier with default settings; tuned by the grid search below.
# NOTE(review): no random_state is passed, so results may differ between runs.
gdb = GradientBoostingClassifier()

In [None]:
# 3 x 3 x 3 = 27 candidate settings for the gradient boosting classifier.
params = {
    'n_estimators': [10, 100, 500],
    'learning_rate': [0.01, 0.1, 1.0],
    'max_depth': [2, 3, 4],
}

In [None]:
# 4-fold grid search over the gradient boosting parameter grid.
gs = GridSearchCV(gdb, params, cv=4, n_jobs=-1, verbose=1)

In [None]:
# Fit the search on the averaged word2vec features of the training reviews.
gs.fit(trainDataVecs, y_train)