In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize
import gensim
from gensim.models import word2vec
import logging
# Configure the root logger to emit timestamped records at INFO level and above
# (presumably so gensim's training progress is visible — confirm).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load the three tab-separated review files; row 0 of each file is the header.
train = pd.read_csv(
    "../input/labeledTrainData.tsv",
    sep="\t",
    header=0,
)
test = pd.read_csv(
    "../input/testData.tsv",
    sep="\t",
    header=0,
)
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as ordinary text.
# Only the unlabeled file is read this way.
unlabeled_train = pd.read_csv(
    "../input/unlabeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

In [None]:
# Report how many reviews each of the three data frames holds.
print(
    '%d labeled train reviews\n%d labeled test reviews \
\n%d unlabeled reviews\n'
    % tuple(frame["review"].size for frame in (train, test, unlabeled_train))
)

In [None]:
_ENGLISH_STOPWORDS = None  # filled lazily by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stop-word set, loading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def sent_to_words(sent):
    """Convert a raw piece of review text into a list of cleaned words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case, split on whitespace, and drop English stop words.

    Parameters
    ----------
    sent : str
        Raw text (a sentence or a whole review), possibly containing HTML.

    Returns
    -------
    list of str
        Lower-case alphabetic tokens that are not English stop words.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, so results can differ between environments.
    sent_text = BeautifulSoup(sent, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", sent_text)
    # The stop-word set is cached; this function is called once per sentence
    # and per review, and reloading the corpus each time is wasted work.
    stops = _english_stopwords()
    return [w for w in letters_only.lower().split() if w not in stops]

In [None]:
def review_to_sent(raw_review):
    """Split every review in ``raw_review`` into sentences.

    Parameters
    ----------
    raw_review : iterable of str
        The review texts to split (e.g. a pandas Series of reviews).

    Returns
    -------
    list of str
        All non-empty sentences from all reviews, in input order.
    """
    sentences = []
    # Iterate over the argument; the previous version ignored it and always
    # read the global ``train['review']``.
    for review in raw_review:
        sentences.extend(block for block in sent_tokenize(review) if block)
    return sentences

In [None]:
# Split the labeled training reviews into a flat list of sentences.
sent_list = review_to_sent(train['review'])

In [None]:
# Total number of sentences extracted.
len(sent_list)

In [None]:
# Inspect the first sentence.
sent_list[0]

In [None]:
# Turn each sentence into its list of cleaned words (one list per sentence).
word_list = [sent_to_words(sentence) for sentence in sent_list]

In [None]:
# One entry per sentence, so this matches len(sent_list).
len(word_list)

In [None]:
# Inspect the cleaned words of the first sentence.
word_list[0]

In [None]:
# Word2Vec hyper-parameters.
num_feats = 300         # dimensionality of each word vector
min_word_count = 40     # a word must occur at least 40 times to be kept
num_workers = 4         # number of threads to run in parallel
context = 10            # context window size
downsampling = 0.001    # downsampling setting for frequent words

In [None]:
# Train Word2Vec on the tokenized sentences using the hyper-parameters above.
# `workers` is now passed explicitly: `num_workers` was defined but never
# handed to the model, so the library default was used instead.
model = word2vec.Word2Vec(
    sentences=word_list,
    size=num_feats,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

In [None]:
model.doesnt_match('man woman child kitchen'.split())

In [None]:
model.doesnt_match('banana apple grapes horse'.split())

In [None]:
model.doesnt_match('france england australia tree'.split())

In [None]:
model.doesnt_match("paris berlin london austria".split())

In [None]:
model.most_similar('man')

In [None]:
model.most_similar('good')

In [None]:
model.most_similar('bad')

In [None]:
# Create a vector representation for one tokenized text.
def feat_vec(sent, model, num_feats):
    """Average the word vectors of the in-vocabulary words of ``sent``.

    Parameters
    ----------
    sent : iterable of str
        Tokens of the text to embed.
    model : Word2Vec-like
        Object exposing ``model.wv.index2word`` (the vocabulary) and
        ``model.wv[word]`` (the vector of a word).
    num_feats : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_feats,)``: the mean of the vectors of
        the known words, or all zeros when no word of ``sent`` is in the
        vocabulary.
    """
    vocabulary = set(model.wv.index2word)
    known_words = [word for word in sent if word in vocabulary]

    averaged = np.zeros(shape=(num_feats,), dtype='float')
    if not known_words:
        # Nothing to average. Dividing by zero here used to fill the result
        # with NaN, which later breaks any estimator fed with these features.
        return averaged

    for word in known_words:
        averaged = np.add(averaged, model.wv[word])
    return np.divide(averaged, len(known_words))

In [None]:
# Create a vector representation for each review.
def get_avg_feat_vec(reviews, model, num_feats):
    """Build the matrix of averaged word vectors, one row per review.

    Parameters
    ----------
    reviews : sequence of str
        Raw review texts; each is cleaned with ``sent_to_words``.
    model : Word2Vec-like
        Trained model passed through to ``feat_vec``.
    num_feats : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(reviews), num_feats)``; row ``i`` is the
        averaged vector of the ``i``-th review.
    """
    review_feat_vec = np.zeros(shape=(len(reviews), num_feats), dtype='float')
    for i, review in enumerate(reviews):
        words = sent_to_words(review)
        # Row i belongs to review i. The earlier `i-1` index wrote review 0
        # into the last row and shifted every other review up by one, so the
        # features no longer lined up with their labels/ids.
        review_feat_vec[i] = feat_vec(words, model, num_feats)
    return review_feat_vec

In [None]:
# Averaged word-vector features for the labeled training reviews.
# NOTE(review): neither train_vecs nor test_vec is referenced again in the
# visible cells; the classifier below uses the bag-of-centroids features.
train_vecs = get_avg_feat_vec(train['review'], model, num_feats)

In [None]:
# Same features for the test reviews.
test_vec = get_avg_feat_vec(test['review'], model, num_feats)

In [None]:
from sklearn.cluster import KMeans
import time

In [None]:
# Cluster the learned word vectors with k-means and time the run.
start = time.time()

# One row per vocabulary word (model.wv.index2word holds the matching words).
word_vectors = model.wv.syn0
# Aim for roughly five words per cluster.
num_clusters = word_vectors.shape[0] // 5
Kmeans_clusters = KMeans(n_clusters=num_clusters)
# idx[i] is the cluster label assigned to the i-th row of word_vectors.
idx = Kmeans_clusters.fit_predict(word_vectors)

end = time.time()
elapsed = end - start

print('time elapsed is ', elapsed, 'seconds')

In [None]:
# Map every vocabulary word to the cluster label k-means assigned to it.
word_centroid_map = {
    word: cluster_label
    for word, cluster_label in zip(model.wv.index2word, idx)
}

In [None]:
# Print the member words of the first five clusters.
for cluster in range(5):
    print("\nCluster %d" % cluster)
    words = [
        word
        for word, label in word_centroid_map.items()
        if label == cluster
    ]
    print(words)

In [None]:
def create_bag_of_centroids(word_list, word_centroid_map):
    """Count how many words of ``word_list`` fall into each cluster.

    Parameters
    ----------
    word_list : iterable of str
        Tokens of one text.
    word_centroid_map : dict
        Maps a word to its integer cluster index.

    Returns
    -------
    numpy.ndarray
        float32 array with one slot per cluster index (0 up to the largest
        index in the map); slot ``k`` holds the number of tokens mapped to
        cluster ``k``. Tokens missing from the map are ignored.
    """
    cluster_count = 1 + max(word_centroid_map.values())
    histogram = np.zeros(cluster_count, dtype="float32")

    # Look up the cluster of every known token and bump its counter.
    known_clusters = (
        word_centroid_map[token]
        for token in word_list
        if token in word_centroid_map
    )
    for cluster_id in known_clusters:
        histogram[cluster_id] += 1

    return histogram

In [None]:
def create_reviews_bag_of_centroids(reviews, word_centroid_map):
    """Build the bag-of-centroids feature matrix, one row per review.

    Parameters
    ----------
    reviews : sequence of str
        Raw review texts.
    word_centroid_map : dict
        Maps a vocabulary word to its integer cluster index.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(reviews), num_centroids)``; row ``i`` holds
        the per-cluster word counts of the ``i``-th review.
    """
    num_centroids = max(word_centroid_map.values()) + 1

    reviews_bag_of_centroids = np.zeros(shape=(len(reviews), num_centroids))

    for i, review in enumerate(reviews):
        # Clean the review the same way the Word2Vec training text was
        # cleaned (HTML stripped, letters only, lower-cased, stop words
        # removed). A plain ``review.split()`` yields tokens such as
        # "Great" or "movie." that never match the vocabulary keys.
        words = sent_to_words(review)
        # Row i belongs to review i. The earlier `i-1` index put review 0 in
        # the last row and shifted the rest, misaligning rows with labels.
        reviews_bag_of_centroids[i] = create_bag_of_centroids(words, word_centroid_map)

    return reviews_bag_of_centroids

In [None]:
# Bag-of-centroids features for the labeled training reviews.
train_bag_of_centroids = create_reviews_bag_of_centroids(train['review'], word_centroid_map)

In [None]:
# Same features for the test reviews.
test_bag_of_centroids = create_reviews_bag_of_centroids(test['review'], word_centroid_map)

In [None]:
# Both matrices have one row per review and one column per cluster index.
print(train_bag_of_centroids.shape)
print(test_bag_of_centroids.shape)

In [None]:
import gc
# Force a garbage-collection pass before the classifier is fitted below.
gc.collect()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the training bag-of-centroids features
# against the sentiment labels, then predict a label for every test review.
forest = RandomForestClassifier(n_estimators=100).fit(
    train_bag_of_centroids,
    train["sentiment"],
)
predictions = forest.predict(test_bag_of_centroids)

In [None]:
# Pair each test id with its predicted sentiment and write the result to
# "Submission.csv" without the DataFrame index.
output = pd.DataFrame(
    data={
        "id": test["id"],
        "sentiment": predictions,
    }
)
output.to_csv("Submission.csv", index=False)