In [3]:
import re
import time
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

from gensim.models import Word2Vec 
import nltk
from nltk.corpus import stopwords 



In [4]:
# Load the unlabeled reviews: tab-separated, column names on the first row.
# quoting=3 is the numeric value of csv.QUOTE_NONE, so quote characters are
# kept as ordinary text.
unlabeled_train = pd.read_csv(
    "unlabeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [10]:
def review_to_wordlist(review):
    """Turn one raw review (or sentence) into a list of lower-cased words.

    The text is stripped of HTML markup with BeautifulSoup, every character
    that is not an ASCII letter is replaced by a space, the result is
    lower-cased and split on whitespace, and NLTK's English stop words are
    dropped.

    Args:
        review: raw text, possibly containing HTML markup.

    Returns:
        list of str: the remaining words, in their original order.
    """
    # Name the parser explicitly. Without it bs4 picks whichever parser is
    # installed and warns on every call; the recorded cell outputs show that
    # warning recommending "lxml", so the same parser is pinned here.
    review_text = BeautifulSoup(review, "lxml").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    # Build the stop-word set once and keep it on the function object: this
    # function runs once per sentence, so rebuilding the set on every call is
    # pure overhead.
    stops = getattr(review_to_wordlist, "_stops", None)
    if stops is None:
        stops = frozenset(stopwords.words("english"))
        review_to_wordlist._stops = stops
    return [w for w in words if w not in stops]

In [11]:
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences, each returned as a list of words.

    Args:
        review: raw review text; surrounding whitespace is stripped first.
        tokenizer: object whose ``tokenize(text)`` returns the sentences of
            ``text`` (this notebook passes an NLTK punkt tokenizer).
        remove_stopwords: accepted but never read by this function.
            NOTE(review): review_to_wordlist takes no such flag, so the
            value has no effect here.

    Returns:
        list of list of str: one word list per non-empty sentence, produced
        by review_to_wordlist.
    """
    stripped_review = review.strip()
    # Empty sentences are skipped; every other sentence becomes a word list.
    return [
        review_to_wordlist(raw_sentence)
        for raw_sentence in tokenizer.tokenize(stripped_review)
        if len(raw_sentence) > 0
    ]

In [107]:
# Load the labeled training reviews: tab-separated, column names on the first
# row, quote characters kept as plain text (quoting=3 is csv.QUOTE_NONE).
train = pd.read_csv(
    "labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [13]:
from IPython.display import clear_output

In [14]:
%%time
sentences = [] 
print("Parsing sentences from training set")
i = 0
for i, review in enumerate(train["review"]):
    clear_output()
    print((i/len(train))*100, '%')
    sentences += review_to_sentences(review, tokenizer)
    i += 1

99.996 %
Wall time: 11min 11s


In [15]:
%%time
print("Parsing sentences from unlabeled set")
i = 0
for review in unlabeled_train["review"]:
    clear_output()
    print((i/len(unlabeled_train))*100, '%')
    sentences += review_to_sentences(review, tokenizer)
    i += 1

99.998 %
Wall time: 21min 9s


In [33]:
print(len(sentences))

795538


In [34]:
print(sentences[0])


['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker']


In [36]:
# Word2Vec settings for the first training run.
num_features = 300     # word vector dimensionality
min_word_count = 40    # minimum word count
num_workers = 8        # number of threads to run in parallel
context = 10           # context window size
downsampling = 1e-3    # downsample setting for frequent words

In [37]:
%%time
# Initialize and train the model (this will take some time)
print("Training model...")
model = Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = 0.0001)


Training model...
Wall time: 55 s


In [38]:
model.init_sims(replace=True)

In [39]:
model_name = "300features_40minwords_10context"
model.save(model_name)

In [43]:
model.doesnt_match("man woman child foo bar kitchen".split())

'child'

In [44]:
model.most_similar("man")

[('woman', 0.712565004825592),
 ('monk', 0.5986726880073547),
 ('servant', 0.5845864415168762),
 ('pursuing', 0.5684902667999268),
 ('wheelchair', 0.5665715336799622),
 ('businessman', 0.5628113746643066),
 ('crippled', 0.5597537159919739),
 ('widow', 0.5597490668296814),
 ('lad', 0.5580651760101318),
 ('tall', 0.5573580265045166)]

In [67]:
model.most_similar("dragon")

[('crouching', 0.8941984176635742),
 ('tiger', 0.8735963106155396),
 ('sword', 0.8582170009613037),
 ('ninja', 0.8013341426849365),
 ('jet', 0.795806884765625),
 ('swords', 0.7900489568710327),
 ('chun', 0.7713428735733032),
 ('iron', 0.7652692794799805),
 ('fist', 0.7584738731384277),
 ('warriors', 0.757693886756897)]

In [68]:
model.most_similar("star")

[('stars', 0.7458279132843018),
 ('studded', 0.6530028581619263),
 ('stardom', 0.6027533411979675),
 ('trek', 0.565423309803009),
 ('guest', 0.5487549304962158),
 ('billed', 0.5418476462364197),
 ('fame', 0.5346068739891052),
 ('superstar', 0.5146893262863159),
 ('vehicle', 0.5116539001464844),
 ('hudson', 0.510845422744751)]

In [69]:
model.most_similar("jedi")

[('sith', 0.8024415969848633),
 ('anakin', 0.7901677489280701),
 ('ewoks', 0.7738562226295471),
 ('skywalker', 0.772915780544281),
 ('vader', 0.7552484273910522),
 ('saber', 0.7525895237922668),
 ('jabba', 0.7419978976249695),
 ('galaxy', 0.7411889433860779),
 ('darth', 0.7325034141540527),
 ('terminator', 0.7224287390708923)]

In [70]:
model.most_similar("harry")

[('callahan', 0.7826710939407349),
 ('potter', 0.7791247963905334),
 ('enforcer', 0.7310588955879211),
 ('lime', 0.7012282609939575),
 ('marv', 0.6921302080154419),
 ('stanton', 0.6798675060272217),
 ('clint', 0.651884913444519),
 ('davenport', 0.6371444463729858),
 ('eastwood', 0.6307017803192139),
 ('dirty', 0.6061629056930542)]

In [74]:
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one document.

    Args:
        words: iterable of tokens making up the document.
        model: trained model exposing ``model.wv.index2word`` (vocabulary
            words) and ``model.wv[word]`` (vector lookup).
        num_features: length of each word vector.

    Returns:
        numpy array of shape ``(num_features,)``: the mean of the vectors of
        the words found in the vocabulary, or an all-zero vector when none of
        the words is in the vocabulary.
    """
    # Accumulator for the sum of word vectors, starting from zeros.
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    # A set gives constant-time membership tests against the vocabulary.
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            nwords += 1.
            # Look the vector up through `model.wv`, the same accessor already
            # used for the vocabulary, instead of deprecated `model[word]`.
            featureVec = np.add(featureVec, model.wv[word])
    # Dividing by zero would turn the whole vector into NaN (and break the
    # classifier downstream), so an empty match keeps the zero vector.
    if nwords == 0:
        return featureVec
    return np.divide(featureVec, nwords)



In [80]:

def getAvgFeatureVecs(reviews, model, num_features):
    """Build a 2D array holding one averaged feature vector per review.

    Args:
        reviews: sized, iterable collection of documents (each a word list).
        model: model forwarded to makeFeatureVec.
        num_features: length of each feature vector.

    Returns:
        float32 numpy array of shape ``(len(reviews), num_features)``; row
        ``k`` is ``makeFeatureVec(reviews[k], model, num_features)``.
    """
    total = len(reviews)
    # Pre-allocate the result matrix instead of growing it row by row.
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")
    for row, review in enumerate(reviews):
        # Print a status message every 5000th review.
        if row % 5000 == 0:
            print("Review %d of %d" % (row, total))
        reviewFeatureVecs[row] = makeFeatureVec(review, model, num_features)
    return reviewFeatureVecs

In [64]:
%%time
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append( review_to_wordlist( review))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Wall time: 55.9 s


In [81]:
%time 
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features )

Wall time: 0 ns
Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000


In [73]:
model.wv.vocab.keys()



In [84]:
from sklearn.model_selection import GridSearchCV

# Candidate forest sizes: 180, 200 and 220 trees.
params_rf = {'n_estimators': list(range(180, 240, 20))}

# 5-fold cross-validated search over the forest size.
RF_grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=params_rf,
    n_jobs=-1,
    cv=5,
)
print("Fitting a random forest to labeled training data...")
# NOTE(review): `forest` is bound to whatever `RF_grid.fit` returns; later
# cells read the chosen model from `RF_grid.best_estimator_` instead.
forest = RF_grid.fit(trainDataVecs, train["sentiment"])
print('\n', RF_grid.best_estimator_)




Fitting a random forest to labeled training data...

 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=220, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


In [93]:
# Load the test reviews: tab-separated, column names on the first row, quote
# characters kept as plain text (quoting=3 is csv.QUOTE_NONE).
test = pd.read_csv(
    "testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)
print(test.shape)

# Number of reviews to score.
num_reviews = len(test["review"])
print(num_reviews)

(25000, 2)
25000


In [94]:
%%time
print("Creating average feature vecs for test reviews")
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append( review_to_wordlist( review ))

Creating average feature vecs for test reviews




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Wall time: 57.2 s


In [95]:
%time 
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Wall time: 0 ns
Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000


In [96]:
# Predict sentiment for the test vectors with the best forest found by the
# grid search, then write the (id, sentiment) pairs as the submission file.
best_forest = RF_grid.best_estimator_
result = best_forest.predict(testDataVecs)
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv(
    "Word2Vec_AverageVectors.csv",
    index=False,
    quoting=3,
)

In [97]:
# Kaggle score for this submission: 0.84696

In [16]:
# Second experiment: try a different set of Word2Vec parameters.
num_features = 750     # word vector dimensionality
min_word_count = 20    # minimum word count
num_workers = 8        # number of threads to run in parallel
context = 6            # context window size
downsampling = 1e-3    # downsample setting for frequent words

In [17]:
%%time
print("Training model...")
model = Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)


Training model...
Wall time: 2min


In [53]:
from collections import Counter
import math

def compute_tfidf(corpus):
    """Compute TF-IDF weights for every document of a tokenised corpus.

    For each document, term frequency is ``count / len(document)`` and inverse
    document frequency is ``log10(len(corpus) / documents containing the word)``.

    Args:
        corpus: sized, re-iterable collection of documents, each a sequence
            of hashable tokens.

    Returns:
        list of dict: one ``{token: tf * idf}`` dictionary per document, in
        corpus order.
    """
    n_docs = len(corpus)

    # Count once, up front, in how many documents each token appears. The
    # previous version rescanned the whole corpus for every token of every
    # document, which is quadratic in the corpus size.
    doc_freq = Counter()
    for text in corpus:
        doc_freq.update(set(text))

    def compute_tf(text):
        # Relative frequency of each token inside one document.
        total = float(len(text))
        return {word: count / total for word, count in Counter(text).items()}

    def compute_idf(word):
        # Every word looked up here comes from a corpus document, so its
        # document frequency is at least 1 and the division is safe.
        return math.log10(n_docs / float(doc_freq[word]))

    documents_list = []
    for i, text in enumerate(corpus):
        print("Review %d of %d" % (i, n_docs))
        documents_list.append(
            {word: tf * compute_idf(word) for word, tf in compute_tf(text).items()}
        )
    return documents_list

In [18]:
import collections

def compute_tf(text):
    """Return the relative frequency of every token in ``text``.

    Args:
        text: sized sequence of hashable tokens.

    Returns:
        collections.Counter mapping each distinct token to
        ``occurrences / len(text)``; empty for an empty ``text``.
    """
    occurrences = collections.Counter(text)
    total = float(len(text))
    return collections.Counter(
        {token: count / total for token, count in occurrences.items()}
    )

In [19]:
import math

def compute_idf(word, corpus):
    """Return ``log10(len(corpus) / number of documents containing word)``.

    Args:
        word: token to look up.
        corpus: sized collection of documents supporting ``word in document``.

    Returns:
        float: the inverse document frequency of ``word``.

    Raises:
        ZeroDivisionError: if no document in ``corpus`` contains ``word``.
    """
    containing = 0
    for document in corpus:
        if word in document:
            containing += 1.0
    return math.log10(len(corpus) / containing)

In [55]:
def makeFeatureVec(words, model, num_features, reviews):
    """Average the word vectors of the in-vocabulary words of one document.

    Args:
        words: iterable of tokens making up the document.
        model: trained model exposing ``model.wv.index2word`` and
            ``model.wv[word]``.
        num_features: length of each word vector.
        reviews: accepted for signature compatibility; not used here.

    Returns:
        numpy array of shape ``(num_features,)``: the mean of the vectors of
        the words found in the vocabulary, or an all-zero vector when none of
        the words is in the vocabulary.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    # A set gives constant-time membership tests against the vocabulary.
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            nwords += 1.
            # Use the `model.wv` accessor rather than deprecated `model[word]`.
            featureVec = np.add(featureVec, model.wv[word])
    # Avoid 0/0: a document with no known word keeps the zero vector instead
    # of becoming all-NaN.
    if nwords == 0:
        return featureVec
    return np.divide(featureVec, nwords)



In [56]:
def getAvgFeatureVecs(reviews, model, num_features):
    """Build a 2D array holding one averaged feature vector per review.

    Args:
        reviews: sized, iterable collection of documents (each a word list);
            also forwarded unchanged to makeFeatureVec as its fourth argument.
        model: model forwarded to makeFeatureVec.
        num_features: length of each feature vector.

    Returns:
        float32 numpy array of shape ``(len(reviews), num_features)``.
    """
    n_reviews = len(reviews)
    reviewFeatureVecs = np.zeros((n_reviews, num_features), dtype="float32")
    for idx, review in enumerate(reviews):
        # Progress message every 500th review.
        if idx % 500 == 0:
            print("Review %d of %d" % (idx, n_reviews))
        reviewFeatureVecs[idx] = makeFeatureVec(review, model, num_features, reviews)
    return reviewFeatureVecs

In [24]:
%%time
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append( review_to_wordlist( review))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Wall time: 56.9 s


In [59]:
%time 
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features )

Wall time: 0 ns
Review 0 of 25000
Review 500 of 25000
Review 1000 of 25000
Review 1500 of 25000
Review 2000 of 25000
Review 2500 of 25000
Review 3000 of 25000
Review 3500 of 25000
Review 4000 of 25000
Review 4500 of 25000
Review 5000 of 25000
Review 5500 of 25000
Review 6000 of 25000
Review 6500 of 25000
Review 7000 of 25000
Review 7500 of 25000
Review 8000 of 25000
Review 8500 of 25000
Review 9000 of 25000
Review 9500 of 25000
Review 10000 of 25000
Review 10500 of 25000
Review 11000 of 25000
Review 11500 of 25000
Review 12000 of 25000
Review 12500 of 25000
Review 13000 of 25000
Review 13500 of 25000
Review 14000 of 25000
Review 14500 of 25000
Review 15000 of 25000
Review 15500 of 25000
Review 16000 of 25000
Review 16500 of 25000
Review 17000 of 25000
Review 17500 of 25000
Review 18000 of 25000
Review 18500 of 25000
Review 19000 of 25000
Review 19500 of 25000
Review 20000 of 25000
Review 20500 of 25000
Review 21000 of 25000
Review 21500 of 25000
Review 22000 of 25000
Review 22500 of 25

In [60]:
from sklearn.model_selection import GridSearchCV

# Candidate forest sizes: 160, 180, 200, 220 and 240 trees.
params_rf = {'n_estimators': list(range(160, 250, 20))}

# 5-fold cross-validated search over the forest size.
RF_grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=params_rf,
    n_jobs=-1,
    cv=5,
)
print("Fitting a random forest to labeled training data...")
# NOTE(review): `forest` is bound to whatever `RF_grid.fit` returns; later
# cells read the chosen model from `RF_grid.best_estimator_` instead.
forest = RF_grid.fit(trainDataVecs, train["sentiment"])
print('\n', RF_grid.best_estimator_)




Fitting a random forest to labeled training data...

 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=220, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


In [178]:
# Reload the test reviews: tab-separated, column names on the first row,
# quote characters kept as plain text (quoting=3 is csv.QUOTE_NONE).
test = pd.read_csv(
    "testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)


In [63]:
# Clean every test review into a list of words.
clean_test_reviews = [
    review_to_wordlist(raw_review) for raw_review in test["review"]
]



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [64]:
%time 
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Wall time: 0 ns
Review 0 of 25000
Review 500 of 25000
Review 1000 of 25000
Review 1500 of 25000
Review 2000 of 25000
Review 2500 of 25000
Review 3000 of 25000
Review 3500 of 25000
Review 4000 of 25000
Review 4500 of 25000
Review 5000 of 25000
Review 5500 of 25000
Review 6000 of 25000
Review 6500 of 25000
Review 7000 of 25000
Review 7500 of 25000
Review 8000 of 25000
Review 8500 of 25000
Review 9000 of 25000
Review 9500 of 25000
Review 10000 of 25000
Review 10500 of 25000
Review 11000 of 25000
Review 11500 of 25000
Review 12000 of 25000
Review 12500 of 25000
Review 13000 of 25000
Review 13500 of 25000
Review 14000 of 25000
Review 14500 of 25000
Review 15000 of 25000
Review 15500 of 25000
Review 16000 of 25000
Review 16500 of 25000
Review 17000 of 25000
Review 17500 of 25000
Review 18000 of 25000
Review 18500 of 25000
Review 19000 of 25000
Review 19500 of 25000
Review 20000 of 25000
Review 20500 of 25000
Review 21000 of 25000
Review 21500 of 25000
Review 22000 of 25000
Review 22500 of 25

In [66]:
# Predict sentiment for the test vectors with the best forest found by the
# grid search, then write the (id, sentiment) pairs as the submission file.
best_forest = RF_grid.best_estimator_
result = best_forest.predict(testDataVecs)
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv(
    "Word2Vec_AverageVectors_750_features.csv",
    index=False,
    quoting=3,
)

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

In [78]:
tfidf = TfidfVectorizer()


In [146]:
# TF-IDF over the raw labeled reviews; the vocabulary is capped at the current
# value of `num_features`.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    max_features=num_features,
    min_df=2,
    stop_words='english',
    use_idf=True,
)
# NOTE: `X` holds the return value of `fit`; the following cells read its
# `vocabulary_` and `idf_` attributes, so it is not a document-term matrix.
X = vectorizer.fit(train['review'])


In [144]:
train.shape

(25000, 3)

In [152]:
X.vocabulary_.items()

dict_items([('stuff', 628), ('going', 271), ('moment', 423), ('ve', 694), ('started', 617), ('music', 432), ('watching', 713), ('documentary', 158), ('watched', 712), ('maybe', 405), ('just', 339), ('want', 705), ('certain', 86), ('guy', 280), ('thought', 658), ('really', 523), ('cool', 115), ('make', 396), ('mind', 418), ('feature', 224), ('remember', 530), ('cinema', 98), ('released', 529), ('feeling', 227), ('obvious', 449), ('message', 415), ('bad', 41), ('course', 120), ('michael', 416), ('unless', 687), ('like', 373), ('hate', 291), ('boring', 62), ('making', 398), ('fans', 217), ('say', 553), ('true', 672), ('nice', 443), ('actual', 12), ('bit', 57), ('finally', 238), ('starts', 618), ('20', 1), ('minutes', 420), ('sequence', 573), ('joe', 335), ('powerful', 501), ('wants', 707), ('dead', 134), ('character', 90), ('wanted', 706), ('people', 471), ('know', 351), ('lots', 390), ('things', 655), ('car', 80), ('director', 153), ('came', 78), ('usually', 692), ('directors', 154), ('w

In [150]:
max_idf = max(X.idf_)
max_idf

5.4918814998811092

In [155]:
X.idf_[1]

4.6620827993603982

In [156]:
# Map every vocabulary word to its IDF weight; a word missing from the mapping
# falls back to `max_idf` when it is looked up.
idf_by_word = {word: X.idf_[column] for word, column in X.vocabulary_.items()}
word2weight = defaultdict(lambda: max_idf, idf_by_word)


In [163]:
# Add up every weight currently stored in word2weight.
s = 0
for weight in word2weight.values():
    s += weight
    

In [165]:
avg_weight = s/len(word2weight)

In [166]:
avg_weight

4.1466395047671964

In [130]:
# `X` is what `vectorizer.fit(...)` returned (the cells above read its
# `vocabulary_` and `idf_`), so it has no `toarray()` and the old
# `X = X.toarray()` fails when the notebook is run top to bottom. Transform
# the reviews first and densify that matrix under its own name, leaving `X`
# intact for the cells that rely on it.
tfidf_matrix = vectorizer.transform(train['review']).toarray()

In [137]:
# Third experiment: try a different set of Word2Vec parameters.
num_features = 750     # word vector dimensionality
min_word_count = 40    # minimum word count
num_workers = 8        # number of threads to run in parallel
context = 10           # context window size
downsampling = 1e-3    # downsample setting for frequent words

In [138]:
%%time
print("Training model...")
model = Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)


Training model...
Wall time: 2min 2s


In [167]:
def makeFeatureVec(words, model, num_features, reviews):
    """Average the vectors of in-vocabulary words whose weight is above average.

    A word contributes only if it is in the model vocabulary and its entry in
    the module-level ``word2weight`` mapping is greater than the module-level
    ``avg_weight``.

    Args:
        words: iterable of tokens making up the document.
        model: trained model exposing ``model.wv.index2word`` and
            ``model.wv[word]``.
        num_features: length of each word vector.
        reviews: accepted for signature compatibility; not used here.

    Returns:
        numpy array of shape ``(num_features,)``: the mean of the selected
        word vectors, or an all-zero vector when no word is selected.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    # A set gives constant-time membership tests against the vocabulary.
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set and word2weight[word] > avg_weight:
            nwords += 1.
            # Use the `model.wv` accessor rather than deprecated `model[word]`.
            featureVec = np.add(featureVec, model.wv[word])
    # The weight filter makes it easy for a document to select no word at
    # all; keep the zero vector then instead of dividing 0 by 0 into NaN.
    if nwords == 0:
        return featureVec
    return np.divide(featureVec, nwords)



In [168]:
def getAvgFeatureVecs(reviews, model, num_features):
    """Stack the feature vector of every review into one 2D array.

    Args:
        reviews: sized, iterable collection of documents (each a word list);
            also forwarded unchanged to makeFeatureVec as its fourth argument.
        model: model forwarded to makeFeatureVec.
        num_features: length of each feature vector.

    Returns:
        float32 numpy array of shape ``(len(reviews), num_features)``.
    """
    review_count = len(reviews)
    reviewFeatureVecs = np.zeros((review_count, num_features), dtype="float32")
    for position, review in enumerate(reviews):
        # Progress message every 500th review.
        if position % 500 == 0:
            print("Review %d of %d" % (position, review_count))
        reviewFeatureVecs[position] = makeFeatureVec(
            review, model, num_features, reviews
        )
    return reviewFeatureVecs

In [169]:
%time 
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features )

Wall time: 0 ns
Review 0 of 25000
Review 500 of 25000
Review 1000 of 25000
Review 1500 of 25000
Review 2000 of 25000
Review 2500 of 25000
Review 3000 of 25000
Review 3500 of 25000
Review 4000 of 25000
Review 4500 of 25000
Review 5000 of 25000
Review 5500 of 25000
Review 6000 of 25000
Review 6500 of 25000
Review 7000 of 25000
Review 7500 of 25000
Review 8000 of 25000
Review 8500 of 25000
Review 9000 of 25000
Review 9500 of 25000
Review 10000 of 25000
Review 10500 of 25000
Review 11000 of 25000
Review 11500 of 25000
Review 12000 of 25000
Review 12500 of 25000
Review 13000 of 25000
Review 13500 of 25000
Review 14000 of 25000
Review 14500 of 25000
Review 15000 of 25000
Review 15500 of 25000
Review 16000 of 25000
Review 16500 of 25000
Review 17000 of 25000
Review 17500 of 25000
Review 18000 of 25000
Review 18500 of 25000
Review 19000 of 25000
Review 19500 of 25000
Review 20000 of 25000
Review 20500 of 25000
Review 21000 of 25000
Review 21500 of 25000
Review 22000 of 25000
Review 22500 of 25

AttributeError: 'numpy.ndarray' object has no attribute 'vocabulary_'

In [170]:

# Random forest with 220 trees (the size the earlier grid searches printed as
# best), fitted on the averaged training vectors.
forest = RandomForestClassifier(
    n_estimators=220,
    n_jobs=-1,
)
print("Fitting a random forest to labeled training data...")
forest = forest.fit(trainDataVecs, train["sentiment"])


Fitting a random forest to labeled training data...

 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=220, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


In [171]:
%time 
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Wall time: 0 ns
Review 0 of 25000
Review 500 of 25000
Review 1000 of 25000
Review 1500 of 25000
Review 2000 of 25000
Review 2500 of 25000
Review 3000 of 25000
Review 3500 of 25000
Review 4000 of 25000
Review 4500 of 25000
Review 5000 of 25000
Review 5500 of 25000
Review 6000 of 25000
Review 6500 of 25000
Review 7000 of 25000
Review 7500 of 25000
Review 8000 of 25000
Review 8500 of 25000
Review 9000 of 25000
Review 9500 of 25000
Review 10000 of 25000
Review 10500 of 25000
Review 11000 of 25000
Review 11500 of 25000
Review 12000 of 25000
Review 12500 of 25000
Review 13000 of 25000
Review 13500 of 25000
Review 14000 of 25000
Review 14500 of 25000
Review 15000 of 25000
Review 15500 of 25000
Review 16000 of 25000
Review 16500 of 25000
Review 17000 of 25000
Review 17500 of 25000
Review 18000 of 25000
Review 18500 of 25000
Review 19000 of 25000
Review 19500 of 25000
Review 20000 of 25000
Review 20500 of 25000
Review 21000 of 25000
Review 21500 of 25000
Review 22000 of 25000
Review 22500 of 25

In [174]:
result = forest.predict( testDataVecs )

In [180]:
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

In [181]:
output.to_csv( "Word2Vec_AverageVectors_with_tf-idf.csv", index=False, quoting=3 )

In [182]:
# Kaggle score for this submission: 0.81392
