In [2]:
import pandas as pd
import numpy as np

import re
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.model_selection import cross_val_score

from bs4 import BeautifulSoup
from tqdm import tqdm

# Import data

In [3]:
train = pd.read_csv('labeledTrainData.tsv', delimiter="\t", header=0, quoting=3)
test = pd.read_csv('testData.tsv', header=0, delimiter="\t", quoting=3)
unsup = pd.read_csv('unlabeledTrainData.tsv', header=0, delimiter="\t", quoting=3)

In [4]:
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [5]:
train.shape, test.shape, unsup.shape

((25000, 3), (25000, 2), (50000, 2))

In [6]:
train.iloc[200]['review'], train.iloc[200]['sentiment']

('"It takes patience to get through David Lynch\'s eccentric, but-- for a change-- life-affirming chronicle of Alvin Straight\'s journey, but stick with it. Though it moves as slow as Straight\'s John Deere, when he meets the kind strangers along his pilgrimage we learn much about the isolation of aging, the painful regrets and secrets, and ultimately the power of family and reconciliation. Richard Farnsworth caps his career with the year\'s most genuine performance, sad and poetic, flinty and caring. And Sissy Spacek matches him as his \\"slow\\" daughter Rose who pines over her own private loss while caring for dad. Rarely has a modern film preached so positively about family."',
 1)

In [7]:
train.iloc[99]['review'], train.iloc[99]['sentiment']

('"I may not be a critic, but here is what I think of this movie. Well just watched the movie on cinemax and first of all I just have to say how much I hate the storyline I mean come on what does a snowman scare besides little kids, secondly it is pretty gory but I bet since the movie is so low budget they probably used ketchup so MY CRITICAL VOTE IS BOMB!!! nice try and the sequel will suck twice as much."',
 0)

# Preprocessing

In [8]:
# nltk.download('wordnet')
# nltk.download('punkt')

In [10]:
lmtzr = WordNetLemmatizer()

In [11]:
def text2wordlist(text):
    # replace negatives (don't -> do not)
    text = re.sub('n\'t', ' not ', text)
    text = re.sub('nt', ' not ', text)
    # only select words
    text = re.sub('[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    # lemmatize words
    words = [lmtzr.lemmatize(word, 'n') for word in words]
    words = [lmtzr.lemmatize(word, 'a') for word in words]
    words = [lmtzr.lemmatize(word, 'v') for word in words]
    # remove stop words from text 
    # save several stop words: negative texts (0) contains negative words (not, no, etc) 
    my_stopwords = [sw for sw in set(stopwords.words('english')) if sw not in ['not', 'no', 'very', 'but',
                                                                               'above', 'below']] 
    words = [lmtzr.lemmatize(word, 'v') for word in words if word not in my_stopwords]
    return words

text2wordlist("""
'"I may not be a critic, but here is what I think of this movie. Well just watched the movie on cinemax and 
first of all I just have to say how much I hate the storyline I mean come on what does a snowman scare besides
little kids, secondly it is pretty gory but I bet since the movie is so low budget they probably used ketchup 
so MY CRITICAL VOTE IS BOMB!!! nice try and the sequel will suck twice as much."'
""")

['may',
 'not',
 'critic',
 'but',
 'think',
 'movie',
 'well',
 'watch',
 'movie',
 'cinemax',
 'first',
 'say',
 'much',
 'hate',
 'storyline',
 'mean',
 'come',
 'doe',
 'snowman',
 'scare',
 'besides',
 'little',
 'kid',
 'secondly',
 'pretty',
 'gory',
 'but',
 'bet',
 'since',
 'movie',
 'low',
 'budget',
 'probably',
 'use',
 'ketchup',
 'critical',
 'vote',
 'bomb',
 'nice',
 'try',
 'sequel',
 'suck',
 'twice',
 'much']

In [12]:
def text2sentences(text):
    # remove html tags
    text = BeautifulSoup(text, 'lxml').get_text()
    # separate text to sentences
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(text.strip())
    # separate sentences to words
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(text2wordlist(raw_sentence))            
    return sentences

text2sentences("""
'"I may not be a critic, but here is what I think of this movie. Well just watched the movie on cinemax 
and first of all I just have to say how much I hate the storyline I mean come on what does a snowman scare
besides little kids, secondly it is pretty gory but I bet since the movie is so low budget they probably 
used ketchup so MY CRITICAL VOTE IS BOMB!!! nice try and the sequel will suck twice as much."'
""")

[['may', 'not', 'critic', 'but', 'think', 'movie'],
 ['well',
  'watch',
  'movie',
  'cinemax',
  'first',
  'say',
  'much',
  'hate',
  'storyline',
  'mean',
  'come',
  'doe',
  'snowman',
  'scare',
  'besides',
  'little',
  'kid',
  'secondly',
  'pretty',
  'gory',
  'but',
  'bet',
  'since',
  'movie',
  'low',
  'budget',
  'probably',
  'use',
  'ketchup',
  'critical',
  'vote',
  'bomb'],
 ['nice', 'try', 'sequel', 'suck', 'twice', 'much']]

# w2v model

In [13]:
%%time
sentences = []
for review in tqdm(train['review']):
    sentences += text2sentences(review)

100%|██████████| 25000/25000 [03:15<00:00, 127.80it/s]

CPU times: user 3min 12s, sys: 3.23 s, total: 3min 16s
Wall time: 3min 15s





In [14]:
%%time
for review in tqdm(test['review']):
    sentences += text2sentences(review)

100%|██████████| 25000/25000 [03:11<00:00, 130.28it/s]

CPU times: user 3min 9s, sys: 3.27 s, total: 3min 12s
Wall time: 3min 11s





In [15]:
%%time
for review in tqdm(unsup['review']):
    sentences += text2sentences(review)

100%|██████████| 50000/50000 [06:31<00:00, 127.68it/s]

CPU times: user 6min 25s, sys: 6.84 s, total: 6min 32s
Wall time: 6min 31s





In [17]:
len(sentences)

1059231

# try LDA

In [18]:
from gensim import corpora, models
dictionary = corpora.Dictionary(sentences)   # составляем словарь
corpus = [dictionary.doc2bow(text) for text in sentences]  # составляем корпус документов

In [19]:
%%time
np.random.seed(42)
ldamodel = models.LdaMulticore(corpus, workers=20, id2word=dictionary, num_topics=20)

CPU times: user 2min 28s, sys: 1min 24s, total: 3min 52s
Wall time: 3min 27s


In [22]:
top_ing_n = ldamodel.show_topics(num_topics=2, num_words=10, formatted=False)
top_ing_n

[(6,
  [('not', 0.05376204333741923),
   ('character', 0.014844934679216357),
   ('but', 0.009855161645466343),
   ('doe', 0.007530977342214983),
   ('one', 0.007269178027699474),
   ('like', 0.006551164832035609),
   ('actor', 0.006389247975043039),
   ('movie', 0.006326466319359115),
   ('film', 0.0062676844877654035),
   ('performance', 0.005405867063733255)]),
 (17,
  [('not', 0.03954013393021745),
   ('but', 0.03158350597648299),
   ('see', 0.031090422613056952),
   ('movie', 0.026006981974337436),
   ('wa', 0.018000050070880864),
   ('get', 0.013941662123378844),
   ('film', 0.010350212400966714),
   ('time', 0.008906628657249182),
   ('good', 0.008390950177483113),
   ('bad', 0.007210367678166351)])]

# .......................................... not rly

In [323]:
%%time

from gensim.models.word2vec import Word2Vec

num_features = 300 # размерность вектора каждого слова
min_word_count = 5 # минимальная частотность слова, чтобы она попала в модель
num_workers = 20 # кол-во ядер процессора
context = 10 # размер окна
downsampling = 2e-3 # probability
skip_gram = 1 # 0 or 1
# negative_sampling = 10 # If > 0: how many “noise words” should be drawn
# epochs = 3 # Number of iterations (epochs) over the corpus.

model = Word2Vec(sentences, 
                 workers=num_workers, 
                 size=num_features, 
                 min_count=min_word_count, 
                 window=context, 
                 sample=downsampling, 
                 sg=skip_gram, 
#                  negative=negative_sampling,
#                  iter=epochs
                )

CPU times: user 50min 11s, sys: 8.92 s, total: 50min 20s
Wall time: 1min 33s


In [324]:
model.init_sims(replace=True) # финализируем модель

In [325]:
model.most_similar('critical')

[('critically', 0.5451637506484985),
 ('flak', 0.5068663954734802),
 ('laudatory', 0.4998537600040436),
 ('undeserved', 0.4995395243167877),
 ('lenie', 0.4949934780597687),
 ('critic', 0.4895322918891907),
 ('reappraisal', 0.48356980085372925),
 ('rebuttal', 0.47935742139816284),
 ('unreasonably', 0.4787167012691498),
 ('unmercifully', 0.4784488081932068)]

In [326]:
model.most_similar('hate')

[('marmite', 0.569459080696106),
 ('despise', 0.5493627190589905),
 ('hater', 0.530159056186676),
 ('otakus', 0.5273692011833191),
 ('dislike', 0.5256993770599365),
 ('catdog', 0.515252947807312),
 ('sterotype', 0.5144835710525513),
 ('detest', 0.508788526058197),
 ('love', 0.5044335126876831),
 ('charactors', 0.5035682916641235)]

In [327]:
model.most_similar('great')

[('wonderful', 0.668364405632019),
 ('terrific', 0.6540890336036682),
 ('good', 0.618759036064148),
 ('awsome', 0.6133989691734314),
 ('excelle', 0.6120513677597046),
 ('outstanding', 0.6020387411117554),
 ('fine', 0.6007612943649292),
 ('excele', 0.5831233263015747),
 ('superb', 0.5823565721511841),
 ('incredible', 0.5810086727142334)]

In [328]:
def text2vec(words, model, size):
    text_vec = np.zeros((size,), dtype='float32')
    n_words = 0    
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            n_words += 1
            text_vec = np.add(text_vec, model[word])    
    if n_words != 0:
        text_vec /= n_words    
    return text_vec    

def texts2vecs(texts, model, size):
    texts_vecs = np.zeros((len(texts), size), dtype='float32')    
    for i, text in enumerate(texts):
        texts_vecs[i] = text2vec(text, model, size)        
    return texts_vecs

In [329]:
%%time
train_like_word_list = [sum(text2sentences(text), []) for text in train['review']]
train_vecs = texts2vecs(train_like_word_list, model, num_features)

CPU times: user 4min 53s, sys: 6.91 s, total: 5min
Wall time: 4min 57s


In [330]:
train_vecs.shape

(25000, 300)

In [331]:
%%time
test_like_word_list = [sum(text2sentences(text), []) for text in test['review']]
test_vecs = texts2vecs(test_like_word_list, model, num_features)

CPU times: user 4min 47s, sys: 2.65 s, total: 4min 50s
Wall time: 4min 49s


In [332]:
test_vecs.shape

(25000, 300)

In [333]:
# %%time
# from sklearn.ensemble import RandomForestClassifier

# forest = RandomForestClassifier(n_estimators=1000, n_jobs=20)
# forest = forest.fit(train_vecs, train['sentiment'])

# predict = forest.predict(test_vecs)

# # Copy the results to a pandas dataframe with an "id" column an a "sentiment" column
# output = pd.DataFrame(data={"id":test["id"], "sentiment":predict})

# # Use pandas to write the comma-separated output file
# output.to_csv("w2v_rf_1000_results.csv", index=False, quoting=3)   

In [334]:
xgb_params = {
    'max_depth': 10, 
    'n_estimators': 1000,
    'learning_rate': 0.1,   
    'silent': 1,
    'nthread': 15,
    'subsample': 0.95, 
    'colsample_bytree': 0.95,
    'colsample_bylevel': 1.0,
    'min_child_weight': 2.0, 
    'scale_pos_weight': 1.0,
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metrics':'accuracy', 
    'eta': 0.275,
    'alpha': 0.05,
    'gamma': 0.61,
    'seed': 27
    }

dtrain = xgb.DMatrix(train_vecs, label=train['sentiment'])

In [335]:
watchlist = [(dtrain, 'train')]
# num_boost_round: number of boosting iterations 
bst = xgb.train(xgb_params, dtrain, num_boost_round=150, evals=watchlist)

[0]	train-error:0.11428
[1]	train-error:0.0846
[2]	train-error:0.06644
[3]	train-error:0.05972
[4]	train-error:0.0536
[5]	train-error:0.05136
[6]	train-error:0.0456
[7]	train-error:0.04352
[8]	train-error:0.04032
[9]	train-error:0.03872
[10]	train-error:0.0356
[11]	train-error:0.0322
[12]	train-error:0.0306
[13]	train-error:0.02812
[14]	train-error:0.02632
[15]	train-error:0.02408
[16]	train-error:0.02144
[17]	train-error:0.01976
[18]	train-error:0.0184
[19]	train-error:0.01712
[20]	train-error:0.01632
[21]	train-error:0.01504
[22]	train-error:0.01444
[23]	train-error:0.01348
[24]	train-error:0.01236
[25]	train-error:0.01152
[26]	train-error:0.01064
[27]	train-error:0.00988
[28]	train-error:0.0092
[29]	train-error:0.00836
[30]	train-error:0.00748
[31]	train-error:0.00696
[32]	train-error:0.0062
[33]	train-error:0.00556
[34]	train-error:0.00524
[35]	train-error:0.0048
[36]	train-error:0.0044
[37]	train-error:0.00376
[38]	train-error:0.00368
[39]	train-error:0.00356
[40]	train-error:0.00

In [336]:
dtest = xgb.DMatrix(test_vecs)

In [337]:
ypred = bst.predict(dtest)

In [338]:
ypred

array([9.9884903e-01, 9.0808643e-04, 3.5563290e-01, ..., 6.6848427e-01,
       9.9912506e-01, 1.6750295e-01], dtype=float32)

In [339]:
# Copy the results to a pandas dataframe with an "id" column an a "sentiment" column
output = pd.DataFrame(data={"id":test["id"], "sentiment":ypred})
# Use pandas to write the comma-separated output file
output.to_csv("w2v_xgb_results_5.csv", index=False, quoting=3)   