# Экспериментальная часть

In [33]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import string

import scipy.io as sio
from scipy.special import rel_entr, kl_div
from scipy.stats import entropy, ks_2samp
from scipy.spatial.distance import jensenshannon
from skmultiflow.drift_detection import DDM, PageHinkley, ADWIN
from skmultiflow.data import ConceptDriftStream
import datetime
from sklearn.datasets import load_svmlight_file
import matplotlib.pyplot as plt
import os

In [34]:
# Both vectorizers share one vocabulary configuration: word 1- to 3-grams,
# with document-frequency cut-offs of max_df=0.9 and min_df=10.
vectorizer_params = dict(max_df=0.9, min_df=10, ngram_range=(1, 3))
count_vect = CountVectorizer(**vectorizer_params)
tfidf_vect = TfidfVectorizer(**vectorizer_params)

# Stemmer and tokenizer used later to normalise the raw review text.
porter_stemmer = nltk.stem.PorterStemmer()
tok = count_vect.build_tokenizer()

# L1-regularised logistic regression used as the sentiment classifier.
clf = LogisticRegression(penalty='l1', C=2.8, solver='liblinear')

In [35]:
# Load the Amazon Electronics review CSV and keep a fixed-size random subsample.
# The sample is seeded so that the subsample -- and every figure derived from it
# further down -- is the same after a kernel restart.
SAMPLE_SIZE = 1000000
SAMPLE_SEED = 42
df_all = pd.read_csv('../Amazon_Electronics/Electronics_5.csv').sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
df_all.head(11)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
1530655,A1TR1KNR3VKPHY,B00AQUMZRA,OkinawanMatt,"[0, 0]",This product worked just as advertised! I was...,5,Just as described: Works.,1384646400,"11 17, 2013"
239735,A3JJ6NU21RHUIJ,B000IN09C8,Sue,"[0, 0]","This product has a Garmin logo on it, is very ...",5,Great product,1310774400,"07 16, 2011"
1409519,A2Y6QT6VLLQF51,B008MPVUK4,S,"[0, 0]",I guess we will have to see how long it lasts....,3,Its okay,1374364800,"07 21, 2013"
1389775,A3G3KM1YAYWO1N,B008EO263I,A. Uljamil,"[8, 8]","Being a gadget enthusiast, I buy new tablets a...",5,Wet-application screen protector.... done right!,1343865600,"08 2, 2012"
801380,AA5OOQ1C5T0H8,B003N12TSY,jackson,"[3, 3]",This external player works just great on a Mac...,4,Works For Me!,1304985600,"05 10, 2011"
742038,A38BADCH0JWDRR,B003B4VLJQ,Matthew,"[0, 0]",I have been using TV tuner cards for over 15 y...,5,Great card,1391385600,"02 3, 2014"
781856,A2OR94VN7XQCUO,B003IB51D4,"Edward Scott ""ewscott""","[0, 0]",My past experiences with lower priced I-Pod ty...,5,little pink dynamo!,1314921600,"09 2, 2011"
174866,A227BV4KDDDZHZ,B000BONJXU,salr,"[2, 2]",I got this MP3 player speciffically for audio ...,5,Great for audio books,1183161600,"06 30, 2007"
1341063,A3MG4K76YKZB1T,B007WTAJTO,"Technologies8 ""(Ceterum autem censeo Carthagi...","[0, 0]",Used for a digital camera card.Read speeds tes...,3,"Slow write on laptop, seems ok on camera",1385164800,"11 23, 2013"
943003,A29PS21AMQQB9C,B004G08OO4,Modern Paleo,"[8, 9]",I bought this item to see if I could save some...,1,Only usable to transfer from vinyl to digital ...,1352419200,"11 9, 2012"


In [36]:
# Translation table that deletes every ASCII punctuation character; built once
# here rather than once per review.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _normalize_review(text):
    """Return `text` as a space-joined string of lower-cased, stemmed tokens.

    The text is split with the vectorizer tokenizer `tok`, every token is stemmed
    with `porter_stemmer`, the stems are joined with single spaces, lower-cased,
    and any punctuation character left in the result is removed.
    """
    stems = (porter_stemmer.stem(token) for token in tok(text))
    return " ".join(stems).lower().translate(_PUNCT_TABLE)


# Parse the "MM DD, YYYY" strings and order the reviews chronologically.
df_all.reviewTime = pd.to_datetime(df_all.reviewTime, format="%m %d, %Y")
df_all = df_all.sort_values(by = 'reviewTime').reset_index(drop = True)
# Binary target: True when the overall rating is above 3.
df_all["bin_label"] = df_all.overall > 3
# Missing reviews are treated as empty text: casting NaN to str would otherwise
# turn each of them into the literal token "nan".
df_all["words"] = df_all.reviewText.fillna("").astype(str).map(_normalize_review)
df_all

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,bin_label,words
0,A7RV1KU5O0II9,B00000JFMK,"David S. Saunders ""Equal parts tech geek, bra...","[0, 0]",I love this player! I travel frequently and th...,5,Incredible screen!,931392000,1999-07-08,True,love thi player travel frequent and the l50 ke...
1,AJ6TMOHHFJJAJ,B00000JMO4,Ali Navaie,"[5, 5]",If you are looking for the best camcorder in t...,5,Simply the best in the market --- Get it while...,932688000,1999-07-23,True,if you are look for the best camcord in the ma...
2,A1JTSRG8SU4VFO,B00000JFE3,L. Goff,"[0, 0]",The &quot;lip-synch&quot; problem has shown up...,5,Own the Sony 550D model,932688000,1999-07-23,True,the quot lip synch quot problem ha shown up on...
3,A2XW3HYL9YSJGW,B00000J3Q7,"Steven P. Gross ""Good friend, cynical but tho...","[8, 9]",Since I was out of internal bays and I still n...,5,Works fine.... all the time.,935193600,1999-08-21,True,sinc wa out of intern bay and still need zip d...
4,A1JSO7PPEA0W72,B00000JI38,Bibliophile,"[19, 19]","Like the previous reviewer, I also purchased t...",5,Should be standard equipment with Nikon Coolip...,935280000,1999-08-22,True,like the previou review also purchas the kodak...
...,...,...,...,...,...,...,...,...,...,...,...
999995,A37G90KBW3CZXT,B00KHA2DQM,Cms3717,"[0, 0]","Great tablet, I love this thing. Excellent rep...",5,Best tablet I've owned.,1406073600,2014-07-23,True,great tablet love thi thing excel replac for m...
999996,A2R1HUYHXV7H18,B00GMTN96U,Kristi Gilleland,"[0, 0]",I've got several of these bluetooth speakers n...,4,"Punchy bass, small size, big sound",1406073600,2014-07-23,True,ve got sever of these bluetooth speaker now in...
999997,AY6S3QIFWFWQB,B00DYNQR3O,AmzonShopper92,"[0, 0]",Pros:-Unbelievable bass response (as long as t...,4,Happy camper,1406073600,2014-07-23,True,pro unbeliev bass respons as long as the ear t...
999998,A2PVKA64NGHTEG,B00I2ZBD1U,Brad Hanson,"[0, 0]",What a nice little gadget. It is a bit techie ...,5,Records over the air programs for later playback.,1406073600,2014-07-23,True,what nice littl gadget it is bit techi and it ...


In [37]:
# Partition the reviews by calendar year and report the size and class balance
# of each yearly slice.
review_years = df_all.reviewTime.dt.year
df_by_years = {}
for year in review_years.unique().tolist():
    yearly_reviews = df_all[review_years == year].reset_index(drop = True)
    df_by_years[year] = yearly_reviews
    print(year, len(yearly_reviews))
    print(yearly_reviews.bin_label.value_counts(), '\n')

1999 46
True     39
False     7
Name: bin_label, dtype: int64 

2000 490
True     415
False     75
Name: bin_label, dtype: int64 

2001 1002
True     800
False    202
Name: bin_label, dtype: int64 

2002 1354
True     1033
False     321
Name: bin_label, dtype: int64 

2003 2097
True     1572
False     525
Name: bin_label, dtype: int64 

2004 3083
True     2192
False     891
Name: bin_label, dtype: int64 

2005 5755
True     4192
False    1563
Name: bin_label, dtype: int64 

2006 9208
True     6848
False    2360
Name: bin_label, dtype: int64 

2007 21319
True     16820
False     4499
Name: bin_label, dtype: int64 

2008 29482
True     23331
False     6151
Name: bin_label, dtype: int64 

2009 41967
True     32943
False     9024
Name: bin_label, dtype: int64 

2010 61406
True     47527
False    13879
Name: bin_label, dtype: int64 

2011 102369
True     79648
False    22721
Name: bin_label, dtype: int64 

2012 167574
True     132825
False     34749
Name: bin_label, dtype: int64 

2013 3506

In [38]:
# Seeded so that the per-year train/test partitions are identical on every run.
SPLIT_SEED = 42
data_by_year_splitted = {}
for year in range(2007, 2015):
    # Draw 18000 training rows and 3000 test rows from this year's reviews.
    train_tmp, test_tmp = train_test_split(
        df_by_years[year], test_size = 3000, train_size = 18000, random_state = SPLIT_SEED
    )
    data_by_year_splitted[year] = {
        'train_data': train_tmp.words, 'train_labels': train_tmp.bin_label,
        'test_data': test_tmp.words, 'test_labels': test_tmp.bin_label,
    }
    print(year)

2007
2008
2009
2010
2011
2012
2013
2014


In [39]:
# Accuracy grid indexed by training year (rows) and test year (columns); it starts
# out all-NaN and is filled cell by cell. The explicit float dtype keeps every
# column numeric, instead of the object dtype an empty frame gets by default, so
# numeric-only operations (e.g. the Styler colour gradient) see the columns.
df_accuracy = pd.DataFrame(index=range(2007, 2015), columns=range(2007, 2015), dtype=float)
df_accuracy

Unnamed: 0,2007,2008,2009,2010,2011,2012,2013,2014
2007,,,,,,,,
2008,,,,,,,,
2009,,,,,,,,
2010,,,,,,,,
2011,,,,,,,,
2012,,,,,,,,
2013,,,,,,,,
2014,,,,,,,,


In [40]:
# For every training year, fit a fresh TF-IDF vocabulary and classifier on that
# year's training split, then score it on the test split of the same year and of
# each later year. Results go into df_accuracy[train year, test year].
MODEL_SEED = 42  # fixes the liblinear solver's data shuffling for reproducible fits
for year_train in range(2007, 2015):
    tfidf_vect = TfidfVectorizer(max_df = 0.9, min_df = 10, ngram_range=(1, 3))
    clf = LogisticRegression(solver='liblinear', C = 2.8, penalty = 'l1', random_state = MODEL_SEED)
    train_split = data_by_year_splitted[year_train]
    # fit_transform learns the vocabulary and vectorises the training text in one pass.
    clf.fit(tfidf_vect.fit_transform(train_split['train_data']), train_split['train_labels'])
    for year_test in range(year_train, 2015):
        print("Train in {0}, test in {1}".format(year_train, year_test))
        test_split = data_by_year_splitted[year_test]
        predicted_labels = clf.predict(tfidf_vect.transform(test_split['test_data']))
        # accuracy_score expects (y_true, y_pred) in that order.
        df_accuracy.at[year_train, year_test] = accuracy_score(test_split['test_labels'], predicted_labels)

Train in 2007, test in 2007
Train in 2007, test in 2008
Train in 2007, test in 2009
Train in 2007, test in 2010
Train in 2007, test in 2011
Train in 2007, test in 2012
Train in 2007, test in 2013
Train in 2007, test in 2014
Train in 2008, test in 2008
Train in 2008, test in 2009
Train in 2008, test in 2010
Train in 2008, test in 2011
Train in 2008, test in 2012
Train in 2008, test in 2013
Train in 2008, test in 2014
Train in 2009, test in 2009
Train in 2009, test in 2010
Train in 2009, test in 2011
Train in 2009, test in 2012
Train in 2009, test in 2013
Train in 2009, test in 2014
Train in 2010, test in 2010
Train in 2010, test in 2011
Train in 2010, test in 2012
Train in 2010, test in 2013
Train in 2010, test in 2014
Train in 2011, test in 2011
Train in 2011, test in 2012
Train in 2011, test in 2013
Train in 2011, test in 2014
Train in 2012, test in 2012
Train in 2012, test in 2013
Train in 2012, test in 2014
Train in 2013, test in 2013
Train in 2013, test in 2014
Train in 2014, test 

In [41]:
# Present the accuracy grid as percentages: a green gradient computed with
# axis = 1, white background for the unfilled (null) cells, and blank text
# in place of NaN.
accuracy_styler = df_accuracy.style
accuracy_styler = accuracy_styler.background_gradient(cmap='Greens', axis = 1)
accuracy_styler = accuracy_styler.highlight_null('white')
ur_style = accuracy_styler.format("{:.2%}", na_rep="")
ur_style

Unnamed: 0,2007,2008,2009,2010,2011,2012,2013,2014
2007,86.83%,88.53%,87.90%,86.60%,87.30%,87.30%,87.50%,88.47%
2008,,88.10%,87.97%,86.80%,87.27%,86.50%,87.97%,87.90%
2009,,,88.20%,87.00%,87.73%,86.97%,87.40%,87.97%
2010,,,,86.80%,87.87%,86.97%,87.83%,88.93%
2011,,,,,87.50%,87.53%,87.63%,88.50%
2012,,,,,,87.30%,87.30%,88.83%
2013,,,,,,,88.00%,88.10%
2014,,,,,,,,88.43%
