In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from time import time
import pandas as pd
import numpy

In [2]:
# Data preparation: read the training and test review tables from CSV.
data = pd.read_csv("/Users/fabbas1/Google Drive/study/Phd/Machine Learning/assignment/ITCS6156_SLProject/AmazonReviews/amazon_baby_train_clean.csv")
data_test = pd.read_csv("/Users/fabbas1/Google Drive/study/Phd/Machine Learning/assignment/ITCS6156_SLProject/AmazonReviews/amazon_baby_test_clean.csv")

# Remove, in place and from both tables, the columns the rest of the
# notebook never reads. A missing column raises KeyError.
unused_columns = ['Unnamed: 0', 'name_processed', 'review_processed']
for frame in (data, data_test):
    for column in unused_columns:
        del frame[column]


In [16]:
# Sweep CountVectorizer's min_df threshold and print, for each value, the
# shape of the resulting training matrix (rows x vocabulary size).
min_df_values = list(range(50, 501, 50))

# The unicode cast does not depend on min_df, so it is done once up front.
train_documents = data.merged.values.astype('U')

for min_df_value in min_df_values:
    count_vect = CountVectorizer(min_df=min_df_value)
    x_train = count_vect.fit_transform(train_documents)
    print(min_df_value, " " , x_train.shape )

50   (145927, 5328)
100   (145927, 3856)
150   (145927, 3149)
200   (145927, 2727)
250   (145927, 2443)
300   (145927, 2216)
350   (145927, 2050)
400   (145927, 1908)
450   (145927, 1781)
500   (145927, 1662)


In [17]:
# Build the bag-of-words features used by every model below.
#
# min_df is pinned here instead of being read from the loop variable left
# behind by the min_df sweep, so this cell no longer depends on that
# exploratory cell having been executed first. 500 is the value the sweep
# ends on, i.e. the value this cell was effectively using.
MIN_DF = 500
count_vect = CountVectorizer(min_df=MIN_DF)

# The vocabulary is learned from the training documents only; the test
# documents are merely projected onto it. Both splits go through the same
# unicode cast so that a missing entry becomes the string 'nan' rather than
# a float NaN, which CountVectorizer refuses to process.
x_train = count_vect.fit_transform(data.merged.values.astype('U'))
x_test = count_vect.transform(data_test["merged"].values.astype('U'))

In [18]:
# Targets: the 'rating' column of the training and test tables.
y_train, y_test = data['rating'], data_test['rating']

In [19]:
# Feature scaling. with_mean=False turns centering off, so each feature is
# only divided by its standard deviation: the result has unit variance but
# is not shifted to zero mean and is not confined to a fixed interval.
# The scaling statistics come from the training matrix alone.
scaler = StandardScaler(with_mean=False).fit(x_train)

# Apply the fitted scaler to both splits.
x_train_normalize, x_test_normalize = (
    scaler.transform(matrix) for matrix in (x_train, x_test)
)



In [20]:
# Baseline network: L-BFGS solver, hidden_layer_sizes=(2, 2), fixed seed.
baseline_settings = dict(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(2, 2),
    random_state=1,
)
clf = MLPClassifier(**baseline_settings)

In [21]:
# Fit the baseline classifier first on the raw counts, then on the scaled
# counts, and print the training-set accuracy and wall-clock time for each.
# The elapsed time is read after score() has returned, so it covers both
# fitting and scoring.
train_labels = y_train.values.ravel()
runs = (
    ("Train - Without normalizing ", x_train),
    ("Train - With normalizing ", x_train_normalize),
)
for caption, features in runs:
    start = time()
    clf.fit(features, train_labels)
    print( caption , clf.score(features, train_labels) , " in " , time() - start , " seconds")

Train - Without normalizing  0.675495281888  in  33.304543018341064  seconds
Train - With normalizing  0.673857476684  in  26.0003981590271  seconds


In [24]:
# Compare the hidden-layer activation functions. For each one, a fresh
# classifier is fitted on the raw counts and then on the scaled counts;
# training-set accuracy and elapsed time (fit plus score) are printed.
activation_functions = ['identity', 'logistic', 'tanh', 'relu']
train_labels = y_train.values.ravel()
feature_sets = (
    ("Without normalizing ", x_train),
    ("With normalizing ", x_train_normalize),
)
for activation_function in activation_functions:
    clf = MLPClassifier(
        solver='lbfgs',
        alpha=1e-5,
        activation=activation_function,
        hidden_layer_sizes=(2, 2),
        random_state=1,
    )
    print(activation_function)
    for caption, features in feature_sets:
        start = time()
        clf.fit(features, train_labels)
        print( caption , clf.score(features, train_labels) , " in " , time() - start , " seconds")

identity
Without normalizing  0.675687158648  in  28.822536945343018  seconds
With normalizing  0.675577514785  in  21.407979011535645  seconds
logistic
Without normalizing  0.677640189958  in  34.896371841430664  seconds
With normalizing  0.643547801298  in  29.43972110748291  seconds
tanh
Without normalizing  0.677044001453  in  39.13399600982666  seconds
With normalizing  0.676811008244  in  31.853296995162964  seconds
relu
Without normalizing  0.675495281888  in  37.5686571598053  seconds
With normalizing  0.673857476684  in  28.494165897369385  seconds


In [25]:
# Compare the weight optimizers ('sgd', 'lbfgs', 'adam') on the scaled
# features with the identity activation. Training-set accuracy and elapsed
# time (fit plus score) are printed for each solver.
weight_optimizer = ['sgd', 'lbfgs', 'adam' ]
train_labels = y_train.values.ravel()
for solver in weight_optimizer:
    clf = MLPClassifier(
        solver=solver,
        alpha=1e-5,
        activation='identity',
        hidden_layer_sizes=(2, 2),
        random_state=1,
    )
    print(solver)
    start = time()
    clf.fit(x_train_normalize, train_labels)
    train_accuracy = clf.score(x_train_normalize, train_labels)
    elapsed = time() - start
    print( train_accuracy , " in " , elapsed , " seconds" )

sgd
0.674905946124  in  24.80126714706421  seconds
lbfgs
0.675577514785  in  23.99100112915039  seconds
adam
0.675070411918  in  50.410775899887085  seconds


In [27]:
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Cross-validate a single hidden layer of width 1..7 on the scaled training
# features. Each candidate is scored on 5 shuffled splits that hold out 30%
# of the rows; the mean score and the elapsed time are printed per width.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
train_labels = y_train.values.ravel()
for i in range(1,8):
    clf = MLPClassifier(
        solver='lbfgs',
        alpha=1e-5,
        activation='identity',
        hidden_layer_sizes=(i,),
        random_state=1,
    )
    start = time()
    scores = cross_val_score(clf, x_train_normalize, train_labels, cv=cv)
    mean_score = scores.mean()
    print(str(i) , " " , mean_score , " " , time() - start)

1   0.62987277005   15.924906969070435
2   0.66621896343   41.56416893005371
3   0.669101624066   63.925057888031006
4   0.668133123187   96.44778800010681
5   0.668215354394   111.33684587478638
6   0.668151396788   90.71089696884155
7   0.668183375591   106.03336691856384


KeyboardInterrupt: 

In [29]:
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Cross-validate two hidden layers on the scaled training features.
# hidden_layer_sizes is (i, j): i is the first layer's width (1..7) and
# j the second layer's width (1..10). Each printed row is ordered
# "j  i  mean-score  seconds", i.e. the second layer's width comes first.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
train_labels = y_train.values.ravel()
for j in range(1,11):
    for i in range(1,8):
        clf = MLPClassifier(
            solver='lbfgs',
            alpha=1e-5,
            activation='identity',
            hidden_layer_sizes=(i,j),
            random_state=1,
        )
        start = time()
        scores = cross_val_score(clf, x_train_normalize, train_labels, cv=cv)
        mean_score = scores.mean()
        print(str(j) , " " , str(i) , " " , mean_score , " " , time() - start)

1   1   0.62987277005   16.85921287536621
1   2   0.629868201649   30.49816608428955
1   3   0.629891043651   24.099575996398926
1   4   0.629891043651   22.664977073669434
1   5   0.629923022454   27.34083104133606
1   6   0.629891043651   34.44629788398743
1   7   0.629945864456   37.958130836486816
2   1   0.629822517645   32.23588991165161
2   2   0.666186984627   77.3036630153656
2   3   0.666205258229   72.6077778339386
2   4   0.666228100231   78.97756290435791
2   5   0.666250942233   56.92984199523926
2   6   0.666246373832   52.587766885757446
2   7   0.666173279426   59.60202503204346
3   1   0.629891043651   38.26418709754944
3   2   0.666228100231   77.42074704170227
3   3   0.668868635647   101.28769397735596
3   4   0.669097055666   77.0590238571167
3   5   0.66902396126   91.62277698516846


KeyboardInterrupt: 

In [30]:
# These two names are imported here but not used by this cell.
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Final model: one hidden layer of 3 units, identity activation, L-BFGS.
# It is fitted on the full scaled training set; fit() returns the estimator
# itself, so the fitted model is bound to clf.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    activation='identity',
    hidden_layer_sizes=(3,),
    random_state=1,
    max_iter=200,
).fit(x_train_normalize, y_train.values.ravel())

# Accuracy on the scaled test set (displayed as the cell's output).
clf.score(x_test_normalize,y_test)

0.66993992923169765