In [7]:
import numpy as np
import pandas as pd
from time import time
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Data preparation: load the training and test review CSVs.
# The directory is named once so that relocating the data needs a single edit.
DATA_DIR = "/Users/fabbas1/Google Drive/study/Phd/Machine Learning/assignment/ITCS6156_SLProject/AmazonReviews/"
data = pd.read_csv(DATA_DIR + "amazon_baby_train_clean.csv")
data_test = pd.read_csv(DATA_DIR + "amazon_baby_test_clean.csv")

# Drop unnecessary columns, identically for both splits.
for frame in (data, data_test):
    for column in ('Unnamed: 0', 'name_processed', 'review_processed'):
        del frame[column]

In [9]:
# Build a bag-of-words term-count matrix (CountVectorizer produces raw counts,
# not tf-idf weights); min_df=150 drops terms that occur in fewer than 150 documents.
transformer = CountVectorizer(min_df=150)
# Coerce both splits to unicode the same way, so a missing value (NaN) in
# either one is vectorized as text instead of being rejected as a non-string document.
x_train = transformer.fit_transform(data.merged.values.astype('U'))
x_test = transformer.transform(data_test["merged"].values.astype('U'))
y_train = data['rating']
y_test = data_test['rating']
x_train.shape

(145927, 3149)

In [10]:
# Scale every feature to unit variance. with_mean=False turns centering off,
# so the result is NOT zero-mean and is not confined to a fixed range.
scaler = StandardScaler(with_mean=False)
# The scaling statistics are learned from the training matrix only and then
# reused unchanged on the test matrix.
x_train_normalize = scaler.fit_transform(x_train)
x_test_normalize = scaler.transform(x_test)
# NOTE(review): the later cells visible here train on x_train / x_test rather
# than on these scaled matrices - confirm whether that is intended.



In [5]:
# Base case: one decision tree, entropy split criterion, depth capped at 11.
baseline_params = {'criterion': 'entropy', 'max_depth': 11}
dt = DecisionTreeClassifier(**baseline_params)
dt.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=11,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [6]:
# Mean accuracy of the base-case tree: training split first, then test split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print( dt.score(features, labels) )

0.628485475615
0.607071344323


In [12]:

'0123456789abcdef\n')
f.close()
# Append to the results file; the context manager closes the handle even if
# the write raises.
with open('amazon_results', 'a') as f:
    f.write('0123456789abcdef')

In [15]:
# Sweep over tree depth (1..11) and ensemble size. For every combination, fit
# a single decision tree and an AdaBoost ensemble that uses the same tree
# configuration, then report train/test accuracy and wall-clock fit time for both.
# Each row is printed and also appended to the 'amazon_results' file.
np.set_printoptions(precision=4)
print("depth,estimator,stump/train,stump/score,stump_time,ada/train,ada/score,ada_time")
estimators = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
for depth in range(1, 12):
    for estimator in estimators:
        # stump (single tree)
        # NOTE(review): the single tree depends only on depth, so it is refit
        # with the same settings for every ensemble size.
        dt_stump = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=1)
        start_stump_time = time()
        dt_stump.fit(x_train, y_train)
        end_stump_time = time() - start_stump_time  # elapsed seconds, not a timestamp
        stump_score_train = dt_stump.score(x_train, y_train)
        stump_score_test = dt_stump.score(x_test, y_test)

        # ada boost
        # The tree is passed positionally so the call works whether the
        # installed scikit-learn names this parameter base_estimator or estimator.
        ada = AdaBoostClassifier(dt_stump, n_estimators=estimator)
        start_ada_time = time()
        ada.fit(x_train, y_train)
        end_ada_time = time() - start_ada_time  # elapsed seconds, not a timestamp

        ada_score_train = ada.score(x_train, y_train)
        ada_score_test = ada.score(x_test, y_test)

        # Build the row once and reuse it for both the console and the file.
        row = [str(value) for value in (depth, estimator, stump_score_train, stump_score_test,
                                        end_stump_time, ada_score_train, ada_score_test, end_ada_time)]
        print(" , ".join(row))
        # Open the file only for the duration of the write: each finished row
        # is on disk before the next long fit starts, and the handle is closed
        # even if the write fails.
        with open('amazon_results', 'a') as f:
            f.write(",".join(row) + '\n')

depth,estimator,stump/train,stump/score,stump_time,ada/train,ada/score,ada_time
1 , 50 , 0.583456111617 , 0.579394903585 , 0.5336289405822754 , 0.633885435869 , 0.628521271635 , 31.354063987731934
1 , 100 , 0.583456111617 , 0.579394903585 , 0.5314569473266602 , 0.64469906186 , 0.639383383164 , 60.146493911743164
1 , 150 , 0.583456111617 , 0.579394903585 , 0.6959149837493896 , 0.651085816881 , 0.645939051485 , 93.26130986213684
1 , 200 , 0.583456111617 , 0.579394903585 , 0.5205800533294678 , 0.656115729097 , 0.650300353842 , 122.62451100349426
1 , 250 , 0.583456111617 , 0.579394903585 , 0.5084760189056396 , 0.658781445517 , 0.653317607044 , 144.9873969554901
1 , 300 , 0.583456111617 , 0.579394903585 , 0.557887077331543 , 0.661721271595 , 0.655402254711 , 179.11492490768433
1 , 350 , 0.583456111617 , 0.579394903585 , 0.5672180652618408 , 0.662954765054 , 0.656389719395 , 210.70424795150757
1 , 400 , 0.583456111617 , 0.579394903585 , 0.49732303619384766 , 0.664386988015 , 0.657404613654 ,

In [11]:
# Cross validation: estimate the accuracy of a 400-round AdaBoost ensemble of
# depth-3 trees over 5 random 70/30 shuffles of the training data.
# NOTE(review): these imports would normally sit with the other imports at the
# top of the notebook.
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

dt_stump = DecisionTreeClassifier(max_depth=3, min_samples_leaf=1)
# The tree is passed positionally so the call works whether the installed
# scikit-learn names this parameter base_estimator or estimator.
ada = AdaBoostClassifier(dt_stump, n_estimators=400)
# random_state=0 fixes the shuffles; the classifiers themselves are not seeded.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
scores = cross_val_score(ada, x_train, y_train.values.ravel(), cv=cv)

print(scores.mean())


0.646725599031
