In [1]:
from sklearn import model_selection
from sklearn import (svm, 
                     neighbors, 
                     ensemble, 
                    linear_model, 
                    naive_bayes, 
                    tree, 
                    discriminant_analysis)

In [3]:
import pandas as pd

In [2]:
import time

In [9]:
# Load the preprocessed training set from the local data directory.
# NOTE(review): the loaded frame shows an 'Unnamed: 0' column, which looks like
# a saved index — consider index_col=0 if that is confirmed.
data = pd.read_csv(filepath_or_buffer='./data/preprocessed_train_data.csv')

In [17]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 13 columns):
Unnamed: 0       913000 non-null int64
date             913000 non-null object
store            913000 non-null int64
item             913000 non-null int64
sales            913000 non-null int64
Datetime_date    913000 non-null object
year             913000 non-null int64
month            913000 non-null int64
day              913000 non-null int64
weekday          913000 non-null int64
isWeekend        913000 non-null int64
week             913000 non-null int64
isHoliday        913000 non-null int64
dtypes: int64(11), object(2)
memory usage: 90.6+ MB
None


# Baseline Models

In [10]:
# Baseline: one divided by the number of distinct values in the 'sales' column.
n_distinct_sales = len(data['sales'].unique())
print("Baseline Model:", 1 / n_distinct_sales)

Baseline Model: 0.004694835680751174


# Train Test Split

In [40]:
# Work on a 20,000-row random subset of the full frame.
# The sample is seeded so that a fresh "Restart & Run All" draws the same rows
# (the train/test split in this notebook is already pinned with random_state=0).
sample_data = data.sample(20000, random_state=0)

In [41]:
# Column to predict (kept as a one-element list).
target = ['sales']
# Predictor columns handed to the models.
features = [
    'store', 'item',
    'year', 'month', 'week',
    'isHoliday', 'day', 'isWeekend', 'weekday',
]

In [42]:
# Split the sampled rows into train and test partitions with a fixed seed.
splits = model_selection.train_test_split(
    sample_data[features],
    sample_data[target],
    random_state=0,
)
train_X, test_X = splits[0], splits[1]
# The targets come back as one-column DataFrames; reduce them to the 'sales' Series.
train_y, test_y = splits[2]['sales'], splits[3]['sales']

In [43]:
# Empty results table: one row per tested algorithm will be written into it.
result_columns = ['Name', 'Train_Score', 'Test_Score', 'Time', 'Parameters']
MLA_Compare = pd.DataFrame(columns=result_columns)

In [44]:
# Candidate estimators to compare, each instantiated with its default parameters.
MLA = [
    make_estimator()
    for make_estimator in (
        # ensemble
        ensemble.AdaBoostClassifier,
        ensemble.AdaBoostRegressor,
        ensemble.RandomForestClassifier,
        ensemble.RandomForestRegressor,
        ensemble.BaggingClassifier,
        ensemble.GradientBoostingRegressor,
        ensemble.ExtraTreesRegressor,

        # nearest neighbor
        neighbors.KNeighborsClassifier,

        # svm
        svm.LinearSVR,
        svm.SVR,
        svm.NuSVR,

        # tree
        tree.DecisionTreeClassifier,
        tree.DecisionTreeRegressor,
    )
]

In [45]:
def ml_training(MLA, pd_dataframe, X_train=None, y_train=None, X_test=None, y_test=None):
    """Fit every estimator, score it on train and test data, and tabulate the results.

    Parameters
    ----------
    MLA : iterable
        Estimator objects exposing ``fit(X, y)``, ``score(X, y)`` and
        ``get_params()``.
    pd_dataframe : pandas.DataFrame
        Results table, modified in place. One row per estimator is written at
        integer labels 0, 1, 2, ... holding, in column order: class name,
        train score, test score, fit time in seconds, and the parameter dict.
    X_train, y_train, X_test, y_test : optional
        Data to fit and score on. Any argument left as ``None`` falls back to
        the notebook-level ``train_X`` / ``train_y`` / ``test_X`` / ``test_y``.

    Returns
    -------
    pandas.DataFrame
        The same ``pd_dataframe`` object, sorted by ``'Test_Score'`` descending.

    Notes
    -----
    The score is whatever each estimator's own ``score`` method returns.
    NOTE(review): for scikit-learn estimators this is presumably accuracy for
    classifiers and R^2 for regressors, so rows of the two kinds would not be
    directly comparable — confirm before ranking them together.
    """
    # Fall back to the notebook globals only for arguments the caller omitted.
    X_train = train_X if X_train is None else X_train
    y_train = train_y if y_train is None else y_train
    X_test = test_X if X_test is None else X_test
    y_test = test_y if y_test is None else y_test

    for row_number, alg in enumerate(MLA):
        alg_name = alg.__class__.__name__
        print("starting:", alg_name)

        # perf_counter is monotonic, so the measured fit time cannot be skewed
        # by wall-clock adjustments while a long fit is running.
        start_time = time.perf_counter()
        alg.fit(X_train, y_train)
        time_taken = time.perf_counter() - start_time

        train_score = alg.score(X_train, y_train)
        test_score = alg.score(X_test, y_test)

        # add to pandas dataframe
        pd_dataframe.loc[row_number] = [alg_name, train_score, test_score, time_taken, alg.get_params()]

    # Sorted in place so the caller's frame and the returned frame are the same object.
    pd_dataframe.sort_values(by=['Test_Score'], ascending=False, inplace=True)
    print('done')
    return pd_dataframe

In [None]:
# Fit and score every estimator in MLA; the returned table is sorted by Test_Score.
MLA_Compare = ml_training(MLA=MLA, pd_dataframe=MLA_Compare)

starting: AdaBoostClassifier
starting: AdaBoostRegressor
starting: RandomForestClassifier
starting: RandomForestRegressor
starting: BaggingClassifier
starting: GradientBoostingRegressor
starting: ExtraTreesRegressor
starting: KNeighborsClassifier
starting: LinearSVR
starting: SVR


In [None]:
# Display the results table returned by ml_training.
MLA_Compare