In [1]:
from sklearn import model_selection, metrics
from sklearn import (svm, 
                     neighbors, 
                     ensemble, 
                    linear_model, 
                    naive_bayes, 
                    tree, 
                    discriminant_analysis)

In [2]:
import pandas as pd

In [3]:
import time

In [4]:
train_data = pd.read_csv('./data/preprocessed_train_data.csv')
test_data = pd.read_csv('./data/preprocessed_test_data.csv')

In [5]:
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 18 columns):
Unnamed: 0                 913000 non-null int64
date                       913000 non-null object
store                      913000 non-null int64
item                       913000 non-null int64
sales                      913000 non-null int64
Datetime_date              913000 non-null object
year                       913000 non-null int64
month                      913000 non-null int64
day                        913000 non-null int64
weekday                    913000 non-null int64
week_of_year               913000 non-null int64
isWeekend                  913000 non-null int64
isHoliday                  913000 non-null int64
store_sales_sum            913000 non-null int64
store_monthly_sales_sum    913000 non-null int64
store_item_sales_sum       913000 non-null int64
store_item_sales_median    913000 non-null float64
item_sales_sum             913000 non-null int64
dty

# Baseline Models

In [6]:
print("Baseline Model:", 1 / len(train_data['sales'].unique().tolist()))

Baseline Model: 0.004694835680751174


# Train Test Split

In [7]:
sample_data = train_data.sample(20000)

In [8]:
target = ['sales']
features = ['store', 'item', 
            'year', 'month', 'day',
            'weekday', 'week_of_year','isWeekend', 'isHoliday',
            'store_sales_sum', 'store_monthly_sales_sum',
            'store_item_sales_sum', 'store_item_sales_median', 
            'item_sales_sum'
           ]

In [9]:
train_X, test_X, train_y, test_y = model_selection.train_test_split(sample_data[features], sample_data[target], random_state=0)
train_y = train_y['sales']
test_y = test_y['sales']

In [10]:
# List of MLA Algorithms
MLA = [
    # ensemble
    ensemble.AdaBoostClassifier(),
    ensemble.AdaBoostRegressor(),
    ensemble.RandomForestClassifier(), 
    ensemble.RandomForestRegressor(),
    ensemble.BaggingClassifier(),
    ensemble.GradientBoostingRegressor(),
    ensemble.ExtraTreesRegressor(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    # svm
    svm.LinearSVR(), 
    svm.SVR(), 
    svm.NuSVR(),
    
    
    # tree
    tree.DecisionTreeClassifier(),
    tree.DecisionTreeRegressor(),    
]

In [11]:
def ml_training(MLA):
    pd_dataframe = pd.DataFrame(columns=['Name', 'Train_Score', 'Test_Score', 'Time', 'Parameters'])
    row_number = 0
    for alg in MLA:
        alg_name = alg.__class__.__name__
        print("starting:", alg_name)

        start_time = time.time()
        alg.fit(train_X, train_y)
        end_time = time.time()
        time_taken = end_time - start_time

        train_score = alg.score(train_X, train_y)
        test_score = alg.score(test_X, test_y)

        # add to pandas dataframe
        pd_dataframe.loc[row_number] = [alg_name, train_score, test_score, time_taken, alg.get_params()]
        row_number+=1
        
    pd_dataframe.sort_values(by=['Test_Score'], ascending=False, inplace=True)
    print('done')
    return pd_dataframe

In [12]:
MLA_Compare = ml_training(MLA)

starting: AdaBoostClassifier
starting: AdaBoostRegressor
starting: RandomForestClassifier
starting: RandomForestRegressor
starting: BaggingClassifier
starting: GradientBoostingRegressor
starting: ExtraTreesRegressor
starting: KNeighborsClassifier
starting: LinearSVR
starting: SVR
starting: NuSVR
starting: DecisionTreeClassifier
starting: DecisionTreeRegressor
done


In [13]:
MLA_Compare

Unnamed: 0,Name,Train_Score,Test_Score,Time,Parameters
5,GradientBoostingRegressor,0.938103,0.937171,1.352149,"{'alpha': 0.9, 'criterion': 'friedman_mse', 'i..."
3,RandomForestRegressor,0.985322,0.924343,0.943507,"{'bootstrap': True, 'criterion': 'mse', 'max_d..."
6,ExtraTreesRegressor,1.0,0.920069,0.645207,"{'bootstrap': False, 'criterion': 'mse', 'max_..."
1,AdaBoostRegressor,0.895708,0.897941,1.286327,"{'base_estimator': None, 'learning_rate': 1.0,..."
12,DecisionTreeRegressor,1.0,0.861496,0.158909,"{'criterion': 'mse', 'max_depth': None, 'max_f..."
8,LinearSVR,0.681426,0.698091,2.802752,"{'C': 1.0, 'dual': True, 'epsilon': 0.0, 'fit_..."
4,BaggingClassifier,0.990733,0.0446,3.004971,"{'base_estimator': None, 'bootstrap': True, 'b..."
11,DecisionTreeClassifier,1.0,0.0436,0.441736,"{'class_weight': None, 'criterion': 'gini', 'm..."
2,RandomForestClassifier,0.988467,0.0402,1.282248,"{'bootstrap': True, 'class_weight': None, 'cri..."
0,AdaBoostClassifier,0.036733,0.0322,5.755534,"{'algorithm': 'SAMME.R', 'base_estimator': Non..."


In [15]:
best_clf = ensemble.GradientBoostingRegressor()
best_clf.fit(train_X, train_y)
best_clf.score(test_X, test_y)

0.9371618116803591

# Submission

In [26]:
predictions = best_clf.predict(test_data[features])
predictions = predictions.astype(int)
pandas_id = [i for i in range(len(predictions))]

In [28]:
print(predictions)
print(f'{len(pandas_id)} {len(predictions)}')

[12 15 15 ... 29 31 32]
45000 45000


In [20]:
pd.read_csv('./data/sample_submission.csv.zip').head()

Unnamed: 0,id,sales
0,0,52
1,1,52
2,2,52
3,3,52
4,4,52


In [29]:
submission_dataframe = pd.DataFrame({'id':pandas_id, 'sales':predictions})

In [32]:
submission_dataframe.tail()

Unnamed: 0,id,sales
44995,44995,27
44996,44996,27
44997,44997,29
44998,44998,31
44999,44999,32


In [34]:
submission_dataframe.to_csv('./data/sumbission.csv')