In [None]:
import os
import joblib
# Core
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
from imblearn.over_sampling import SMOTE
import itertools
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import time

# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
# from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# import eli5
# from eli5.sklearn import PermutationImportance
from sklearn.utils import resample

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Resolve the dataset folder relative to the directory the notebook runs from

curr_path = os.getcwd()
dataset_src = os.path.join(curr_path, 'dataset')

# Read each CSV of the dataset folder into its own DataFrame, in the order listed
(sales_train, test, item_categories,
 items, shops, sample_submission) = [
    pd.read_csv(os.path.join(dataset_src, csv_name))
    for csv_name in (
        'sales_train.csv',
        'test.csv',
        'item_categories.csv',
        'items.csv',
        'shops.csv',
        'sample_submission.csv',
    )
]


In [None]:
# Missing values?

# Column dtypes and non-null counts for the main sales table
sales_train.info()

# Per-column null counts for every table, separated by a double rule
null_report_rule = '-----------'
for table_position, table in enumerate((sales_train, test, item_categories, items, shops)):
    if table_position > 0:
        print(null_report_rule)
        print(null_report_rule)
    print(table.isnull().sum())

In [None]:
# Plain-text dump of every loaded table, with a rule between consecutive tables
frame_dump_rule = '---------------------------------------------'
loaded_frames = (sales_train, test, item_categories, items, shops, sample_submission)
for frame_position, loaded_frame in enumerate(loaded_frames):
    if frame_position > 0:
        print(frame_dump_rule)
    print(loaded_frame)

In [None]:
# Display the sales_train DataFrame as this cell's output
sales_train


In [1]:
# Summarise every column of sales_train: the Python type and value of its first
# row, plus the number of distinct values it holds.
list_features = sales_train.columns
print('They are',len(list_features),'features in the dataset.')
print('----------------')
for f in list_features:
    column = sales_train[f]
    # .iloc[0] is positional, so the first row is found even when the index
    # does not contain the label 0 (e.g. after filtering or re-indexing).
    first_value = column.iloc[0]
    # nunique(dropna=False) counts NaN as a value, matching len(column.unique())
    print('feature:', f, '|| Type:', type(first_value), '|| Example:', first_value, '|| number of unique values', column.nunique(dropna=False))

NameError: name 'sales_train' is not defined

In [170]:
# Total of item_cnt_day over every row of the training set
sales_train.loc[:, ['item_cnt_day']].sum()

item_cnt_day    3648206.0
dtype: float64

In [171]:
# Units sold per distinct `date` value.
# The recorded output lists the keys as 01.01.2013, 01.01.2014, 01.01.2015, ...,
# i.e. in string order rather than chronological order.
units_by_date = sales_train.groupby(['date'])['item_cnt_day'].sum()
units_by_date

date
01.01.2013     1951.0
01.01.2014     2310.0
01.01.2015     2117.0
01.02.2013     3817.0
01.02.2014     5711.0
               ...   
31.10.2013     3826.0
31.10.2014     3014.0
31.10.2015     3104.0
31.12.2013    10514.0
31.12.2014    11394.0
Name: item_cnt_day, Length: 1034, dtype: float64

In [161]:
# Units sold per shop, shown as a one-column frame with the largest total first
units_by_shop = sales_train.groupby(['shop_id'])['item_cnt_day'].sum()
units_by_shop.sort_values(ascending=False).to_frame()

Unnamed: 0_level_0,item_cnt_day
shop_id,Unnamed: 1_level_1
31,310777.0
25,241920.0
54,185790.0
28,184557.0
42,144934.0
57,141107.0
27,136657.0
6,100489.0
58,81734.0
46,78990.0


In [176]:
# Units sold per item, shown as a one-column frame with the largest total first.
# A single descending sort is enough; the second identical sort was redundant.
# NOTE(review): the recorded output contains negative totals (-1.0) — presumably
# returns; confirm before using these totals as demand.
sales_train.groupby(['item_id']).item_cnt_day.sum().sort_values(ascending=False).to_frame()

Unnamed: 0_level_0,item_cnt_day
item_id,Unnamed: 1_level_1
20949,187642.0
2808,17245.0
3732,16642.0
17717,15830.0
5822,14515.0
...,...
7547,0.0
13474,-1.0
18062,-1.0
11871,-1.0


In [175]:
# Units sold for every (shop_id, item_id) pair present in the training set
units_by_shop_item = sales_train.groupby(['shop_id', 'item_id'])['item_cnt_day'].sum()
units_by_shop_item
    

shop_id  item_id
0        30         31.0
         31         11.0
         32         16.0
         33          6.0
         35         15.0
                    ... 
59       22154       1.0
         22155       1.0
         22162      16.0
         22164       6.0
         22167       4.0
Name: item_cnt_day, Length: 424124, dtype: float64

In [157]:
# Shop id next to its name
shops.loc[:, ['shop_id', 'shop_name']]

Unnamed: 0,shop_id,shop_name
0,0,"!Якутск Орджоникидзе, 56 фран"
1,1,"!Якутск ТЦ ""Центральный"" фран"
2,2,"Адыгея ТЦ ""Мега"""
3,3,"Балашиха ТРК ""Октябрь-Киномир"""
4,4,"Волжский ТЦ ""Волга Молл"""
5,5,"Вологда ТРЦ ""Мармелад"""
6,6,"Воронеж (Плехановская, 13)"
7,7,"Воронеж ТРЦ ""Максимир"""
8,8,"Воронеж ТРЦ Сити-Парк ""Град"""
9,9,Выездная Торговля


In [None]:
# Units sold at each distinct item_price, largest total first
units_by_price = sales_train.groupby(['item_price'])['item_cnt_day'].sum()
units_by_price.sort_values(ascending=False)

In [None]:
# Define classifiers

# Candidate models, keyed by the same name that selects their search grid.
# Only CatBoost is enabled. Disabled candidates, kept for reference:
#     "LogisticRegression" : LogisticRegression(random_state=0)
#     "KNN" : KNeighborsClassifier()
#     "SVC" : SVC(random_state=0, probability=True)
#     "RandomForest" : RandomForestClassifier(random_state=0)
#     "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0)
#     "XGBoost" : XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss')  # XGBoost takes too long
#     "LGBM" : LGBMClassifier(random_state=0, force_col_wise=True)
#     "NaiveBayes": GaussianNB()
classifiers = {}
classifiers["CatBoost"] = CatBoostClassifier(random_state=0, verbose=False)

# Grids for grid search
# Each dict maps an estimator keyword argument to the list of values to try.

# 'liblinear' is requested explicitly: scikit-learn's default solver ('lbfgs')
# does not support penalty='l1', so without it every l1 candidate fails to fit.
LR_grid = {'penalty': ['l1','l2'],
           'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
           'max_iter': [50, 100, 150],
           'solver': ['liblinear']}

KNN_grid = {'n_neighbors': [3, 5, 7, 9],
            'p': [1, 2]}

SVC_grid = {'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']}

RF_grid = {'n_estimators': [50, 100, 150, 200, 250, 300],
        'max_depth': [4, 6, 8, 10, 12]}

# One grid shared by all the boosted-tree models
boosted_grid = {'n_estimators': [50, 100, 150, 200],
        'max_depth': [4, 8, 12],
        'learning_rate': [0.05, 0.1, 0.15]}

NB_grid={'var_smoothing': [1e-10, 1e-9, 1e-8, 1e-7]}

# Dictionary of all grids
# Maps each classifier name to its parameter grid; the four boosted models
# all point at the same boosted_grid object.
grid = {"LogisticRegression" : LR_grid, "KNN" : KNN_grid, "SVC" : SVC_grid, "RandomForest" : RF_grid}
for boosted_model_name in ("GradientBoostingClassifier", "XGBoost", "LGBM", "CatBoost"):
    grid[boosted_model_name] = boosted_grid
grid["NaiveBayes"] = NB_grid

In [None]:
'''
Train and evaluate models

Train models with grid search (using only GridSearchCV's default internal cross-validation, with no additional outer cross-validation loop, so it doesn't take too long) to get a rough idea of which models are best for this dataset.
'''

# Best hyper-parameters per classifier name, filled in by the loop below
clf_best_params = {}

# One row per classifier: hold-out accuracy and wall-clock search time (minutes).
# The 'Classifer' column label is kept exactly as it was so the table schema does not change.
valid_scores=pd.DataFrame({'Classifer':list(classifiers), 'Validation accuracy': np.zeros(len(classifiers)), 'Training time': np.zeros(len(classifiers))})

for row, (key, classifier) in enumerate(classifiers.items()):
    start = time.time()
    # cv=5 spells out what cv=None resolves to in GridSearchCV: every candidate
    # is scored with 5-fold cross-validation on the training data.
    clf = GridSearchCV(estimator=classifier, param_grid=grid[key], n_jobs=8, cv=5, verbose=2)

    # Train and score
    # NOTE(review): X_train / y_train / X_valid / y_valid are not created in any
    # visible cell — confirm where the hold-out split comes from.
    clf.fit(X_train, y_train)
    valid_scores.iloc[row,1]=clf.score(X_valid, y_valid)

    # Keep only the winning hyper-parameters (the fitted search object is not stored)
    clf_best_params[key]=clf.best_params_

    # Print iteration and training time
    stop = time.time()
    valid_scores.iloc[row,2]=np.round((stop - start)/60, 2)

    print('Model:', key)
    print('Training time (mins):', valid_scores.iloc[row,2])
    print('')

In [None]:
# Show results: one row per classifier with its validation accuracy and training time in minutes
valid_scores

In [None]:
'''
Motivated by this, we will take LGBM and CatBoost to the final stage of modelling.


'''

# Show best parameters from grid search (the best_params_ stored under each classifier name)
clf_best_params

In [None]:
'''
Modelling
We can finally train our best model on the whole training set using cross validation and ensembling predictions 
together to produce the most confident predictions.

Define best models
'''

# Classifiers
# Final-stage models, rebuilt from the hyper-parameters stored in clf_best_params.
# Disabled alternatives, kept for reference:
#     "XGBoost" : XGBClassifier(**clf_best_params["XGBoost"], random_state=0)
#     "LGBM": GradientBoostingClassifier(**clf_best_params["LGBM"], verbose=2, random_state=0)
# NOTE(review): the disabled "LGBM" entry builds a GradientBoostingClassifier,
# not an LGBMClassifier — confirm before re-enabling it.
catboost_best_params = clf_best_params["CatBoost"]
best_classifiers = {
    "CatBoost" : CatBoostClassifier(verbose=2, random_state=0, **catboost_best_params),
}

In [None]:
'''
Cross validation and ensembling predictions

Predictions are ensembled together using soft voting. This averages the predicted probabilities to produce the most confident predictions.
'''

# Number of folds in cross validation
FOLDS=10

# Running sum of positive-class probabilities for the test set; it is divided by
# the number of fitted models after the loop. Re-created here so that re-running
# the cell starts from zero.
preds=np.zeros(len(X_test))
for key, classifier in best_classifiers.items():
    start = time.time()

    # Stratified folds, shuffled with a fixed seed
    cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=0)

    score=0
    for train_idx, val_idx in cv.split(X, y):
        # Fold-local names: the X_train / X_valid / y_train / y_valid hold-out
        # split used by the grid-search cell is no longer overwritten here.
        # NOTE(review): X[train_idx] selects rows by position only if X and y are
        # NumPy arrays — neither is defined in the visible cells, confirm.
        X_fold_train, X_fold_valid = X[train_idx], X[val_idx]
        y_fold_train, y_fold_valid = y[train_idx], y[val_idx]

        # Train model (the same estimator object is fitted again on every fold)
        clf = classifier
        clf.fit(X_fold_train, y_fold_train)

        # Accumulate test-set probabilities (column 1) and this fold's accuracy
        preds += clf.predict_proba(X_test)[:,1]
        score += clf.score(X_fold_valid, y_fold_valid)

    # Average accuracy
    score=score/FOLDS

    # Stop timer
    stop = time.time()

    # Print accuracy and time
    print('Model:', key)
    print('Average validation accuracy:', np.round(100*score,2))
    print('Training time (mins):', np.round((stop - start)/60,2))
    print('')

# Ensemble predictions: mean over every fold of every model
preds=preds/(FOLDS*len(best_classifiers))

In [None]:
# Proportion of predicted positive (transported) classes
def preds_prop(preds_arr, thresh):
    """Return the fraction of predictions classified positive at ``thresh``.

    Parameters
    ----------
    preds_arr : array-like
        Predicted scores. Lists and tuples are accepted as well as NumPy
        arrays; the input is converted with ``np.asarray``.
    thresh : float
        A score counts as positive when it is ``>= thresh``.

    Returns
    -------
    float
        Number of positive predictions divided by ``len(preds_arr)``;
        ``nan`` when ``preds_arr`` is empty.
    """
    scores = np.asarray(preds_arr)
    if scores.size == 0:
        # No predictions: the proportion is undefined, so avoid dividing by zero
        return np.nan
    pred_classes = (scores >= thresh).astype(int)
    return pred_classes.sum()/len(pred_classes)

# Plot proportions across a range of thresholds
def plot_preds_prop(preds_arr, target_prop=0.519):
    """Plot the predicted-positive proportion against the threshold and pick a threshold.

    Parameters
    ----------
    preds_arr : array of predicted scores
        Passed unchanged to ``preds_prop`` for every threshold.
    target_prop : float, default 0.519
        Proportion of positive predictions to aim for, drawn as a dashed red
        line. Experiment with this value.

    Returns
    -------
    The threshold from ``np.arange(0, 1, 0.001)`` whose proportion is closest
    to ``target_prop`` (the first such threshold on ties, as ``argmin`` does).
    The chosen threshold is also printed.
    """
    # Array of thresholds
    T_array=np.arange(0,1,0.001)

    # Calculate proportions
    prop=np.array([preds_prop(preds_arr, T) for T in T_array], dtype=float)

    # Plot proportions
    plt.figure(figsize=(10,4))
    plt.plot(T_array, prop)
    plt.axhline(y=target_prop, color='r', linestyle='--')
    plt.text(-0.02,0.45,f'Target proportion: {target_prop}', fontsize=14)
    plt.title('Predicted target distribution vs threshold')
    plt.xlabel('Threshold')
    plt.ylabel('Proportion')

    # Find optimal threshold (the one that leads to the proportion being closest to target_prop)
    T_opt=T_array[np.abs(prop-target_prop).argmin()]
    print('Optimal threshold:', T_opt)
    return T_opt
    
# Plot the proportion curve for the ensembled predictions and keep the threshold it returns
T_opt=plot_preds_prop(preds)

In [None]:
# Binarise the ensembled scores: 1 where the score reaches T_opt, 0 otherwise
reaches_threshold = np.greater_equal(preds, T_opt)
preds_tuned = reaches_threshold.astype(int)

In [None]:
# Sample submission (to get right format), read from the same dataset folder as the other CSVs
sub=pd.read_csv(os.path.join(dataset_src, 'sample_submission.csv'))

# Add predictions as booleans.
# Only this column is converted: the previous sub.replace({0: False, 1: True})
# also rewrote 0/1 values in every other column of the frame.
# NOTE(review): confirm 'Transported' is the target column this competition's
# sample_submission.csv expects.
sub['Transported']=np.asarray(preds_tuned).astype(bool)

# Prediction distribution
class_counts=sub['Transported'].value_counts()
plt.figure(figsize=(6,6))
# One explode offset per class actually present, so a single-class prediction still plots
class_counts.plot.pie(explode=[0.1]*len(class_counts), autopct='%1.1f%%', shadow=True, textprops={'fontsize':16}).set_title("Prediction distribution")

In [None]:
# Output to csv: written relative to the current working directory, without the row index
sub.to_csv('submission_8113CB.csv', index=False)

In [None]:
# with open('submission_8113CB.csv', 'r') as t1, open('submission_8117CB.csv', 'r') as t2:
#     fileone = t1.readlines()
#     filetwo = t2.readlines()

# with open('diff.csv', 'w') as outFile:
#     for line in filetwo:
#         if line not in fileone:
#             outFile.write(line)