# Data Splitting

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
sb.set()

# Importing & Sorting Data 

In [None]:
# Load the cleaned customer dataset.
# A forward slash is used as the separator: '\c' is not a valid escape
# sequence, and a backslash-separated path only resolves on Windows.
customerData = pd.read_csv('dataset/customerData_Clean.csv')

In [28]:
# Preview the first rows of the loaded frame.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the stored output look like
# row indexes written by earlier CSV exports — confirm and drop them if they are not needed.
customerData.head()

Unnamed: 0.1,Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,...,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,MntGroceryProducts,TotalPurchase,AcceptedCmp,YearRange,TotalChild,HaveChild
0,0,1957,Graduation,Single,58138.0,0,0,58,635,88,...,8,10,4,7,806,25,1,1950,0,0
1,1,1954,Graduation,Single,46344.0,1,1,38,11,1,...,1,1,2,5,9,6,0,1950,2,1
2,2,1965,Graduation,Married,71613.0,0,0,26,426,49,...,8,2,10,4,287,21,0,1960,0,0
3,3,1984,Graduation,Married,26646.0,1,0,26,11,4,...,2,0,4,6,34,8,0,1980,1,1
4,4,1981,PhD,Married,58293.0,1,0,94,173,43,...,5,3,6,5,207,19,0,1980,1,1


In [10]:
# Order the columns of the cleaned dataset by label (rows are left as they
# are) and store the result in its own CSV file for later use.
# NOTE(review): to_csv also writes the row index as an extra column by
# default — confirm that downstream readers expect it.
sorted_customerData = customerData.sort_index(axis="columns")
sorted_customerData.to_csv('dataset/sorted_customerData.csv')

### Splitting the Train and Test sets for all variables

In [33]:
from sklearn.model_selection import train_test_split

def normal_splitting(X, Y, test_size=0.2, shuffle=False, random_state=100):
    """Split predictors and responses into train and test partitions.

    With the default ``shuffle=False`` the split is sequential rather than
    random: the leading rows form the train set and the trailing rows form
    the test set. ``random_state`` only has an effect when ``shuffle=True``.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Predictor data.
    Y : pandas.DataFrame or pandas.Series
        Response data with the same number of rows as ``X``.
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``.
    shuffle : bool, default False
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 100
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test, train_data, test_data)``.
        Note that this order differs from the one ``train_test_split``
        itself returns. ``train_data`` / ``test_data`` are the predictors
        and responses of each partition joined side by side.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, shuffle=shuffle, random_state=random_state
    )

    # Report the sample sizes so the split can be checked at a glance.
    print("Train Set :", X_train.shape, Y_train.shape)
    print("Test Set  :", X_test.shape, Y_test.shape)

    # Join predictors and responses column-wise for each partition.
    train_data = pd.concat([X_train, Y_train], axis=1)
    test_data = pd.concat([X_test, Y_test], axis=1)

    return X_train, Y_train, X_test, Y_test, train_data, test_data

In [None]:
# response_var = ["MntGroceryProducts", "TotalPurchase", "MntWines", "MntGoldProds"]
# cat_predictors = ["Education", "Marital_Status", "HaveChild", "YearRange"]
# num_predictors = ["Income", "TotalChild", "NumWebVisitsMonth"]

In [18]:
# Extract Response and Predictors
# The 'Education':'Income' label slice takes every column positioned between
# those two labels in the column-sorted frame (HaveChild, read below, comes
# from this slice); the remaining predictors are listed by name.
predictor_columns = (
    sorted_customerData.loc[:, 'Education':'Income'].columns.tolist()
    + ['Marital_Status', 'NumWebVisitsMonth', 'TotalChild', 'YearRange']
)
predictors = sorted_customerData[predictor_columns]

# Dummy-encode each categorical predictor; drop_first=True leaves out the
# first level of every variable.
categorical_columns = ['Education', 'Marital_Status', 'HaveChild', 'YearRange']
education, marital_status, have_child, year_range = (
    pd.get_dummies(predictors[column], drop_first=True)
    for column in categorical_columns
)

# Swap the raw categorical columns for their dummy-encoded counterparts.
predictors = pd.concat(
    [
        predictors.drop(columns=categorical_columns),
        education,
        marital_status,
        have_child,
        year_range,
    ],
    axis="columns",
)

# Responses: the 'MntGoldProds':'MntGroceryProducts' label slice plus two
# columns picked by name.
response_columns = (
    sorted_customerData.loc[:, 'MntGoldProds':'MntGroceryProducts'].columns.tolist()
    + ['MntWines', 'TotalPurchase']
)
responses = sorted_customerData[response_columns]
responses.head()

Unnamed: 0,MntGoldProds,MntGroceryProducts,MntWines,TotalPurchase
0,88,806,635,25
1,6,9,11,6
2,42,287,426,21
3,5,34,11,8
4,15,207,173,19


In [35]:
# Split predictors and responses into train/test partitions; normal_splitting
# prints both shapes and returns the pieces in the order unpacked here.
X_train, Y_train, X_test, Y_test, train_data, test_data = normal_splitting(predictors, responses)

Train Set : (1732, 15) (1732, 4)
Test Set  : (434, 15) (434, 4)


In [37]:
# Save the combined (predictors + responses) train and test frames as CSV files.
train_data.to_csv('dataset/new_train_data.csv')
test_data.to_csv('dataset/new_test_data.csv')

In [31]:
# Save the training predictors and training responses as separate CSV files.
X_train.to_csv('dataset/X_train_lr.csv')
Y_train.to_csv('dataset/Y_train_lr.csv')

In [32]:
# Save the test predictors and test responses as separate CSV files.
X_test.to_csv('dataset/X_test_lr.csv')
Y_test.to_csv('dataset/Y_test_lr.csv')

## K-Fold Cross Validation (link to LR notebook)

In [22]:
# List the scorer names scikit-learn accepts for a `scoring=` argument.
# `SCORERS` is deprecated and no longer importable from newer scikit-learn
# releases, so prefer `get_scorer_names()` and fall back to `SCORERS` only
# on installs that do not provide it yet.
try:
    from sklearn.metrics import get_scorer_names
except ImportError:
    from sklearn.metrics import SCORERS
    get_scorer_names = SCORERS.keys

sorted(get_scorer_names())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we

In [26]:
# Preview the first rows of the test frame.
# NOTE(review): the output stored under this cell shows raw, un-encoded columns
# (e.g. Education as text), which does not match the test_data produced by
# normal_splitting from the dummy-encoded predictors — looks stale; re-run to confirm.
test_data.head()

Unnamed: 0,Year_Birth,Education,Marital_Status,Complain,NumWebVisitsMonth,YearRange,HaveChild,AcceptedCmp,Income,Kidhome,...,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,MntGroceryProducts,TotalPurchase,TotalChild
1751,1959,Graduation,Married,0,7,1950,1,0,71232.0,0,...,34,26,17,2,11,2,10,221,25,1
1752,1971,Graduation,Married,0,8,1970,1,1,34600.0,1,...,8,3,15,5,5,2,5,101,17,2
1753,1967,Graduation,Single,0,8,1960,1,0,46904.0,1,...,0,9,31,4,5,1,4,60,14,2
1754,1973,Graduation,Married,0,6,1970,1,0,49094.0,0,...,11,8,69,5,6,3,6,49,20,1
1755,1983,Graduation,Married,0,6,1980,1,0,36075.0,1,...,12,1,30,1,2,0,4,42,7,1


In [27]:
# NOTE(review): writes train_data to the same path as an earlier export cell in
# this notebook — confirm whether this repeat is still needed.
train_data.to_csv('dataset/new_train_data.csv')

In [28]:
# NOTE(review): writes test_data to the same path as an earlier export cell in
# this notebook — confirm whether this repeat is still needed.
test_data.to_csv('dataset/new_test_data.csv')

In [62]:
# # from sklearn.model_selection import KFold

# X = pd.DataFrame(train_data[['Year_Birth', 'Education', 'Marital_Status', 'Complain', 
#                                  'NumWebVisitsMonth', 'YearRange', 'HaveChild', 'AcceptedCmp']])
# Y = pd.DataFrame(train_data[['Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
#                                   'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 
#                                   'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
#                                  'MntGroceryProducts', 'TotalPurchase', 'TotalChild']])

# kf = KFold(n_splits=2)

# for train_index, test_index in kf.split(X, Y):
# #     print("TRAIN:", train_index) 
# #     print("TEST:", test_index)
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    
# train_data = pd.concat([X_train, Y_train])
# cv_data = pd.concat([X_test, Y_test])
# train_data.info()

In [12]:
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error

# # Extract Response and Predictors
# # Y = pd.DataFrame(train_data['MntGroceryProducts']) # Response
# # X = pd.DataFrame(train_data['Income']) # Predictor

# Y = pd.DataFrame(train_data[['MntWines', 'MntFruits','MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 
#                                'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 
#                                'NumStorePurchases','MntGroceryProducts', 'TotalPurchase']]) # Response
# X = pd.DataFrame(train_data[['Year_Birth', 'Education', 'Marital_Status', 'Complain', 
#                                  'NumWebVisitsMonth', 'YearRange', 'HaveChild', 'AcceptedCmp',
#                               'Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'TotalChild']]) # Predictor

# # X, Y = train_data.iloc[:, :-1], train_data.iloc[:, -1]

# # Split the Dataset into random Train and Test
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

# # Check the sample sizes
# print("Train Set :", X_train.shape, Y_train.shape)
# print("Test Set  :", X_test.shape, Y_test.shape)

# X_train.info()

Train Set : (1400, 14) (1400, 12)
Test Set  : (351, 14) (351, 12)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1400 entries, 202 to 684
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year_Birth         1400 non-null   int64  
 1   Education          1400 non-null   object 
 2   Marital_Status     1400 non-null   object 
 3   Complain           1400 non-null   int64  
 4   NumWebVisitsMonth  1400 non-null   int64  
 5   YearRange          1400 non-null   int64  
 6   HaveChild          1400 non-null   int64  
 7   AcceptedCmp        1400 non-null   int64  
 8   Income             1400 non-null   float64
 9   Kidhome            1400 non-null   int64  
 10  Teenhome           1400 non-null   int64  
 11  Dt_Customer        1400 non-null   object 
 12  Recency            1400 non-null   int64  
 13  TotalChild         1400 non-null   int64  
dtypes: float64(1), int64(10), object(3)
memory usage: 164

In [11]:
# Preview the first rows of the training predictors.
# NOTE(review): the stored output lists raw columns (including Dt_Customer) with a
# shuffled index, unlike the unshuffled, dummy-encoded split made by normal_splitting —
# presumably left over from the commented-out split experiment; re-run to confirm.
X_train.head()

Unnamed: 0,Year_Birth,Education,Marital_Status,Complain,NumWebVisitsMonth,YearRange,HaveChild,AcceptedCmp,Income,Kidhome,Teenhome,Dt_Customer,Recency,TotalChild
202,1956,Graduation,Married,0,4,1950,1,0,46097.0,0,1,31-03-2013,11,1
1287,1989,2n Cycle,Married,0,4,1980,1,0,59060.0,1,0,11-01-2014,77,1
198,1988,Graduation,Married,0,9,1980,1,1,29604.0,1,0,08-12-2013,88,1
1097,1973,Master,Divorced,0,7,1970,1,0,52034.0,1,1,17-05-2013,67,2
1671,1989,Graduation,Single,0,7,1980,1,0,18358.0,1,0,20-11-2013,49,1


In [13]:
# Summarise dtypes and non-null counts of the test predictors.
# NOTE(review): the stored output reports 351 rows and raw object columns, while the
# current split prints a 434-row test set — looks stale; re-run to confirm.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 351 entries, 871 to 133
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year_Birth         351 non-null    int64  
 1   Education          351 non-null    object 
 2   Marital_Status     351 non-null    object 
 3   Complain           351 non-null    int64  
 4   NumWebVisitsMonth  351 non-null    int64  
 5   YearRange          351 non-null    int64  
 6   HaveChild          351 non-null    int64  
 7   AcceptedCmp        351 non-null    int64  
 8   Income             351 non-null    float64
 9   Kidhome            351 non-null    int64  
 10  Teenhome           351 non-null    int64  
 11  Dt_Customer        351 non-null    object 
 12  Recency            351 non-null    int64  
 13  TotalChild         351 non-null    int64  
dtypes: float64(1), int64(10), object(3)
memory usage: 41.1+ KB


In [50]:
# Summarise dtypes and non-null counts of the training responses.
# NOTE(review): the stored output reports 1750 rows with only 876 non-null values in
# each of 12 columns, while the current split prints a (1732, 4) train response set —
# looks stale; re-run to confirm.
Y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1750 entries, 542 to 52
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   MntWines             876 non-null    float64
 1   MntFruits            876 non-null    float64
 2   MntMeatProducts      876 non-null    float64
 3   MntFishProducts      876 non-null    float64
 4   MntSweetProducts     876 non-null    float64
 5   MntGoldProds         876 non-null    float64
 6   NumDealsPurchases    876 non-null    float64
 7   NumWebPurchases      876 non-null    float64
 8   NumCatalogPurchases  876 non-null    float64
 9   NumStorePurchases    876 non-null    float64
 10  MntGroceryProducts   876 non-null    float64
 11  TotalPurchase        876 non-null    float64
dtypes: float64(12)
memory usage: 177.7 KB


In [51]:
# Summarise dtypes and non-null counts of the test responses.
# NOTE(review): the stored output reports 438 rows with only 218 non-null values in
# each of 12 columns, while the current split prints a (434, 4) test response set —
# looks stale; re-run to confirm.
Y_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 438 entries, 45 to 377
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   MntWines             218 non-null    float64
 1   MntFruits            218 non-null    float64
 2   MntMeatProducts      218 non-null    float64
 3   MntFishProducts      218 non-null    float64
 4   MntSweetProducts     218 non-null    float64
 5   MntGoldProds         218 non-null    float64
 6   NumDealsPurchases    218 non-null    float64
 7   NumWebPurchases      218 non-null    float64
 8   NumCatalogPurchases  218 non-null    float64
 9   NumStorePurchases    218 non-null    float64
 10  MntGroceryProducts   218 non-null    float64
 11  TotalPurchase        218 non-null    float64
dtypes: float64(12)
memory usage: 44.5 KB
