# Data Splitting

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
# Activate seaborn's default plot styling for every figure drawn in this notebook.
sb.set()

# Importing & Sorting Data 

In [2]:
# Load the cleaned customer dataset.
# A forward slash is used as the path separator: it resolves on every OS, whereas
# a backslash only works on Windows and '\c' is an invalid string escape sequence.
customerData = pd.read_csv('dataset/customerData_Clean.csv')

In [3]:
# Preview the first rows to confirm the file loaded with the expected columns.
customerData.head()

Unnamed: 0.1,Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,...,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,MntGroceryProducts,TotalPurchase,YearRange,TotalChild,HaveChild
0,0,1957,Graduation,Single,58138.0,0,0,58,635,88,...,3,8,10,4,7,806,25,1950,0,0
1,1,1954,Graduation,Single,46344.0,1,1,38,11,1,...,2,1,1,2,5,9,6,1950,2,1
2,2,1965,Graduation,Married,71613.0,0,0,26,426,49,...,1,8,2,10,4,287,21,1960,0,0
3,3,1984,Graduation,Married,26646.0,1,0,26,11,4,...,2,2,0,4,6,34,8,1980,1,1
4,4,1981,PhD,Married,58293.0,1,0,94,173,43,...,5,5,3,6,5,207,19,1980,1,1


In [4]:
# Summarise column dtypes and non-null counts to check for missing values.
customerData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166 entries, 0 to 2165
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           2166 non-null   int64  
 1   Year_Birth           2166 non-null   int64  
 2   Education            2166 non-null   object 
 3   Marital_Status       2166 non-null   object 
 4   Income               2166 non-null   float64
 5   Kidhome              2166 non-null   int64  
 6   Teenhome             2166 non-null   int64  
 7   Recency              2166 non-null   int64  
 8   MntWines             2166 non-null   int64  
 9   MntFruits            2166 non-null   int64  
 10  MntMeatProducts      2166 non-null   int64  
 11  MntFishProducts      2166 non-null   int64  
 12  MntSweetProducts     2166 non-null   int64  
 13  MntGoldProds         2166 non-null   int64  
 14  NumDealsPurchases    2166 non-null   int64  
 15  NumWebPurchases      2166 non-null   i

In [10]:
# Reorder the columns of the cleaned dataset by column label and store the
# result in its own CSV file so that later steps can reuse it.
sorted_customerData = customerData.sort_index(axis="columns")
sorted_customerData.to_csv(path_or_buf='dataset/sorted_customerData.csv')

# Splitting Data

## train_test_split

Variables in the dataset to be split:

1. __Responses (Y):__
    - MntGroceryProducts
    - TotalPurchase
    - MntWines
    - MntGoldProds


2. __Predictors (X):__
    - Categorical:
        - Education
        - Marital_Status
        - HaveChild
        - YearRange
    - Numerical:
        - Income
        - TotalChild
        - NumWebVisitsMonth

In [40]:
from sklearn.model_selection import train_test_split

def normal_splitting(X, Y, test_size=0.2, shuffle=False, random_state=100):
    """Split predictors and responses into train and test partitions.

    Thin wrapper around ``train_test_split`` that also prints the partition
    shapes and builds combined (predictors + responses) frames for export.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns, one row per observation.
    Y : pandas.DataFrame
        Response columns, row-aligned with ``X``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    shuffle : bool, default False
        Forwarded to ``train_test_split``. With the default ``False`` the rows
        are not shuffled before splitting, so the original row order is kept.
    random_state : int or None, default 100
        Forwarded to ``train_test_split``. It only influences the result when
        ``shuffle`` is True; with ``shuffle=False`` it has no effect.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test, train_data, test_data)``.
        Note that this order differs from the one ``train_test_split`` itself
        returns. ``train_data`` / ``test_data`` are the column-wise
        concatenation of the matching X and Y partitions.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, shuffle=shuffle, random_state=random_state
    )

    # Report the sample sizes so the split can be sanity-checked at a glance.
    print("Train Set :", X_train.shape, Y_train.shape)
    print("Test Set  :", X_test.shape, Y_test.shape)

    # Side-by-side frames (predictors followed by responses) for exporting.
    train_data = pd.concat([X_train, Y_train], axis=1)
    test_data = pd.concat([X_test, Y_test], axis=1)

    return X_train, Y_train, X_test, Y_test, train_data, test_data

In [41]:
# Columns are selected by explicit name (rather than by label slices such as
# 'Education':'Income') so the selection does not silently depend on how the
# columns of `sorted_customerData` happen to be ordered.
predictor_columns = [
    'Education', 'HaveChild', 'Income', 'Marital_Status',
    'NumWebVisitsMonth', 'TotalChild', 'YearRange',
]
response_columns = ['MntGoldProds', 'MntGroceryProducts', 'MntWines', 'TotalPurchase']

predictors = sorted_customerData[predictor_columns]
responses = sorted_customerData[response_columns]

# The categorical predictors are left unencoded for this split.
X_train, Y_train, X_test, Y_test, train_data_pr, test_data_pr = normal_splitting(predictors, responses)

Train Set : (1732, 7) (1732, 4)
Test Set  : (434, 7) (434, 4)


In [None]:
# Write the combined (predictors + responses) train and test partitions to disk.
train_data_pr.to_csv(path_or_buf='dataset/Poisson_Regression/train_data_pr.csv')
test_data_pr.to_csv(path_or_buf='dataset/Poisson_Regression/test_data_pr.csv')

In [18]:
# Extract the predictors for splitting. Columns are listed explicitly (rather
# than via the label slice 'Education':'Income') so the selection does not
# depend on the column order of `sorted_customerData`.
predictors = sorted_customerData[
    ['Education', 'HaveChild', 'Income', 'Marital_Status',
     'NumWebVisitsMonth', 'TotalChild', 'YearRange']]

# Encode the categorical predictor variables using get_dummies.
# drop_first=True drops the first level of each variable.
education = pd.get_dummies(predictors['Education'], drop_first=True)
marital_status = pd.get_dummies(predictors['Marital_Status'], drop_first=True)
have_child = pd.get_dummies(predictors['HaveChild'], drop_first=True)
year_range = pd.get_dummies(predictors['YearRange'], drop_first=True)

# Drop the unencoded categorical predictor variables ...
predictors = predictors.drop(['Education', 'Marital_Status', 'HaveChild', 'YearRange'], axis=1)

# ... and append their encoded counterparts in their place.
predictors = pd.concat([predictors, education, marital_status, have_child, year_range], axis=1)

# Extract the responses, again by explicit column name.
responses = sorted_customerData[
    ['MntGoldProds', 'MntGroceryProducts', 'MntWines', 'TotalPurchase']]

Unnamed: 0,MntGoldProds,MntGroceryProducts,MntWines,TotalPurchase
0,88,806,635,25
1,6,9,11,6
2,42,287,426,21
3,5,34,11,8
4,15,207,173,19


In [35]:
# Split the encoded predictors and the responses into train and test partitions.
(
    X_train, Y_train,
    X_test, Y_test,
    train_data, test_data,
) = normal_splitting(predictors, responses)

Train Set : (1732, 15) (1732, 4)
Test Set  : (434, 15) (434, 4)


# Exporting Split Data

In [44]:
# Export the combined train/test frames produced by the split of the encoded
# predictors. The split binds them to `train_data` / `test_data`; the names
# `train_data_lr` / `test_data_lr` are never defined in this notebook and
# raise a NameError on a fresh kernel.
train_data.to_csv('dataset/Linear_Regression/train_data_lr.csv')
test_data.to_csv('dataset/Linear_Regression/test_data_lr.csv')

In [31]:
# Save the training predictors and training responses as separate CSV files.
X_train.to_csv(path_or_buf='dataset/Linear_Regression/X_train_lr.csv')
Y_train.to_csv(path_or_buf='dataset/Linear_Regression/Y_train_lr.csv')

In [32]:
# Save the test predictors and test responses as separate CSV files.
X_test.to_csv(path_or_buf='dataset/Linear_Regression/X_test_lr.csv')
Y_test.to_csv(path_or_buf='dataset/Linear_Regression/Y_test_lr.csv')

## Cross Validation (link to LR notebook)

In [22]:
# List the names of every built-in scorer that can be passed as `scoring=`.
# `sklearn.metrics.SCORERS` was deprecated and then removed in scikit-learn 1.3;
# `get_scorer_names()` is its supported replacement. The fallback keeps the
# cell working on older releases that predate `get_scorer_names`.
try:
    from sklearn.metrics import get_scorer_names
except ImportError:
    from sklearn.metrics import SCORERS

    def get_scorer_names():
        """Return the names of all built-in scorers (fallback for old scikit-learn)."""
        return list(SCORERS.keys())

sorted(get_scorer_names())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we