## 4.0 Capstone Project Two: Allstate Purchase Prediction - Pre-processing and Training Data Development<a id='2_Exploratory_Data_Analysis'></a>
**Submitted By:** Amit Kukreja

## Objectives<a id='2.2_EDA_Objectives'></a>

1) Split the data into training and test sets, save them as separate csv files, and use only the training data going forward.

2) Make a data frame with one row per customer in the training set, that includes the data from the first two shopping points, and then the final shopping point as the target.

3) Try to write a function (or, even better, a class) that can take in any number of shopping points and build a dataframe with that number of shopping points.

4) Create dummy or indicator features for categorical variables.

5) Standardize the magnitude of numeric features using a scaler. 




In [40]:
from pathlib import Path

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from pandas_profiling.utils.cache import cache_file
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sb_utils import save_file
from collections import defaultdict
from collections import Counter
from sklearn.model_selection import train_test_split


In [6]:
# Load the wide dataframe created in the data wrangling stage
wide_csv_path = Path("WIP_data") / "df_horizontal_expand_ver3.csv"
df_wide = pd.read_csv(wide_csv_path)

df_wide.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,C_previous_13,duration_previous_13,A_13,B_13,C_13,D_13,E_13,F_13,G_13,cost_13
0,10000000,9,1,0,12:07,IN,10001,2,0,2,...,,,,,,,,,,
1,10000005,6,1,3,09:09,NY,10006,1,0,10,...,,,,,,,,,,
2,10000007,8,1,4,14:26,PA,10008,1,0,11,...,,,,,,,,,,
3,10000013,4,1,4,09:31,WV,10014,2,1,3,...,,,,,,,,,,
4,10000014,6,1,1,17:50,MO,10015,1,0,5,...,,,,,,,,,,


In [7]:
# Hold out 20% of the observations as the test set.
# The split is stratified on the shopping_pt column so that the training and test sets contain a
# similar proportion of every shopping point. Stratifying matters here because some shopping
# points (e.g. 12 & 13) have very few datapoints.
df_train, df_test = train_test_split(
    df_wide,
    test_size=0.2,
    random_state=123,
    stratify=df_wide['shopping_pt'],
)


In [8]:
# Row/column counts of the two splits
df_train.shape, df_test.shape

((77607, 259), (19402, 259))

In [9]:
# Let's check proportion of different shopping pts in train dataset
count_vals_tr = pd.DataFrame(df_train['shopping_pt'].value_counts().sort_values(ascending=False)).reset_index()
count_vals_tr.columns = ['shopping_pt', '#']
count_vals_tr['%'] = np.round(count_vals_tr['#'] * 100 / np.sum(count_vals_tr['#']),2)
count_vals_tr

Unnamed: 0,shopping_pt,#,%
0,7,14872,19.16
1,8,13798,17.78
2,6,12498,16.1
3,9,9588,12.35
4,5,9015,11.62
5,4,6401,8.25
6,10,4857,6.26
7,3,4455,5.74
8,11,1703,2.19
9,12,380,0.49


In [10]:
# Let's check proportion of different shopping pts in test dataset
count_vals_te = pd.DataFrame(df_test['shopping_pt'].value_counts().sort_values(ascending=False)).reset_index()
count_vals_te.columns = ['shopping_pt', '#']
count_vals_te['%'] = np.round(count_vals_te['#'] * 100 / np.sum(count_vals_te['#']),2)
count_vals_te


Unnamed: 0,shopping_pt,#,%
0,7,3718,19.16
1,8,3450,17.78
2,6,3125,16.11
3,9,2397,12.35
4,5,2254,11.62
5,4,1600,8.25
6,10,1214,6.26
7,3,1113,5.74
8,11,426,2.2
9,12,95,0.49


The proportions of the different shopping points are essentially the same in the training and test datasets (they differ by at most 0.01 percentage points).
Now let's save both datasets and use only the training dataset for the next stage.


In [11]:
# Persist both splits as csv files in the work-in-progress folder.
# NOTE: as the recorded output shows, save_file asks for interactive confirmation (Y/N) when the
# target file already exists, so this cell pauses for input on re-runs.
datapath = "WIP_data"

save_file(df_train, 'training_data.csv', datapath)
save_file(df_test, 'test_data.csv', datapath)


A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "WIP_data\training_data.csv"
A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "WIP_data\test_data.csv"


In [12]:
# From here on only the training data is used.
# NOTE: this rebinds `df_wide`, which previously held the full (train + test) table.
training_csv_path = Path("WIP_data") / "training_data.csv"
df_wide = pd.read_csv(training_csv_path)

df_wide.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,C_previous_13,duration_previous_13,A_13,B_13,C_13,D_13,E_13,F_13,G_13,cost_13
0,10109793,9,1,4,14:44,CO,13320,1,1,3,...,,,,,,,,,,
1,10002231,8,1,5,13:36,OH,10601,1,0,13,...,,,,,,,,,,
2,10150024,8,1,4,15:33,OH,10081,3,1,20,...,,,,,,,,,,
3,10003949,8,1,4,09:13,FL,10302,2,1,7,...,,,,,,,,,,
4,10103809,10,1,2,11:06,FL,14844,1,1,9,...,,,,,,,,,,


In [13]:
# Shape of the reloaded training table
df_wide.shape

(77607, 259)

In [14]:
# The training data has the entire shopping history of the customer.
# We want to predict the final product vectors (target) based on the early shopping history of the
# customer, so we define a class that holds the entire shopping history and can extract any set of
# shopping points we want.

class QuoteHistory:
    """Container for the wide quote table with helpers to extract quote histories.

    The wrapped dataframe is expected to hold one row per customer with:

    * ``customer_ID``, ``shopping_pt``, ``state`` and the un-suffixed columns
      ``A``..``G`` and ``cost`` (returned as-is alongside the history), and
    * per-quote columns named ``<feature>_<n>`` (e.g. ``A_1``, ``cost_2``,
      ``car_age_2``) for quote number ``n``.
    """

    # Customer attributes: only the values at the latest requested quote are kept.
    CUSTOMER_FEATURES = ['group_size', 'homeowner', 'car_age', 'car_value', 'risk_factor',
                         'age_oldest', 'age_youngest', 'married_couple', 'C_previous',
                         'duration_previous']
    # Product options and cost: kept for every requested quote.
    QUOTE_FEATURES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'cost']
    # Columns copied unchanged from the wrapped dataframe.
    BASE_COLS = ['customer_ID', 'shopping_pt', 'state']

    def __init__(self, dataframe=None):
        """Optionally attach the dataframe at construction time (``pass_data`` still works)."""
        self.data = dataframe

    def pass_data(self, dataframe):
        """Attach the customer data covering all shopping points."""
        self.data = dataframe

    def get_history(self, how='first2', quote_nos=None):
        """Return a dataframe holding the requested quotes for each customer.

        Parameters
        ----------
        how : str
            ``'firstN'`` (``'first2'``, ``'first3'``, ...) selects quotes 1..N;
            ``'specific'`` selects the quotes listed in ``quote_nos``;
            ``'last2'`` selects, per customer, the two quotes preceding ``shopping_pt``.
        quote_nos : sequence of int, optional
            Quote numbers to extract; required (non-empty) when ``how='specific'``.

        Returns
        -------
        pandas.DataFrame
            For ``'firstN'``/``'specific'``: the rows whose ``shopping_pt`` is greater than the
            largest requested quote, with the base and un-suffixed product columns, the customer
            features at the largest requested quote, and the product features of every requested
            quote (index reset).
            For ``'last2'``: a copy of the first 25 columns plus ``<feature>_2nd_last`` and
            ``<feature>_last`` for every product feature.

        Raises
        ------
        ValueError
            If no data is attached, ``how`` is not recognised, or ``how='specific'`` is used
            without quote numbers.
        KeyError
            If a required column is absent from the data.
        """
        if self.data is None:
            raise ValueError("No data loaded: call pass_data(dataframe) before get_history().")

        if how == 'specific':
            if quote_nos is None or len(quote_nos) == 0:
                raise ValueError("how='specific' requires a non-empty quote_nos list.")
            return self._extract_quotes(quote_nos)

        if how == 'last2':
            return self._extract_last_two()

        # 'first2', 'first3', ... -> quotes 1..N
        if isinstance(how, str) and how.startswith('first') and how[5:].isdigit():
            n_quotes = int(how[5:])
            if n_quotes >= 1:
                return self._extract_quotes(range(1, n_quotes + 1))

        raise ValueError(
            f"Unknown how={how!r}; expected 'firstN' (e.g. 'first2'), 'specific' or 'last2'."
        )

    def _extract_quotes(self, quotes):
        """Select the columns for the given quote numbers, for rows finalised after them."""
        # De-duplicate while preserving order so no column is requested twice.
        quotes = list(dict.fromkeys(int(q) for q in quotes))
        last_quote = max(quotes)

        customer_cols = [f"{name}_{last_quote}" for name in self.CUSTOMER_FEATURES]
        vector_cols = [f"{name}_{q}" for name in self.QUOTE_FEATURES for q in quotes]
        wanted = self.BASE_COLS + self.QUOTE_FEATURES + customer_cols + vector_cols

        missing = [col for col in wanted if col not in self.data.columns]
        if missing:
            raise KeyError(f"Columns not found in data: {missing}")

        # Only keep customers whose recorded shopping point comes after every requested quote.
        # With one row per customer this equals the former self-merge on customer_ID,
        # without building the join.
        finalised_later = self.data['shopping_pt'] > last_quote
        return self.data.loc[finalised_later, wanted].reset_index(drop=True)

    def _extract_last_two(self):
        """Attach, per row, the product features of the two quotes preceding ``shopping_pt``.

        Per-quote values are looked up in ``<feature>_<n>`` columns, the naming used by the rest
        of this class. Where such a column does not exist (e.g. quote number 0) the value is NaN.
        """
        # Work on a copy so the wrapped dataframe is never modified.
        result = self.data.iloc[:, 0:25].copy()
        final_pt = self.data['shopping_pt'].to_numpy()

        for feature in self.QUOTE_FEATURES:
            for offset, label in ((2, '_2nd_last'), (1, '_last')):
                quote_no = final_pt - offset
                values = np.full(len(self.data), np.nan)
                # Fill group-wise: one vectorised copy per distinct quote number.
                for number in np.unique(quote_no):
                    source_col = f"{feature}_{int(number)}"
                    if source_col in self.data.columns:
                        rows = quote_no == number
                        values[rows] = self.data[source_col].to_numpy()[rows]
                result[feature + label] = values

        return result



In [15]:
# Wrap the training table in a QuoteHistory helper.
# NOTE: despite its `df_` prefix, `df_2` is a QuoteHistory instance, not a DataFrame.
df_2 = QuoteHistory()
df_2.pass_data(df_wide)

# Extract the first two quotes together with the un-suffixed product columns
df_hist_first2 = df_2.get_history(how='first2')
df_hist_first2.head()

Unnamed: 0,customer_ID,shopping_pt,state,A,B,C,D,E,F,G,...,D_1,D_2,E_1,E_2,F_1,F_2,G_1,G_2,cost_1,cost_2
0,10109793,9,CO,1,1,3,3,0,2,1,...,3,3,0,0,2,2,1,1,656,656
1,10002231,8,OH,0,0,1,3,0,0,3,...,1,3,0,0,2,0,3,3,598,557
2,10150024,8,OH,1,1,2,3,0,2,3,...,3,3,0,0,3,3,3,3,617,617
3,10003949,8,FL,1,1,2,2,1,2,3,...,2,2,1,1,2,2,4,3,647,675
4,10103809,10,FL,1,1,1,3,1,1,3,...,3,3,1,1,1,1,4,3,637,617


In [16]:
# List the columns of the first-two-quotes dataframe
df_hist_first2.columns

Index(['customer_ID', 'shopping_pt', 'state', 'A', 'B', 'C', 'D', 'E', 'F',
       'G', 'cost', 'group_size_2', 'homeowner_2', 'car_age_2', 'car_value_2',
       'risk_factor_2', 'age_oldest_2', 'age_youngest_2', 'married_couple_2',
       'C_previous_2', 'duration_previous_2', 'A_1', 'A_2', 'B_1', 'B_2',
       'C_1', 'C_2', 'D_1', 'D_2', 'E_1', 'E_2', 'F_1', 'F_2', 'G_1', 'G_2',
       'cost_1', 'cost_2'],
      dtype='object')

In [17]:
# Save the first-two-quotes dataframe to the work-in-progress folder
datapath = "WIP_data"

save_file(df_hist_first2, 'training_data_with_first2_quotes.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "WIP_data\training_data_with_first2_quotes.csv"


In [18]:
# Extract an arbitrary set of shopping points (quotes 2, 3 and 4) using the 'specific' mode
df_hist_2_3_4 = df_2.get_history(how='specific', quote_nos=[2,3,4])

df_hist_2_3_4.head()


Unnamed: 0,customer_ID,shopping_pt,state,A,B,C,D,E,F,G,...,E_4,F_2,F_3,F_4,G_2,G_3,G_4,cost_2,cost_3,cost_4
0,10109793,9,CO,1,1,3,3,0,2,1,...,0.0,2,2,2.0,1,1,1.0,656,656,656.0
1,10002231,8,OH,0,0,1,3,0,0,3,...,0.0,0,0,0.0,3,3,3.0,557,564,564.0
2,10150024,8,OH,1,1,2,3,0,2,3,...,0.0,3,3,2.0,3,3,2.0,617,617,624.0
3,10003949,8,FL,1,1,2,2,1,2,3,...,1.0,2,2,2.0,3,3,3.0,675,675,675.0
4,10103809,10,FL,1,1,1,3,1,1,3,...,1.0,1,1,1.0,3,3,3.0,617,617,617.0


In [19]:
# List the columns of the quotes-2-3-4 dataframe
df_hist_2_3_4.columns

Index(['customer_ID', 'shopping_pt', 'state', 'A', 'B', 'C', 'D', 'E', 'F',
       'G', 'cost', 'group_size_4', 'homeowner_4', 'car_age_4', 'car_value_4',
       'risk_factor_4', 'age_oldest_4', 'age_youngest_4', 'married_couple_4',
       'C_previous_4', 'duration_previous_4', 'A_2', 'A_3', 'A_4', 'B_2',
       'B_3', 'B_4', 'C_2', 'C_3', 'C_4', 'D_2', 'D_3', 'D_4', 'E_2', 'E_3',
       'E_4', 'F_2', 'F_3', 'F_4', 'G_2', 'G_3', 'G_4', 'cost_2', 'cost_3',
       'cost_4'],
      dtype='object')

In [20]:
# Save the quotes-2-3-4 dataframe to the work-in-progress folder
datapath = "WIP_data"

save_file(df_hist_2_3_4, 'training_data_with_quotes_2_3_4.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "WIP_data\training_data_with_quotes_2_3_4.csv"


In [29]:
# Now let's look at the column info for the training data with the 1st and 2nd shopping quotes

df_train_first2 = pd.read_csv('WIP_data/training_data_with_first2_quotes.csv')
df_train_first2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77607 entries, 0 to 77606
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customer_ID          77607 non-null  int64  
 1   shopping_pt          77607 non-null  int64  
 2   state                77607 non-null  object 
 3   A                    77607 non-null  int64  
 4   B                    77607 non-null  int64  
 5   C                    77607 non-null  int64  
 6   D                    77607 non-null  int64  
 7   E                    77607 non-null  int64  
 8   F                    77607 non-null  int64  
 9   G                    77607 non-null  int64  
 10  cost                 77607 non-null  int64  
 11  group_size_2         77607 non-null  int64  
 12  homeowner_2          77607 non-null  int64  
 13  car_age_2            77607 non-null  int64  
 14  car_value_2          77607 non-null  int64  
 15  risk_factor_2        77607 non-null 

In [30]:
# Summary statistics, transposed so that each feature is a row
df_train_first2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_ID,77607.0,10076590.0,44051.955238,10000005.0,10038508.0,10076430.0,10114818.5,10152723.0
shopping_pt,77607.0,6.857564,1.998387,3.0,5.0,7.0,8.0,13.0
A,77607.0,0.9449534,0.618763,0.0,1.0,1.0,1.0,2.0
B,77607.0,0.4768771,0.499468,0.0,0.0,0.0,1.0,1.0
C,77607.0,2.294162,1.002481,1.0,1.0,2.0,3.0,4.0
D,77607.0,2.517415,0.713359,1.0,2.0,3.0,3.0,3.0
E,77607.0,0.4621233,0.498567,0.0,0.0,0.0,1.0,1.0
F,77607.0,1.166712,0.947975,0.0,0.0,1.0,2.0,3.0
G,77607.0,2.279923,0.886789,1.0,2.0,2.0,3.0,4.0
cost,77607.0,634.4441,43.013336,288.0,605.0,634.0,663.0,839.0


We see that all the product vector columns are stored as integers. These features are categorical, since the customer chooses among discrete options for each product vector, so they must be converted to a categorical (i.e. object) dtype.
Similarly, certain customer-information features take discrete values but are stored as integers. These are:

 - group_size_2: Takes values (1,2,3,4)           
 
 - homeowner_2: Takes values (0,1)
 
 - car_value_2: Takes values (1 to 9)
 
 - risk_factor_2: Takes values (0,1,2,3,4)
 
 - married_couple_2: Takes values (0,1)
 
 - C_previous_2: Takes values (1,2,3,4)
 
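 - shopping_pt : Takes values (1 to 13 in the raw data; 3 to 13 in this dataframe, which only keeps rows whose shopping point comes after quote 2)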
 
 The above features along with product vectors need to be converted to categorical.
 
 The features that take continuous values are cost, cost_1, cost_2, car_age_2, age_oldest_2, age_youngest_2, and duration_previous_2. Only duration_previous_2 is stored as float; the rest are stored as integers. We shall convert the others to float as well.
 

In [34]:

# Continuous features; these are cast to float
num_features = ['cost', 'cost_1', 'cost_2', 'car_age_2', 'age_oldest_2', 'age_youngest_2', 'duration_previous_2']

# Every remaining column is treated as categorical. The list is derived from the dataframe that
# is actually being converted (df_train_first2) rather than from df_hist_first2, so this cell does
# not depend on a different dataframe happening to have identical columns.
catg_features = [x for x in df_train_first2.columns if x not in num_features]

df_train_first2[catg_features] = df_train_first2[catg_features].astype('object')
df_train_first2[num_features] = df_train_first2[num_features].astype('float')


In [35]:
# Check the column dtypes after the conversion
df_train_first2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77607 entries, 0 to 77606
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customer_ID          77607 non-null  object 
 1   shopping_pt          77607 non-null  object 
 2   state                77607 non-null  object 
 3   A                    77607 non-null  object 
 4   B                    77607 non-null  object 
 5   C                    77607 non-null  object 
 6   D                    77607 non-null  object 
 7   E                    77607 non-null  object 
 8   F                    77607 non-null  object 
 9   G                    77607 non-null  object 
 10  cost                 77607 non-null  float64
 11  group_size_2         77607 non-null  object 
 12  homeowner_2          77607 non-null  object 
 13  car_age_2            77607 non-null  float64
 14  car_value_2          77607 non-null  object 
 15  risk_factor_2        77607 non-null 

In [37]:
# Build the modelling inputs: the product options A-G are the targets (y_train); every other
# column except the customer_ID identifier becomes a feature (X_train).
# NOTE(review): `cost` and `shopping_pt` are taken from the same record as the targets (see
# QuoteHistory.get_history) and remain in X_train -- confirm they would be known at prediction
# time, otherwise they leak information about the outcome.
target_cols = ['A','B','C','D','E','F','G']

X_train = df_train_first2.drop(columns=['customer_ID'] + target_cols)
y_train = df_train_first2[target_cols]



In [38]:
# Shapes of the feature and target dataframes
X_train.shape, y_train.shape

((77607, 29), (77607, 7))

In [42]:
# One-hot encode the categorical (object) columns, dropping the first level of each.
# The indicator dtype is pinned to uint8 so the dummies are 0/1 integers on every pandas version
# (pandas 2.0 changed the default indicator dtype to bool, which would write True/False to the
# csv and drop the dummies from describe()).
X_train = pd.get_dummies(X_train, drop_first=True, dtype='uint8')
X_train.shape

(77607, 102)

In [54]:
# Summary statistics of the encoded feature matrix, one feature per row
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cost,77607.0,634.444097,43.013336,288.0,605.0,634.0,663.0,839.0
car_age_2,77607.0,8.223060,5.794034,0.0,4.0,8.0,12.0,85.0
age_oldest_2,77607.0,45.065857,17.422855,18.0,29.0,44.0,60.0,75.0
age_youngest_2,77607.0,42.744649,17.469472,16.0,27.0,40.0,57.0,75.0
duration_previous_2,77607.0,5.971001,4.670398,0.0,2.0,5.0,9.0,15.0
...,...,...,...,...,...,...,...,...
G_1_3,77607.0,0.213125,0.409518,0.0,0.0,0.0,0.0,1.0
G_1_4,77607.0,0.199093,0.399321,0.0,0.0,0.0,0.0,1.0
G_2_2,77607.0,0.423441,0.494107,0.0,0.0,0.0,1.0,1.0
G_2_3,77607.0,0.268752,0.443313,0.0,0.0,0.0,1.0,1.0


In [55]:
# The categorical columns were dummy-encoded in the previous step; next we standardize the
# numeric columns. The scaler is fitted on the training features and kept in `scaler_train_data`.
scaler_train_data = StandardScaler()
X_train[num_features] = scaler_train_data.fit_transform(X_train[num_features])



In [56]:
# Preview the scaled and encoded features
X_train.head()

Unnamed: 0,cost,car_age_2,age_oldest_2,age_youngest_2,duration_previous_2,cost_1,cost_2,shopping_pt_4,shopping_pt_5,shopping_pt_6,...,F_1_3,F_2_1,F_2_2,F_2_3,G_1_2,G_1_3,G_1_4,G_2_2,G_2_3,G_2_4
0,-0.568295,-0.901461,-0.00378,0.129103,-0.636138,0.464913,0.467906,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,-1.916722,0.824464,-1.209101,-1.073002,0.862673,-0.677346,-1.614463,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,-0.405554,2.032611,-0.061176,-1.359217,-0.636138,-0.303158,-0.352421,0,0,0,...,1,0,0,1,0,1,0,0,1,0
3,0.942874,-0.211091,-0.692535,-0.615057,-0.636138,0.287666,0.867553,0,0,0,...,0,0,1,0,0,0,1,0,1,0
4,-0.405554,0.134094,-1.036912,-0.901272,-0.422023,0.090725,-0.352421,0,0,0,...,0,1,0,0,0,0,1,0,1,0


In [57]:
# We can now save the scaled and encoded training features as well as the targets into separate files.
# Neither file contains customer_ID (it was dropped from X_train and is not part of y_train).

datapath2 = "Transformed_data"

save_file(X_train, 'pre_processed_training_data_with_quotes_1_2.csv', datapath2)
save_file(y_train, 'training_data_target_columns.csv', datapath2)


Directory Transformed_data was created.
Writing file.  "Transformed_data\pre_processed_training_data_with_quotes_1_2.csv"
Writing file.  "Transformed_data\training_data_target_columns.csv"
