# Preprocessing Part 2 - Cleaning customers.csv and Merging Feature Creation Fields

### Run on ml.t3.xlarge instance

In [1]:
%%capture
!pip install numpy
!pip install pandas
!pip install boto3
!pip install matplotlib
!pip install seaborn
!pip install datetime
!pip install awswrangler
!pip install sklearn

In [2]:
import numpy as np
import pandas as pd
import boto3
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import awswrangler as wr

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import OrdinalEncoder

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# Show floats with four decimal places in rendered frames, and turn off the
# chained-assignment warning for the rest of the notebook.
pd.set_option('display.float_format', '{:.4f}'.format)
pd.set_option('mode.chained_assignment', None)

### Loading Data via S3

In [3]:
# Fetch customers.csv from the raw-data bucket and parse the response body
# directly into a DataFrame.
s3 = boto3.client('s3')

bucket = 'ads-508-group-6-raw'
data_key = 'customers.csv'
customer_obj = s3.get_object(Key = data_key, Bucket = bucket)

customers_df = pd.read_csv(customer_obj['Body'])

### Loading Data Locally

In [4]:
#customers_df = pd.read_csv("..\\data\\customers.csv")

In [5]:
# Preview the first five customer rows.
customers_df.head(5)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


## Fill Null Values

In [6]:
# Count missing values per column before any imputation.
customers_df.isna().sum()

customer_id                    0
FN                        895050
Active                    907576
club_member_status          6062
fashion_news_frequency     16009
age                        15861
postal_code                    0
dtype: int64

In [7]:
# Constant-fill imputer: missing entries in whatever column it is fitted on
# are replaced with 0.
# NOTE(review): the following cell also applies this to club_member_status and
# fashion_news_frequency, whose visible values are strings ('ACTIVE', 'NONE',
# 'Regularly'); those columns end up mixing the integer 0 with strings.
# Confirm that is intended.
simpleimputer_cat = SimpleImputer(fill_value = 0, strategy = 'constant')

In [8]:
# Fit and apply the constant imputer to one column at a time; each call re-fits
# the shared imputer on just that column.
for column in ('FN', 'Active', 'club_member_status', 'fashion_news_frequency'):
    customers_df[column] = simpleimputer_cat.fit_transform(customers_df[[column]])

In [9]:
# Median-fill imputer for numeric columns.
simpleimputer_num = SimpleImputer(strategy="median")

In [10]:
# Replace missing ages with the median of the observed ages.
age_filled = simpleimputer_num.fit_transform(customers_df[['age']])
customers_df['age'] = age_filled

In [11]:
# Re-count missing values per column to confirm the imputation left none.
customers_df.isna().sum()

customer_id               0
FN                        0
Active                    0
club_member_status        0
fashion_news_frequency    0
age                       0
postal_code               0
dtype: int64

## Reducing Full Customers Table to Experiment Population

### Loading Data via S3

In [12]:
# Fetch quarter_transactions.csv from the processed bucket; its customer_id
# column is used below to restrict and enrich the customer table.
s3 = boto3.client('s3')

bucket = 'ads-508-group-6-processed'
data_key = 'quarter_transactions.csv'
customer_population_obj = s3.get_object(Key = data_key, Bucket = bucket)

df = pd.read_csv(customer_population_obj['Body'])

### Loading Data Locally

In [13]:
# df = pd.read_csv("..\\data\\quarter_transactions.csv")

In [14]:
# Preview the first five rows of the frame just loaded into `df`.
df.head(5)

Unnamed: 0,customer_id,quarter_articles_purchased,quarter_amount_spent,qmc_sales_channel,qmc_product_type,qmc_graphical_appearance,qmc_perceived_colour_value,qmc_perceived_colour_master,qmc_department,qmc_index,qmc_index_group,qmc_section,qmc_garment_group,quarter_average_amount_spent_per_article,quarter_average_days_between_purchase
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,5,0.0936,2,252,1010006,1,5,1636,A,1,15,1005,0.0187,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,19,0.4726,2,59,1010001,4,2,4242,B,1,60,1018,0.0249,0.0118
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,3,0.0694,2,59,1010016,4,5,4242,B,1,60,1018,0.0231,0.0
3,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,2,0.0534,1,254,1010001,4,5,1447,D,2,53,1003,0.0267,0.0
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,21,0.6582,2,272,1010016,4,5,1747,A,1,53,1005,0.0313,0.0093


In [15]:
# Keep only the customers whose customer_id appears in `df`.
in_population = customers_df['customer_id'].isin(df['customer_id'])
customers_df = customers_df.loc[in_population]

## Appending Feature Creation Fields

In [16]:
# Attach the feature columns held in `df` to each customer row (left join on
# customer_id, so every existing customer row is kept).
# validate='many_to_one' makes pandas raise a MergeError if `df` repeats a
# customer_id, instead of silently duplicating customer rows in the result.
customers_df = pd.merge(customers_df,
                        df,
                        how='left',
                        on=['customer_id'],
                        validate='many_to_one')

### Loading Data via S3

In [17]:
# Fetch historical_transactions.csv from the processed bucket into `df`,
# replacing the frame previously bound to that name.
s3 = boto3.client('s3')

bucket = 'ads-508-group-6-processed'
data_key = 'historical_transactions.csv'
customer_population_obj = s3.get_object(Key = data_key, Bucket = bucket)

df = pd.read_csv(customer_population_obj['Body'])

### Loading Data Locally

In [18]:
# df = pd.read_csv("..\\data\\historical_transactions.csv")

In [19]:
# Preview the first five rows of the frame just loaded into `df`.
df.head(5)

Unnamed: 0,customer_id,lifetime_articles_purchased,lieftime_amount_spent,lfmc_sales_channel,lfmc_product_type,lfmc_graphical_appearance,lfmc_perceived_colour_value,lfmc_perceived_colour_master,lfmc_department,lfmc_index,lfmc_index_group,lfmc_section,lfmc_garment_group,days_since_last_purchase,lifetime_average_amount_spent_per_article,lifetime_average_days_between_purchase
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,20,0.5982,2,264,1010016,4,5,1212,A,1,11,1005,10,0.0299,0.0444
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,71,2.2378,2,59,1010016,4,2,4242,B,1,60,1018,30,0.0315,0.1347
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,7,0.2151,2,59,1010016,4,5,4242,B,1,60,1017,57,0.0307,0.014
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,2,0.061,2,306,1010016,4,5,8316,S,26,5,1005,296,0.0305,0.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,6,0.2139,2,262,1010001,4,5,1201,A,1,11,1007,174,0.0356,0.0166


In [20]:
# Attach the feature columns held in `df` to each customer row (left join on
# customer_id, so every existing customer row is kept).
# validate='many_to_one' makes pandas raise a MergeError if `df` repeats a
# customer_id, instead of silently duplicating customer rows in the result.
customers_df = pd.merge(customers_df,
                        df,
                        how='left',
                        on=['customer_id'],
                        validate='many_to_one')

## Appending Target Labels

### Loading Data via S3

In [21]:
# Fetch target_labels.csv from the processed bucket into `df`, replacing the
# frame previously bound to that name.
s3 = boto3.client('s3')

bucket = 'ads-508-group-6-processed'
data_key = 'target_labels.csv'
labels_obj = s3.get_object(Key = data_key, Bucket = bucket)

df = pd.read_csv(labels_obj['Body'])

### Loading Data Locally

In [22]:
# df = pd.read_csv("..\\data\\target_labels.csv")

In [23]:
# Preview the first five rows of the frame just loaded into `df`.
df.head(5)

Unnamed: 0,customer_id,churn
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0
1,000563485cbb7850b0a93c6606f89c5b961c6647d1bd48...,0
2,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,0
3,000eae69313b4fc1824fa7e439f168cc140bf4c3f3a7e9...,0
4,000ec422ba5459f0295c1e86872d61b746bb8f84345efd...,0


In [24]:
# Attach the churn label held in `df` to each customer row. The left join keeps
# customers that have no row in `df`; their churn value is missing after the merge.
# validate='many_to_one' makes pandas raise a MergeError if `df` repeats a
# customer_id, instead of silently duplicating customer rows in the result.
customers_df = pd.merge(customers_df,
                        df,
                        how='left',
                        on=['customer_id'],
                        validate='many_to_one')

In [25]:
# Preview the first five rows of the merged customer table.
customers_df.head(5)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,quarter_articles_purchased,quarter_amount_spent,qmc_sales_channel,...,lfmc_perceived_colour_master,lfmc_department,lfmc_index,lfmc_index_group,lfmc_section,lfmc_garment_group,days_since_last_purchase,lifetime_average_amount_spent_per_article,lifetime_average_days_between_purchase,churn
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,5,0.0936,2,...,5,1212,A,1,11,1005,10,0.0299,0.0444,
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,19,0.4726,2,...,2,4242,B,1,60,1018,30,0.0315,0.1347,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,3,0.0694,2,...,5,4242,B,1,60,1017,57,0.0307,0.014,0.0
3,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,0.0,0.0,ACTIVE,NONE,20.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...,2,0.0534,1,...,5,1447,D,2,53,1003,86,0.0267,0.0,
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,1.0,1.0,ACTIVE,Regularly,56.0,b31984b20a8c478de38eaf113c581ff64e63c4242e607b...,21,0.6582,2,...,5,1722,A,1,15,1009,12,0.032,0.211,0.0


In [26]:
# Count missing values per column in the merged table.
customers_df.isna().sum()

customer_id                                       0
FN                                                0
Active                                            0
club_member_status                                0
fashion_news_frequency                            0
age                                               0
postal_code                                       0
quarter_articles_purchased                        0
quarter_amount_spent                              0
qmc_sales_channel                                 0
qmc_product_type                                  0
qmc_graphical_appearance                          0
qmc_perceived_colour_value                        0
qmc_perceived_colour_master                       0
qmc_department                                    0
qmc_index                                         0
qmc_index_group                                   0
qmc_section                                       0
qmc_garment_group                                 0
quarter_aver

In [27]:
# Customers with no churn value get 1.
# NOTE(review): presumably a missing label means the customer churned; confirm
# against how target_labels.csv is built.
customers_df = customers_df.assign(churn=customers_df['churn'].fillna(1))

In [28]:
# Remove the postal_code column from the table before it is saved.
customers_df = customers_df.drop('postal_code', axis='columns')

### Saving to S3

In [29]:
# Write the merged customer table to the processed bucket without the index.
# The call is the cell's last expression, so its return value is displayed.
processed_path = "s3://ads-508-group-6-processed/customers.csv"
wr.s3.to_csv(df=customers_df, path=processed_path, index=False)

{'paths': ['s3://ads-508-group-6-processed/customers.csv'],
 'partitions_values': {}}

### Saving Locally

In [30]:
# customers_df.to_csv("..\\data\\customers.csv", index = False)

### Shutting Down Kernel To Release Resources

In [31]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Programmatically click the hidden button above; its data-commandlinker-command
// attribute names "kernelmenu:shutdown" (presumably handled by the notebook
// front end -- confirm on the target environment).
// Best effort: any error (e.g. no matching button) is deliberately ignored.
try {
    // Block-scoped so the lookup result does not leak into the page's globals.
    const buttons = document.getElementsByClassName("sm-command-button");
    buttons[0].click();
}
catch (err) {
    // NoOp
}
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}