# Preprocessing Part 3 - Performing Feature Transformation on Processed customers.csv

### Run on ml.t3.medium instance

In [1]:
%%capture
!pip install numpy
!pip install pandas
!pip install boto3
!pip install matplotlib
!pip install seaborn
!pip install datetime
!pip install awswrangler
!pip install sklearn

In [2]:
import numpy as np
import pandas as pd
import boto3
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import awswrangler as wr

from sklearn.preprocessing import RobustScaler

# Notebook-wide pandas settings:
# - render floats with four decimal places in displayed frames;
# - turn off the chained-assignment (SettingWithCopy) warning.
pd.set_option('display.float_format', '{:.4f}'.format)
pd.set_option('mode.chained_assignment', None)

### Loading Data via S3

In [3]:
# S3 location of the processed customers table.
bucket = 'ads-508-group-6-processed'
data_key = 'customers.csv'

# Fetch the object, then parse its streaming body directly as CSV.
s3 = boto3.client('s3')
customers_obj = s3.get_object(Bucket=bucket, Key=data_key)
customers_df = pd.read_csv(customers_obj['Body'])

### Loading Data Locally

In [4]:
# customers_df = pd.read_csv("..\\data\\customers.csv")

In [5]:
# Preview the first rows of the loaded customers table.
customers_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,quarter_articles_purchased,quarter_amount_spent,qmc_sales_channel,qmc_product_type,...,lfmc_perceived_colour_master,lfmc_department,lfmc_index,lfmc_index_group,lfmc_section,lfmc_garment_group,days_since_last_purchase,lifetime_average_amount_spent_per_article,lifetime_average_days_between_purchase,churn
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,5,0.0936,2,252,...,5,1212,A,1,11,1005,10,0.0299,0.0444,1.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,19,0.4726,2,59,...,2,4242,B,1,60,1018,30,0.0315,0.1347,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,3,0.0694,2,59,...,5,4242,B,1,60,1017,57,0.0307,0.014,0.0
3,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,0.0,0.0,ACTIVE,NONE,20.0,2,0.0534,1,254,...,5,1447,D,2,53,1003,86,0.0267,0.0,1.0
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,1.0,1.0,ACTIVE,Regularly,56.0,21,0.6582,2,272,...,5,1722,A,1,15,1009,12,0.032,0.211,0.0


In [6]:
# Summarize column dtypes and non-null counts before recasting.
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 469376 entries, 0 to 469375
Data columns (total 36 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   customer_id                                469376 non-null  object 
 1   FN                                         469376 non-null  float64
 2   Active                                     469376 non-null  float64
 3   club_member_status                         469376 non-null  object 
 4   fashion_news_frequency                     469376 non-null  object 
 5   age                                        469376 non-null  float64
 6   quarter_articles_purchased                 469376 non-null  int64  
 7   quarter_amount_spent                       469376 non-null  float64
 8   qmc_sales_channel                          469376 non-null  int64  
 9   qmc_product_type                           469376 non-null  int64  
 10  qmc_grap

### Recasting Variables

In [7]:
# Article-attribute suffixes that exist under both the "qmc_" and "lfmc_" prefixes.
_recast_suffixes = ('sales_channel',
                    'product_type',
                    'graphical_appearance',
                    'perceived_colour_value',
                    'perceived_colour_master',
                    'department',
                    'index',
                    'index_group',
                    'section',
                    'garment_group')

# Cast every prefixed attribute column to str: several of them are read as
# integer codes, and they need to be string labels so they are one-hot
# encoded later instead of being treated as numeric values.
_str_casts = {f'{prefix}_{suffix}': str
              for prefix in ('qmc', 'lfmc')
              for suffix in _recast_suffixes}

customers_df = customers_df.astype(_str_casts)

### Data Transformation

In [8]:
# Attribute suffixes shared by the "qmc_" and "lfmc_" column families.
_attribute_suffixes = ('sales_channel',
                       'product_type',
                       'graphical_appearance',
                       'perceived_colour_value',
                       'perceived_colour_master',
                       'department',
                       'index',
                       'index_group',
                       'section',
                       'garment_group')

# Categorical features: the two membership columns followed by every
# "qmc_*" attribute and then every "lfmc_*" attribute.
cat_variables = ['club_member_status', 'fashion_news_frequency']
cat_variables += [f'{prefix}_{suffix}'
                  for prefix in ('qmc', 'lfmc')
                  for suffix in _attribute_suffixes]

# Numeric features, grouped as: age, lifetime metrics, quarter metrics, recency.
# NOTE: 'lieftime_amount_spent' is spelled this way on purpose; it has to match
# the column name as it appears in the data.
num_variables = (
    ['age']
    + ['lifetime_articles_purchased',
       'lieftime_amount_spent',
       'lifetime_average_amount_spent_per_article',
       'lifetime_average_days_between_purchase']
    + ['quarter_articles_purchased',
       'quarter_amount_spent',
       'quarter_average_amount_spent_per_article',
       'quarter_average_days_between_purchase']
    + ['days_since_last_purchase']
)

In [9]:
# One-hot encode the categorical features.
# The indicator dtype is pinned to uint8 so the columns are 0/1 integers on
# every pandas version: pandas >= 2.0 defaults to bool, which would be written
# to the output CSV as True/False instead of 0/1.
cat_dummies = pd.get_dummies(customers_df[cat_variables], dtype = np.uint8)

In [10]:
# Scale the numeric features with RobustScaler (median / IQR based scaling).
# NOTE(review): the scaler is fitted on the full table; if a train/test split
# happens downstream, confirm that fitting before the split is intended.
transformer = RobustScaler()
num_normed = transformer.fit_transform(customers_df[num_variables])

# fit_transform returns a bare array, so rebuild a frame with the original
# column names AND the original index. Carrying the index over keeps the later
# column-wise pd.concat aligned row-for-row even if customers_df does not have
# a default RangeIndex (e.g. after filtering rows).
scaled_features = pd.DataFrame(num_normed,
                               columns = num_variables,
                               index = customers_df.index)

In [11]:
# Assemble the output table column-wise: the churn / FN / Active columns
# unchanged, then the one-hot indicators, then the scaled numeric features.
processed_df = pd.concat(
    [customers_df[['churn', 'FN', 'Active']], cat_dummies, scaled_features],
    axis = 1,
)

In [12]:
# Preview the assembled feature table.
processed_df.head()

Unnamed: 0,churn,FN,Active,club_member_status_0,club_member_status_ACTIVE,club_member_status_LEFT CLUB,club_member_status_PRE-CREATE,fashion_news_frequency_0,fashion_news_frequency_Monthly,fashion_news_frequency_NONE,...,age,lifetime_articles_purchased,lieftime_amount_spent,lifetime_average_amount_spent_per_article,lifetime_average_days_between_purchase,quarter_articles_purchased,quarter_amount_spent,quarter_average_amount_spent_per_article,quarter_average_days_between_purchase,days_since_last_purchase
0,1.0,0.0,0.0,0,1,0,0,0,0,1,...,0.75,-0.027,0.0553,0.2982,-0.1928,0.1667,-0.0757,-0.5051,0.0,-0.6
1,0.0,0.0,0.0,0,1,0,0,0,0,1,...,-0.25,1.3514,1.7015,0.4473,0.8366,2.5,2.1183,-0.0767,1.9492,-0.1
2,0.0,0.0,0.0,0,1,0,0,0,0,1,...,-0.2917,-0.3784,-0.3292,0.3747,-0.5402,-0.1667,-0.2158,-0.197,0.0,0.575
3,1.0,0.0,0.0,0,1,0,0,0,0,1,...,-0.4583,-0.5135,-0.4916,-0.0007,-0.6995,-0.3333,-0.3089,0.0493,0.0,1.3
4,0.0,1.0,1.0,0,1,0,0,0,0,0,...,1.0417,2.4324,3.0215,0.4924,1.7067,2.8333,3.1932,0.3746,1.5296,-0.55


### Saving to S3

In [13]:
# Write the transformed table to S3 as CSV, leaving out the DataFrame index.
# The call stays the last expression so the written paths are displayed.
_transformed_path = "s3://ads-508-group-6-processed/customers_transformed.csv"
wr.s3.to_csv(df = processed_df, path = _transformed_path, index = False)

{'paths': ['s3://ads-508-group-6-processed/customers_transformed.csv'],
 'partitions_values': {}}

### Saving Locally

In [14]:
# processed_df.to_csv("..\\data\\customers_transformed.csv", index = False)

### Shutting Down Kernel To Release Resources

In [15]:
%%html

<!-- Best-effort kernel shutdown: renders a hidden button bound to the
     "kernelmenu:shutdown" command and clicks it from the script below. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
try {
    // Declared with const so the lookup result stays local to this block
    // instead of leaking into the page as an implicit global.
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp: shutdown is best-effort, so a missing button is ignored on purpose.
}
</script>

In [16]:
%%javascript

// Best-effort shutdown through the global `Jupyter` object: save a checkpoint
// first, then delete the session. The order matters, since the session is
// gone after delete().
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
    // Any error (for example `Jupyter` not being defined in this front end)
    // is deliberately ignored.
}

<IPython.core.display.Javascript object>