# Preprocessing Part 3 - Performing Feature Transformation on Processed customers.csv

### Run on ml.t3.medium instance

In [4]:
%%capture
!pip install numpy
!pip install pandas
!pip install boto3
!pip install matplotlib
!pip install seaborn
!pip install datetime
!pip install awswrangler
!pip install sklearn

In [5]:
import numpy as np
import pandas as pd
import boto3
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import awswrangler as wr

from sklearn.preprocessing import RobustScaler

# Render floats with four decimal places in displayed frames.
pd.set_option('display.float_format', '{:.4f}'.format)
# Disable the chained-assignment warning for this session.
pd.set_option('mode.chained_assignment', None)

### Loading Data via S3

In [6]:
# S3 location of the processed customers table.
bucket = 'ads-508-group-6-processed'
data_key = 'customers.csv'

# Fetch the object with boto3 and parse its streaming body as CSV.
s3 = boto3.client('s3')
s3_location = {'Bucket': bucket, 'Key': data_key}
customers_obj = s3.get_object(**s3_location)

customers_df = pd.read_csv(customers_obj['Body'])

### Loading Data Locally

In [7]:
# customers_df = pd.read_csv("..\\data\\customers.csv")

In [8]:
# Preview the first five rows of the loaded table.
customers_df.head(n=5)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,quarter_articles_purchased,quarter_amount_spent,quarter_average_amount_spent_per_article,quarter_average_days_between_purchase,lifetime_articles_purchased,lieftime_amount_spent,days_since_last_purchase,lifetime_average_amount_spent_per_article,lifetime_average_days_between_purchase,churn
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,5,0.0936,0.0187,0.0,20,0.5982,10,0.0299,0.0444,1.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,19,0.4726,0.0249,0.0118,71,2.2378,30,0.0315,0.1347,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,3,0.0694,0.0231,0.0,7,0.2151,57,0.0307,0.014,0.0
3,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,0.0,0.0,ACTIVE,NONE,20.0,2,0.0534,0.0267,0.0,2,0.0534,86,0.0267,0.0,1.0
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,1.0,1.0,ACTIVE,Regularly,56.0,21,0.6582,0.0313,0.0093,111,3.5527,12,0.032,0.211,0.0


### Data Transformation

In [9]:
# Categorical columns that get one-hot encoded.
cat_variables = ['club_member_status', 'fashion_news_frequency']

# Numeric columns that get scaled, grouped by what they describe.
demographic_features = ['age']
lifetime_features = [
    'lifetime_articles_purchased',
    'lieftime_amount_spent',  # spelling matches the column name in customers.csv
    'lifetime_average_amount_spent_per_article',
    'lifetime_average_days_between_purchase',
]
quarter_features = [
    'quarter_articles_purchased',
    'quarter_amount_spent',
    'quarter_average_amount_spent_per_article',
    'quarter_average_days_between_purchase',
]
recency_features = ['days_since_last_purchase']

num_variables = (demographic_features
                 + lifetime_features
                 + quarter_features
                 + recency_features)

In [10]:
# One-hot encode the categorical columns (one indicator column per level).
# The dtype is pinned because the default differs between pandas versions
# (uint8 before 2.0, bool from 2.0 on); pinning keeps the indicators as 0/1
# integers in the saved CSV regardless of the installed version.
cat_dummies = pd.get_dummies(customers_df[cat_variables], dtype = np.uint8)

In [11]:
# Scale every numeric feature with a default-configured RobustScaler.
# NOTE(review): the scaler is fit on the full table; if a train/test split
# happens downstream, fit on the training portion only — confirm.
transformer = RobustScaler()
num_normed = transformer.fit_transform(customers_df[num_variables])

# fit_transform returns a bare array, so re-attach the column names and the
# source frame's index. Carrying the index over keeps the later column-wise
# concat row-aligned even if customers_df does not have a default RangeIndex.
scaled_features = pd.DataFrame(num_normed,
                               columns = num_variables,
                               index = customers_df.index)

In [12]:
# Build the final table: target and binary flags first, then the one-hot
# categorical columns, then the scaled numeric columns.
passthrough_columns = customers_df[['churn', 'FN', 'Active']]
processed_df = pd.concat(
    [passthrough_columns, cat_dummies, scaled_features],
    axis=1,
)

In [13]:
# Preview the first five rows of the transformed table.
processed_df.head(n=5)

Unnamed: 0,churn,FN,Active,club_member_status_0,club_member_status_ACTIVE,club_member_status_LEFT CLUB,club_member_status_PRE-CREATE,fashion_news_frequency_0,fashion_news_frequency_Monthly,fashion_news_frequency_NONE,...,age,lifetime_articles_purchased,lieftime_amount_spent,lifetime_average_amount_spent_per_article,lifetime_average_days_between_purchase,quarter_articles_purchased,quarter_amount_spent,quarter_average_amount_spent_per_article,quarter_average_days_between_purchase,days_since_last_purchase
0,1.0,0.0,0.0,0,1,0,0,0,0,1,...,0.75,-0.027,0.0553,0.2982,-0.1928,0.1667,-0.0757,-0.5051,0.0,-0.6
1,0.0,0.0,0.0,0,1,0,0,0,0,1,...,-0.25,1.3514,1.7015,0.4473,0.8366,2.5,2.1183,-0.0767,1.9492,-0.1
2,0.0,0.0,0.0,0,1,0,0,0,0,1,...,-0.2917,-0.3784,-0.3292,0.3747,-0.5402,-0.1667,-0.2158,-0.197,0.0,0.575
3,1.0,0.0,0.0,0,1,0,0,0,0,1,...,-0.4583,-0.5135,-0.4916,-0.0007,-0.6995,-0.3333,-0.3089,0.0493,0.0,1.3
4,0.0,1.0,1.0,0,1,0,0,0,0,0,...,1.0417,2.4324,3.0215,0.4924,1.7067,2.8333,3.1932,0.3746,1.5296,-0.55


### Saving to S3

In [14]:
# Write the transformed table to the processed bucket without the index column.
wr.s3.to_csv(
    df=processed_df,
    path="s3://ads-508-group-6-processed/customers_transformed.csv",
    index=False,
)

{'paths': ['s3://ads-508-group-6-processed/customers_transformed.csv'],
 'partitions_values': {}}

### Saving Locally

In [15]:
# processed_df.to_csv("..\\data\\customers_transformed.csv", index = False)

### Shutting Down Kernel To Release Resources

In [16]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Programmatically click the hidden "Shutdown Kernel" button rendered above.
// NOTE(review): the click is presumably picked up by the host front end via the
// button's data-commandlinker-command attribute — confirm in the target environment.
try {
    // Collect every element with the sm-command-button class and click the first.
    // NOTE(review): `els` is assigned without var/let/const, so it becomes an
    // implicit global in non-strict mode.
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp: any error (e.g. no matching element) is silently ignored.
}    
</script>

In [17]:
%%javascript

// Save a notebook checkpoint, then delete the notebook's session.
// NOTE(review): relies on a global `Jupyter` object — presumably the classic
// Notebook front end; confirm it exists where this notebook is run.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp: any error (including `Jupyter` being undefined) is silently ignored.
}

<IPython.core.display.Javascript object>