# Preprocessing Part 4 - Splitting, Rebalancing, and Formatting customers.csv for Modeling

### Run on ml.t3.2xlarge instance

In [1]:
%%capture
!pip install numpy
!pip install pandas
!pip install boto3
!pip install matplotlib
!pip install seaborn
!pip install imbalanced-learn
!pip install awswrangler
!pip install sklearn

In [2]:
import numpy as np
import pandas as pd
import boto3
import matplotlib.pyplot as plt
import seaborn as sns
import awswrangler as wr

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

pd.options.display.float_format = '{:.4f}'.format
pd.options.mode.chained_assignment = None

### Loading Data via S3

In [3]:
bucket='ads-508-group-6-processed'
data_key = 'customers_transformed.csv'

s3 = boto3.client('s3')
customers_obj = s3.get_object(Bucket = bucket, Key = data_key)

df = pd.read_csv(customers_obj['Body'])

### Loading Data Locally

In [4]:
# customers_df = pd.read_csv("..\\data\\customers_processed_part_3.csv")

In [5]:
df.head()

Unnamed: 0,churn,FN,Active,club_member_status_0,club_member_status_ACTIVE,club_member_status_LEFT CLUB,club_member_status_PRE-CREATE,fashion_news_frequency_0,fashion_news_frequency_Monthly,fashion_news_frequency_NONE,...,age,lifetime_articles_purchased,lieftime_amount_spent,lifetime_average_amount_spent_per_article,lifetime_average_days_between_purchase,quarter_articles_purchased,quarter_amount_spent,quarter_average_amount_spent_per_article,quarter_average_days_between_purchase,days_since_last_purchase
0,1.0,0.0,0.0,0,1,0,0,0,0,1,...,0.75,-0.027,0.0553,0.2982,-0.1928,0.1667,-0.0757,-0.5051,0.0,-0.6
1,0.0,0.0,0.0,0,1,0,0,0,0,1,...,-0.25,1.3514,1.7015,0.4473,0.8366,2.5,2.1183,-0.0767,1.9492,-0.1
2,0.0,0.0,0.0,0,1,0,0,0,0,1,...,-0.2917,-0.3784,-0.3292,0.3747,-0.5402,-0.1667,-0.2158,-0.197,0.0,0.575
3,1.0,0.0,0.0,0,1,0,0,0,0,1,...,-0.4583,-0.5135,-0.4916,-0.0007,-0.6995,-0.3333,-0.3089,0.0493,0.0,1.3
4,0.0,1.0,1.0,0,1,0,0,0,0,0,...,1.0417,2.4324,3.0215,0.4924,1.7067,2.8333,3.1932,0.3746,1.5296,-0.55


### Data Partitioning and Balancing

In [6]:
y = df['churn']
y.head()

0   1.0000
1   0.0000
2   0.0000
3   1.0000
4   0.0000
Name: churn, dtype: float64

In [7]:
X = df
X.drop(columns = 'churn', inplace = True)
X.head()

Unnamed: 0,FN,Active,club_member_status_0,club_member_status_ACTIVE,club_member_status_LEFT CLUB,club_member_status_PRE-CREATE,fashion_news_frequency_0,fashion_news_frequency_Monthly,fashion_news_frequency_NONE,fashion_news_frequency_Regularly,...,age,lifetime_articles_purchased,lieftime_amount_spent,lifetime_average_amount_spent_per_article,lifetime_average_days_between_purchase,quarter_articles_purchased,quarter_amount_spent,quarter_average_amount_spent_per_article,quarter_average_days_between_purchase,days_since_last_purchase
0,0.0,0.0,0,1,0,0,0,0,1,0,...,0.75,-0.027,0.0553,0.2982,-0.1928,0.1667,-0.0757,-0.5051,0.0,-0.6
1,0.0,0.0,0,1,0,0,0,0,1,0,...,-0.25,1.3514,1.7015,0.4473,0.8366,2.5,2.1183,-0.0767,1.9492,-0.1
2,0.0,0.0,0,1,0,0,0,0,1,0,...,-0.2917,-0.3784,-0.3292,0.3747,-0.5402,-0.1667,-0.2158,-0.197,0.0,0.575
3,0.0,0.0,0,1,0,0,0,0,1,0,...,-0.4583,-0.5135,-0.4916,-0.0007,-0.6995,-0.3333,-0.3089,0.0493,0.0,1.3
4,1.0,1.0,0,1,0,0,0,0,0,1,...,1.0417,2.4324,3.0215,0.4924,1.7067,2.8333,3.1932,0.3746,1.5296,-0.55


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.8, 
                                                    random_state=0, 
                                                    shuffle = True)

In [9]:
X_test, X_val, y_test, y_val = train_test_split(X_test, 
                                                y_test, 
                                                test_size=0.5, 
                                                random_state=0, 
                                                shuffle = True)

### Resampling

In [10]:
print('Target Feature Counts')
print(y_train.value_counts())

Target Feature Counts
0.0000    55336
1.0000    38539
Name: churn, dtype: int64


In [11]:
print('Target Feature Distribution')
print(y_train.value_counts(normalize = True))

Target Feature Distribution
0.0000   0.5895
1.0000   0.4105
Name: churn, dtype: float64


In [12]:
over_sampler = SMOTE(k_neighbors=2)
X_train_reb, y_train_reb = over_sampler.fit_resample(X_train, y_train)

print('Target Feature Representation after Rebalancing')
print(y_train_reb.value_counts(normalize = True))

Target Feature Representation after Rebalancing
0.0000   0.5000
1.0000   0.5000
Name: churn, dtype: float64


### Saving to S3 in correct format

In [13]:
y_train_reb = np.reshape(y_train_reb.values, (-1, 1))
train_full = np.hstack((y_train_reb, X_train_reb))
train_df = pd.DataFrame(train_full)

In [14]:
wr.s3.to_csv(df = train_df,
             path = "s3://ads-508-group-6-final/churn_model_data/train/data.csv",
             index = False)

{'paths': ['s3://ads-508-group-6-final/churn_model_data/train/data.csv'],
 'partitions_values': {}}

In [15]:
y_test = np.reshape(y_test.values, (-1, 1))
test_full = np.hstack((y_test, X_test))
test_df = pd.DataFrame(test_full)

In [16]:
wr.s3.to_csv(df = test_df,
             path = "s3://ads-508-group-6-final/churn_model_data/test/data.csv",
             index = False)

{'paths': ['s3://ads-508-group-6-final/churn_model_data/test/data.csv'],
 'partitions_values': {}}

In [17]:
y_val = np.reshape(y_val.values, (-1, 1))
val_full = np.hstack((y_val, X_val))
validation_df = pd.DataFrame(val_full)

In [18]:
wr.s3.to_csv(df = validation_df,
             path = "s3://ads-508-group-6-final/churn_model_data/validation/data.csv",
             index = False)

{'paths': ['s3://ads-508-group-6-final/churn_model_data/validation/data.csv'],
 'partitions_values': {}}

### Shutting Down Kernel To Release Resources

In [19]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [20]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>