# Preprocessing customers.csv Part 2

In [24]:
import numpy as np
import pandas as pd
import boto3
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Show floats with four decimal places in every DataFrame/Series rendered below.
pd.set_option('display.float_format', '{:.4f}'.format)

### Loading Customers Data via S3

In [None]:
# Fetch the part-1 customers extract from the raw S3 bucket and parse it as CSV.
s3 = boto3.client('s3')
bucket, data_key = 'ads-508-group-6-raw', 'customers_processed_part_1'

customers_obj = s3.get_object(Bucket=bucket, Key=data_key)
# read_csv parses the 'Body' entry of the get_object response directly.
customers_df = pd.read_csv(customers_obj['Body'])

### Loading Customers Data Locally

In [25]:
# Local alternative to the S3 cell: read the part-1 customers extract from an
# absolute path on the author's machine (adjust the path when running elsewhere).
customers_csv_path = "C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data\\customers_processed_part_1.csv"
customers_df = pd.read_csv(customers_csv_path)

In [26]:
# Preview the first five customer rows.
customers_df.head(n=5)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric customer columns.
customers_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,FN,Active,age
count,1371980.0,1371980.0,1371980.0
mean,0.3476,0.3385,36.3362
std,0.4762,0.4732,14.2384
min,0.0,0.0,16.0
25%,0.0,0.0,24.0
50%,0.0,0.0,32.0
75%,1.0,1.0,49.0
max,1.0,1.0,99.0


### Loading Transaction Population Data via S3

In [None]:
# Fetch the transaction-population extract from the raw S3 bucket and parse it as CSV.
#
# The frame is bound to `transaction_population_df`, the name that the local-load
# cell and the population filter further down use, so the notebook behaves the
# same whichever loading path is executed.
#
# NOTE(review): the S3 key is 'transaction_population' while the local file is
# named 'transactions_population.csv' -- confirm the key matches the object that
# actually exists in the bucket.
bucket = 'ads-508-group-6-raw'
data_key = 'transaction_population'

s3 = boto3.client('s3')
transaction_population_obj = s3.get_object(Bucket=bucket, Key=data_key)

# read_csv parses the 'Body' entry of the get_object response directly.
transaction_population_df = pd.read_csv(transaction_population_obj['Body'])

### Loading Transaction Population Data Locally

In [28]:
# Local alternative to the S3 cell: read the transaction-population extract from an
# absolute path on the author's machine (adjust the path when running elsewhere).
transactions_csv_path = "C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data\\transactions_population.csv"
transaction_population_df = pd.read_csv(transactions_csv_path)

In [29]:
# Preview the first five transaction rows.
transaction_population_df.head(n=5)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-01-01,0034b3dced3e565a43438bdfb5447e7321fea65388b398...,835247001,0.0339,2
1,2020-01-01,00410b91d62eefa76958fa5cac12f5daa7cfc0556e417d...,802930002,0.0678,2
2,2020-01-01,00410b91d62eefa76958fa5cac12f5daa7cfc0556e417d...,760084008,0.0254,2
3,2020-01-01,004b0fb384bcab2f8e1059dd5ca68c17580365ab95c05a...,804662002,0.0339,2
4,2020-01-01,004b0fb384bcab2f8e1059dd5ca68c17580365ab95c05a...,801554002,0.0169,2


In [30]:
# Number of distinct values in each column of the transaction population.
transaction_population_df.nunique(axis=0)

t_dat                   91
customer_id         469376
article_id           41649
price                 6829
sales_channel_id         2
dtype: int64

### Loading Target Labels Data via S3

In [None]:
# Fetch the target-label transactions from the raw S3 bucket and parse them as CSV.
s3 = boto3.client('s3')
bucket, data_key = 'ads-508-group-6-raw', 'target_labels'

labels_obj = s3.get_object(Bucket=bucket, Key=data_key)
# read_csv parses the 'Body' entry of the get_object response directly.
labels_df = pd.read_csv(labels_obj['Body'])

### Loading Target Labels Data Locally

In [31]:
# Local alternative to the S3 cell: read the target-label transactions from an
# absolute path on the author's machine (adjust the path when running elsewhere).
labels_csv_path = "C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data\\target_labels.csv"
labels_df = pd.read_csv(labels_csv_path)

In [32]:
# Preview the first five label rows.
labels_df.head(n=5)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-04-01,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,727808001,0.0678,2
1,2020-04-01,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,727808007,0.0678,2
2,2020-04-01,000563485cbb7850b0a93c6606f89c5b961c6647d1bd48...,567532015,0.0424,2
3,2020-04-01,000563485cbb7850b0a93c6606f89c5b961c6647d1bd48...,706104009,0.0424,2
4,2020-04-01,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,783504004,0.0191,2


In [33]:
# Summary statistics for the numeric label columns before de-duplication.
labels_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,article_id,price,sales_channel_id
count,4467204.0,4467204.0,4467204.0
mean,771633152.7204,0.0264,1.7802
std,119970811.4629,0.0156,0.4141
min,108775015.0,0.0,1.0
25%,729603001.0,0.0169,2.0
50%,816083002.0,0.0248,2.0
75%,852775002.0,0.0339,2.0
max,935356001.0,0.5013,2.0


In [34]:
# Number of distinct values in each column of the label transactions.
labels_df.nunique(axis=0)

t_dat                   91
customer_id         538352
article_id           39173
price                 5806
sales_channel_id         2
dtype: int64

### Reducing Full Customers Table to Experiment Population

In [35]:
# Keep only the customers whose id appears in the transaction population.
in_population = customers_df['customer_id'].isin(transaction_population_df['customer_id'])
customers_df = customers_df.loc[in_population]

In [36]:
# Summary statistics for the customers remaining after the population filter.
customers_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,FN,Active,age
count,469376.0,469376.0,469376.0
mean,0.3971,0.3907,35.7225
std,0.4893,0.4879,13.9363
min,0.0,0.0,16.0
25%,0.0,0.0,24.0
50%,0.0,0.0,31.0
75%,1.0,1.0,48.0
max,1.0,1.0,99.0


### Reducing Full Labels Table to Unique Users

In [37]:
# Collapse the label transactions to one row per customer, keeping each customer's first row.
labels_df = labels_df.drop_duplicates(subset='customer_id', keep='first')

In [38]:
# Summary statistics for the numeric label columns after de-duplication.
labels_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,article_id,price,sales_channel_id
count,538352.0,538352.0,538352.0
mean,764796290.9093,0.0277,1.786
std,121927004.6399,0.0166,0.4102
min,108775015.0,0.0002,1.0
25%,718634001.0,0.0169,2.0
50%,808698004.0,0.0254,2.0
75%,849361005.0,0.0339,2.0
max,935356001.0,0.3373,2.0


### Generating Target Labels

In [39]:
# Left-join the label rows onto the population customers: every customer is kept,
# and customers without a label row get NaN in the label columns.
cutomers_merged = customers_df.merge(labels_df, on='customer_id', how='left')

In [40]:
# Preview the first five rows of the merged customers/labels table.
cutomers_merged.head(n=5)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,t_dat,article_id,price,sales_channel_id
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,,,,
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,2020-04-22,599580055.0,0.0169,2.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,2020-04-01,727808001.0,0.0678,2.0
3,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,0.0,0.0,ACTIVE,NONE,20.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...,,,,
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,1.0,1.0,ACTIVE,Regularly,56.0,b31984b20a8c478de38eaf113c581ff64e63c4242e607b...,2020-04-24,562245099.0,0.0325,2.0


In [41]:
# Build the binary churn target.
#
# After the left merge, a customer with no row in labels_df has NaN in every label
# column, including sales_channel_id. Testing for that missing value directly gives
# a strictly binary label (1.0 = no label row / churned, 0.0 = has a label row),
# independent of which channel ids occur in the data. The previous value mapping
# of [1, 2] -> 0 would have let any other channel id leak through as a churn value.
# The column stays float, as before.
cutomers_merged['churn'] = cutomers_merged['sales_channel_id'].isna().astype(float)

In [21]:
# Preview the first five rows, now including the churn column.
cutomers_merged.head(n=5)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,t_dat,article_id,price,sales_channel_id,churn
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,,,,,1.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,2020-04-22,599580055.0,0.0169,2.0,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,2020-04-01,727808001.0,0.0678,2.0,0.0
3,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,0.0,0.0,ACTIVE,NONE,20.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...,,,,,1.0
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,1.0,1.0,ACTIVE,Regularly,56.0,b31984b20a8c478de38eaf113c581ff64e63c4242e607b...,2020-04-24,562245099.0,0.0325,2.0,0.0


In [42]:
# Remove the columns that came from labels_df; only the derived churn flag is kept.
label_columns = ['t_dat', 'article_id', 'price', 'sales_channel_id']
cutomers_merged = cutomers_merged.drop(columns=label_columns)

In [43]:
# Write the labelled customers table to the part-2 CSV, omitting the DataFrame index.
# The destination is an absolute path on the author's machine (adjust when running elsewhere).
output_csv_path = "C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data\\customers_processed_part_2.csv"
cutomers_merged.to_csv(output_csv_path, index=False)