# Preprocessing customers.csv Part 3

In [1]:
import numpy as np
import pandas as pd
import boto3
import matplotlib.pyplot as plt
import seaborn as sns
import os
from numpy import inf

pd.options.display.float_format = '{:.4f}'.format

### Loading Data via S3

In [None]:
bucket='ads-508-group-6-raw'
data_key = 'customers_processed_part_1'

s3 = boto3.client('s3')
customers_obj = s3.get_object(Bucket = bucket, Key = data_key)

customers_df = pd.read_csv(customers_obj['Body'])

### Loading Data Locally

In [2]:
customers_df = pd.read_csv("C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data\\customers_processed_part_2.csv")

In [3]:
customers_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,churn
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,1.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,0.0
3,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,0.0,0.0,ACTIVE,NONE,20.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...,1.0
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,1.0,1.0,ACTIVE,Regularly,56.0,b31984b20a8c478de38eaf113c581ff64e63c4242e607b...,0.0


### Appending Feature Creation Data Miniing

In [4]:
historical_df = pd.read_csv("C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data\\historical_transaction_agg.csv")

In [5]:
customers_df = pd.merge(customers_df, 
                       historical_df,
                       how='left', 
                       on=['customer_id'],)

In [6]:
customers_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,churn,t_dat,article_id,price,sales_channel_id
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,1.0,450,20,0.5982,20
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,0.0,527,71,2.2378,71
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,0.0,501,7,0.2151,7
3,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,0.0,0.0,ACTIVE,NONE,20.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...,1.0,0,2,0.0534,2
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,1.0,1.0,ACTIVE,Regularly,56.0,b31984b20a8c478de38eaf113c581ff64e63c4242e607b...,0.0,526,111,3.5527,111


In [7]:
customers_df.rename(columns={"t_dat": "lifetime_days_min_max", 
                             "article_id": "lifetime_articles_purchased",
                             'price':"lifetime_amount_spent",
                             "sales_channel_id":"sales_channel_1_count"},
                   inplace = True)

In [8]:
customers_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,churn,lifetime_days_min_max,lifetime_articles_purchased,lifetime_amount_spent,sales_channel_1_count
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,1.0,450,20,0.5982,20
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,0.0,527,71,2.2378,71
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,0.0,501,7,0.2151,7
3,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,0.0,0.0,ACTIVE,NONE,20.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...,1.0,0,2,0.0534,2
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,1.0,1.0,ACTIVE,Regularly,56.0,b31984b20a8c478de38eaf113c581ff64e63c4242e607b...,0.0,526,111,3.5527,111


In [9]:
customers_df['lifetime_sales_channel_1_proportion'] = customers_df['sales_channel_1_count']/customers_df['lifetime_articles_purchased']
customers_df['lifetime_average_amount_spent_per_article'] = customers_df['lifetime_amount_spent']/customers_df['lifetime_articles_purchased']
customers_df['lifetime_average_days_between_purchase'] = customers_df['lifetime_amount_spent']/customers_df['lifetime_days_min_max']

In [10]:
customers_df['lifetime_average_days_between_purchase'].replace(np.inf, 0, inplace = True)

In [11]:
customers_df = customers_df.drop(columns=['sales_channel_1_count'])

In [12]:
population_df = pd.read_csv("C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data\\population_transaction_agg.csv")

In [13]:
customers_df = pd.merge(customers_df, 
                       population_df,
                       how='left', 
                       on=['customer_id'],)

In [14]:
customers_df.rename(columns={"t_dat": "quarter_days_min_max", 
                             "article_id": "quarter_articles_purchased",
                             'price':"quarter_amount_spent",
                             "sales_channel_id":"quarter sales_channel_1_count"},
                   inplace = True)

In [15]:
customers_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,churn,lifetime_days_min_max,lifetime_articles_purchased,lifetime_amount_spent,lifetime_sales_channel_1_proportion,lifetime_average_amount_spent_per_article,lifetime_average_days_between_purchase,quarter_days_min_max,quarter_articles_purchased,quarter_amount_spent,quarter sales_channel_1_count
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,1.0,450,20,0.5982,1.0,0.0299,0.0013,0,5,0.0936,5
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,0.0,527,71,2.2378,1.0,0.0315,0.0042,40,19,0.4726,19
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,0.0,501,7,0.2151,1.0,0.0307,0.0004,0,3,0.0694,3
3,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,0.0,0.0,ACTIVE,NONE,20.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...,1.0,0,2,0.0534,1.0,0.0267,0.0,0,2,0.0534,2
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,1.0,1.0,ACTIVE,Regularly,56.0,b31984b20a8c478de38eaf113c581ff64e63c4242e607b...,0.0,526,111,3.5527,1.0,0.032,0.0068,71,21,0.6582,21


In [17]:
customers_df['quarter_sales_channel_1_proportion'] = customers_df['quarter sales_channel_1_count']/customers_df['quarter_articles_purchased']
customers_df['quarter_average_amount_spent_per_article'] = customers_df['quarter_amount_spent']/customers_df['quarter_articles_purchased']
customers_df['quarter_average_days_between_purchase'] = customers_df['quarter_amount_spent']/customers_df['quarter_days_min_max']

In [18]:
customers_df['quarter_average_days_between_purchase'].replace(np.inf, 0, inplace = True)

In [19]:
customers_df = customers_df.drop(columns=['quarter sales_channel_1_count'])

In [20]:
customers_df.describe()

Unnamed: 0,FN,Active,age,churn,lifetime_days_min_max,lifetime_articles_purchased,lifetime_amount_spent,lifetime_sales_channel_1_proportion,lifetime_average_amount_spent_per_article,lifetime_average_days_between_purchase,quarter_days_min_max,quarter_articles_purchased,quarter_amount_spent,quarter_sales_channel_1_proportion,quarter_average_amount_spent_per_article,quarter_average_days_between_purchase
count,469376.0,469376.0,469376.0,469376.0,469376.0,469376.0,469376.0,469376.0,469376.0,469376.0,469376.0,469376.0,469376.0,469376.0,469376.0,469376.0
mean,0.3971,0.3907,35.7225,0.409,344.4508,34.7484,0.9875,1.0,0.0283,0.0034,16.9649,6.6598,0.1872,1.0,0.0286,0.0084
std,0.4893,0.4879,13.9363,0.4916,191.1189,43.8546,1.387,0.0,0.0109,0.0151,24.7167,8.009,0.2506,0.0,0.0146,0.0337
min,0.0,0.0,16.0,0.0,0.0,1.0,0.0008,1.0,0.0008,0.0,0.0,1.0,0.0001,1.0,0.0001,0.0
25%,0.0,0.0,24.0,0.0,206.0,8.0,0.2077,1.0,0.0219,0.0007,0.0,2.0,0.0508,1.0,0.0195,0.0
50%,0.0,0.0,31.0,0.0,432.0,21.0,0.543,1.0,0.0267,0.0016,0.0,4.0,0.1067,1.0,0.026,0.0
75%,1.0,1.0,48.0,1.0,497.0,45.0,1.2038,1.0,0.0327,0.0032,32.0,8.0,0.2235,1.0,0.0339,0.0061
max,1.0,1.0,99.0,1.0,558.0,1375.0,42.8785,1.0,0.422,3.6465,90.0,257.0,10.5247,1.0,0.5068,3.6465


In [1]:
customers_df.to_csv("C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data\\customers_processed_part_3.csv", 
                    index = False)

NameError: name 'customers_df' is not defined