# Preprocessing customers.csv Part 1

In [1]:
import numpy as np
import pandas as pd
import boto3
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import OrdinalEncoder

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# Render floats in DataFrame/Series output with exactly four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)

### Loading Data via S3

In [None]:
# Where the raw customers extract lives in S3.
bucket = 'ads-508-group-6-raw'
data_key = 'customers.csv'

# Fetch the object with a boto3 S3 client.
s3 = boto3.client('s3')
s3_location = {'Bucket': bucket, 'Key': data_key}
customers_obj = s3.get_object(**s3_location)

# Parse the response body directly as CSV.
customers_df = pd.read_csv(customers_obj['Body'])

### Loading Data Locally (alternative to the S3 load above)

In [2]:
# Load the raw customers table from a local data directory.
# The directory can be overridden with the CUSTOMERS_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original hard-coded location is used.
local_data_dir = os.environ.get(
    "CUSTOMERS_DATA_DIR",
    "C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data",
)
customers_df = pd.read_csv(os.path.join(local_data_dir, "customers.csv"))

In [3]:
# Preview the first five rows of the loaded table.
customers_df.head(n=5)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


### Fill Null Values

In [4]:
# Count missing values per column before any imputation.
customers_df.isna().sum()

customer_id                    0
FN                        895050
Active                    907576
club_member_status          6062
fashion_news_frequency     16009
age                        15861
postal_code                    0
dtype: int64

In [5]:
# Constant-fill imputer: missing entries are replaced with the value 0.
simpleimputer_cat = SimpleImputer(
    strategy='constant',
    fill_value=0,
)

In [6]:
# Fill missing values with the constant imputer, one column at a time.
# The imputer is re-fitted per column, in the same order as listed here.
# NOTE(review): club_member_status and fashion_news_frequency show text labels
# in the head() preview, so filling them with 0 mixes a number into label
# columns — confirm this is intended.
constant_fill_columns = [
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
]
for fill_column in constant_fill_columns:
    customers_df[fill_column] = simpleimputer_cat.fit_transform(customers_df[[fill_column]])

In [7]:
# Re-check missing values per column after the constant fill.
customers_df.isna().sum()

customer_id                   0
FN                            0
Active                        0
club_member_status            0
fashion_news_frequency        0
age                       15861
postal_code                   0
dtype: int64

In [8]:
# Median imputer for numeric columns.
simpleimputer_num = SimpleImputer(
    strategy='median',
)

In [9]:
# Replace missing ages with the median computed from the age column itself.
age_imputed = simpleimputer_num.fit_transform(customers_df[['age']])
customers_df['age'] = age_imputed

In [10]:
# Final count of missing values per column after all imputation steps.
customers_df.isna().sum()

customer_id               0
FN                        0
Active                    0
club_member_status        0
fashion_news_frequency    0
age                       0
postal_code               0
dtype: int64

### Save to CSV

In [11]:
# Write the imputed table to the data directory, without the DataFrame index.
# The directory can be overridden with the CUSTOMERS_DATA_DIR environment
# variable so the export is not tied to a single machine; when the variable
# is unset, the original hard-coded location is used.
output_data_dir = os.environ.get(
    "CUSTOMERS_DATA_DIR",
    "C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data",
)
customers_df.to_csv(
    os.path.join(output_data_dir, "customers_processed_part_1.csv"),
    index=False,
)