# Import Libraries

In [1]:
import pandas as pd
from pandas_profiling import ProfileReport
import requests
from math import floor

# Read Data

In [2]:
# Location of the source workbook.
# NOTE(review): this is an absolute, machine-specific path; anyone else running
# the notebook has to point it at their own copy of the file.
file='C:/Users/Amit/Desktop/Internships/KPMG/dataset.xlsx'

# Parse the workbook once and pull all three sheets from that single read:
# passing a list of sheet names makes read_excel return a dict of DataFrames
# keyed by sheet name. header=1 takes the column names from the second row
# of each sheet.
sheets = pd.read_excel(
    file,
    sheet_name=["CustomerDemographic", "CustomerAddress", "Transactions"],
    header=1,
)
df1 = sheets["CustomerDemographic"]
df2 = sheets["CustomerAddress"]
df3 = sheets["Transactions"]

# Combine demographics with addresses. pd.merge defaults to an inner join, so
# only customer_ids present in both sheets are kept.
df4 = pd.merge(df1, df2, on='customer_id')

# Pre-processing

In [3]:
# Keep only approved transactions. The explicit .copy() makes df3 an
# independent frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
df3 = df3.loc[df3['order_status'] == 'Approved'].copy()

# Keep only customers whose deceased_indicator is 'N'.
df4 = df4.loc[df4['deceased_indicator'] == 'N'].copy()

# Normalise spelling variants: gender codes/typos to 'Female'/'Male', and the
# long-form state names to their abbreviations. Values not listed are left
# untouched.
df4 = df4.replace({
    'gender': {'F': 'Female', 'M': 'Male', 'Femal': 'Female'},
    'state': {'New South Wales': 'NSW', 'Victoria': 'VIC'},
})

# Fill missing tenure with the column mean rounded down to a whole number.
df4['tenure'] = df4['tenure'].fillna(floor(df4['tenure'].mean()))

# Fill missing values in every datetime column (this is how the missing DOB
# values get filled) with 2017-01-01. An age computed as 2017 minus the birth
# year is therefore 0 for these rows, which marks them for later handling.
u = df4.select_dtypes(include=['datetime'])
df4[u.columns] = u.fillna(pd.to_datetime('2017'))

# Feature Engineering

In [4]:
# Age of each customer relative to 2017. The birth year is taken from the
# first four characters of the DOB value's string form.
df4['age'] = df4['DOB'].apply(lambda x: 2017 - int(str(x)[:4]))

# An age of 0 means the DOB year is 2017; overwrite those ages with 39.
# NOTE(review): 39 is presumably a typical customer age chosen as the
# imputation value - confirm where it comes from.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df4['age']: the in-place form acts on a
# temporary Series, which pandas warns about and which does not update df4
# when Copy-on-Write is enabled.
df4['age'] = df4['age'].replace(0, 39)

# Target variable: per-transaction profit = list price - standard cost.
# assign() returns a new frame, so this is safe even though df3 may be a
# filtered slice of the original transactions table.
df3 = df3.assign(profit=df3["list_price"] - df3["standard_cost"])

# Feature Selection

In [5]:
# Customer attributes carried forward into the modelling table
# (customer_id is kept for now because it is the join key).
customer_columns = [
    'customer_id',
    'gender',
    'age',
    'job_industry_category',
    'wealth_segment',
    'owns_car',
    'tenure',
    'state',
    'property_valuation',
    'past_3_years_bike_related_purchases',
]
df4 = df4[customer_columns]

# Reduce the transactions to one row per customer holding the mean profit.
profit_rows = df3[['customer_id', 'profit']]
profit_rows = profit_rows.sort_values(by=['customer_id']).reset_index(drop=True)
df3 = profit_rows.groupby('customer_id', as_index=False).mean()

# Joining tables

In [6]:
# Attach the mean profit to each customer's attributes. The inner join keeps
# only customers that appear in both frames. The join key is then dropped by
# name rather than by column position, so the right column is removed even if
# the column order of df4 changes.
df5 = (
    pd.merge(df4, df3, on='customer_id', how='inner')
    .drop(columns='customer_id')
)
df5.head()

Unnamed: 0,gender,age,job_industry_category,wealth_segment,owns_car,tenure,state,property_valuation,past_3_years_bike_related_purchases,profit
0,Female,64,Health,Mass Customer,Yes,11.0,NSW,10,93,274.371818
1,Male,37,Financial Services,Mass Customer,Yes,16.0,NSW,10,81,742.086667
2,Male,56,IT,Mass Customer,No,7.0,QLD,9,33,110.285
3,Female,40,,Affluent Customer,Yes,8.0,NSW,4,56,399.156667
4,Male,51,Retail,High Net Worth,Yes,13.0,VIC,9,35,789.31


In [7]:
# Discard rows that exactly repeat an earlier row; the first occurrence stays.
df5 = df5.drop_duplicates(keep='first')

# Encoding Variables

In [8]:
# Ordinal encoding for wealth_segment: labels are numbered 1, 2, 3 in the
# order listed here.
scale_mapper = {
    segment: rank
    for rank, segment in enumerate(
        ('Mass Customer', 'Affluent Customer', 'High Net Worth'), start=1
    )
}

# Swap each label for its number; values not in the mapping are left as-is.
df5 = df5.assign(wealth_segment=df5['wealth_segment'].replace(scale_mapper))

In [9]:
# One-hot encode the nominal features, one indicator column per category,
# named '<prefix>_<category>'.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version: from pandas 2.0 the default became bool, which would put True/False
# into the exported table instead.
# NOTE(review): with the default dummy_na=False, a row whose value is missing
# gets 0 in every indicator of that group, so it is indistinguishable from the
# category that is dropped afterwards as the reference level - confirm this is
# intended.
x1 = pd.get_dummies(df5['gender'], prefix='gender', dtype='uint8')
x2 = pd.get_dummies(df5['job_industry_category'], prefix='job', dtype='uint8')
x3 = pd.get_dummies(df5['owns_car'], prefix='car', dtype='uint8')
x4 = pd.get_dummies(df5['state'], prefix='state', dtype='uint8')

In [10]:
# Feature matrix: the four indicator groups side by side, followed by the
# columns that are used as they are.
passthrough_columns = df5[['age', 'wealth_segment', 'tenure', 'property_valuation']]
X = pd.concat([x1, x2, x3, x4, passthrough_columns], axis=1)

# Remove one indicator from each one-hot group.
X = X.drop(columns=['gender_U', 'job_Telecommunications', 'car_Yes', 'state_VIC'])

# Final table: features plus the profit target as the last column.
data = pd.concat([X, df5['profit']], axis=1)

data.head()

Unnamed: 0,gender_Female,gender_Male,job_Argiculture,job_Entertainment,job_Financial Services,job_Health,job_IT,job_Manufacturing,job_Property,job_Retail,car_No,state_NSW,state_QLD,age,wealth_segment,tenure,property_valuation,profit
0,1,0,0,0,0,1,0,0,0,0,0,1,0,64,1,11.0,10,274.371818
1,0,1,0,0,1,0,0,0,0,0,0,1,0,37,1,16.0,10,742.086667
2,0,1,0,0,0,0,1,0,0,0,1,0,1,56,1,7.0,9,110.285
3,1,0,0,0,0,0,0,0,0,0,0,1,0,40,2,8.0,4,399.156667
4,0,1,0,0,0,0,0,0,0,1,0,0,0,51,3,13.0,9,789.31


In [11]:
# Show the (rows, columns) dimensions of the assembled table.
data.shape

(3487, 18)

In [12]:
# Write the assembled table to train.csv in the working directory, leaving
# the row index out of the file.
data.to_csv(path_or_buf='train.csv', index=False)