# Import Libraries

In [1]:
import pandas as pd
from pandas_profiling import ProfileReport
import requests

# Read New Customer List Dataset

In [2]:
# Location of the KPMG workbook on disk.
file = 'C:/Users/Amit/Desktop/Internships/KPMG/dataset.xlsx'

# Load the "NewCustomerList" sheet, taking column names from the second row
# (header=1), then discard the five placeholder columns 'Unnamed: 16' ..
# 'Unnamed: 20', which are not used anywhere in this notebook.
df = (
    pd.read_excel(file, sheet_name="NewCustomerList", header=1)
    .drop(columns=[f'Unnamed: {position}' for position in range(16, 21)])
)
df.head()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11,1,1.71875
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,Australia,5,1,1.71875
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,Australia,1,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,Australia,9,4,1.703125


# Dataset Report

In [3]:
# Generate a profiling report for the customer frame (explorative mode on)
# and embed it in the notebook as an iframe.
profile = ProfileReport(
    df,
    title="Pandas Profiling Report",
    explorative=True,
)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/32 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

# Observations

In [4]:
# Inspect the dtype pandas inferred for every column; they look appropriate
# for the data each column holds.
# NOTE(review): postcode is stored as an integer, which would drop any leading
# zeros -- confirm none are expected in this dataset.
df.dtypes

first_name                                     object
last_name                                      object
gender                                         object
past_3_years_bike_related_purchases             int64
DOB                                    datetime64[ns]
job_title                                      object
job_industry_category                          object
wealth_segment                                 object
deceased_indicator                             object
owns_car                                       object
tenure                                          int64
address                                        object
postcode                                        int64
state                                          object
country                                        object
property_valuation                              int64
Rank                                            int64
Value                                         float64
dtype: object

In [None]:
# Profiling report: Overview --> Dataset statistics --> Duplicate rows
# No two rows are identical across all columns, i.e. there are no fully duplicated rows.

In [25]:
# Uniqueness check: count the distinct (first_name, last_name, job_title)
# combinations. If the row count of the result equals the number of rows in
# the dataset, every customer is unique.
identity_columns = ['first_name', 'last_name', 'job_title']
df.loc[:, identity_columns].drop_duplicates().shape

(1000, 3)

In [5]:
# List the distinct values of the country column; a single value
# ('Australia') confirms that all records are for Australia only.
df['country'].unique()

array(['Australia'], dtype=object)

In [6]:
# Plausibility check: an address passes when its first character is numeric,
# i.e. it starts with a street number. The set collects the distinct outcomes,
# so {True} means every address passed.
# The isinstance guard and the a[:1] slice make a missing (NaN / non-string)
# or empty address count as False instead of raising TypeError / IndexError,
# which indexing x[0] directly would do.
set(df['address'].map(lambda a: isinstance(a, str) and a[:1].isnumeric()))

{True}

In [7]:
# Order the customers from earliest to latest date of birth so the range of
# DOB values can be inspected; rows with a missing DOB are placed last.
# Observation: the data seems accurate as far as DOB is concerned.
df.sort_values(by='DOB', ascending=True, na_position='last')

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value
797,Anthony,Morison,Male,7,1938-06-08,General Manager,Health,Mass Customer,N,No,12,276 Derek Circle,2759,NSW,Australia,9,797,0.600000
885,Zachariah,Meininking,Male,40,1938-06-09,Cost Accountant,Financial Services,Affluent Customer,N,Yes,20,24815 Lindbergh Avenue,2749,NSW,Australia,7,883,0.531250
769,Andrea,Pendle,Female,86,1938-08-05,,,High Net Worth,N,Yes,13,31281 Meadow Valley Way,4500,QLD,Australia,6,760,0.637500
585,Letizia,Poore,Female,27,1938-08-30,Web Developer II,Manufacturing,Affluent Customer,N,No,15,95796 Mcbride Drive,3677,VIC,Australia,3,583,0.787500
265,Alleen,Eaken,Female,56,1938-08-31,,,Mass Customer,N,No,10,343 Lakewood Center,2089,NSW,Australia,12,259,1.062500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,Dmitri,Viant,U,62,NaT,Paralegal,Financial Services,Affluent Customer,N,No,5,95960 Warner Parkway,3842,VIC,Australia,1,774,0.626875
835,Porty,Hansed,U,88,NaT,General Manager,IT,Mass Customer,N,No,13,768 Southridge Drive,2112,NSW,Australia,11,832,0.575000
883,Shara,Bramhill,U,24,NaT,,IT,Affluent Customer,N,No,2,01 Bunker Hill Drive,2230,NSW,Australia,10,883,0.531250
904,Roth,Crum,U,0,NaT,Legal Assistant,IT,Mass Customer,N,No,2,276 Anthes Court,2450,NSW,Australia,6,904,0.500000


In [8]:
# Share of each property_valuation value within every wealth segment.
#
# Observations:
#   * valuation 12: Affluent Customers have a lower share than Mass Customers.
#   * valuation 1:  Affluent Customers have a higher share than Mass Customers.
#
# This seems to be incorrect.
for segment in df['wealth_segment'].unique():
    in_segment = df['wealth_segment'] == segment
    valuation_counts = df.loc[in_segment, 'property_valuation'].value_counts()
    print(segment)
    # Convert raw counts into fractions of the segment total.
    print(valuation_counts / valuation_counts.sum())
    print('*' * 50)

Mass Customer
9     0.185039
8     0.163386
7     0.141732
10    0.116142
5     0.066929
6     0.059055
11    0.057087
4     0.057087
12    0.043307
2     0.043307
3     0.039370
1     0.027559
Name: property_valuation, dtype: float64
**************************************************
Affluent Customer
9     0.182573
8     0.165975
7     0.120332
10    0.107884
6     0.070539
3     0.070539
11    0.058091
5     0.058091
2     0.058091
4     0.037344
1     0.037344
12    0.033195
Name: property_valuation, dtype: float64
**************************************************
High Net Worth
8     0.155378
7     0.147410
9     0.139442
10    0.123506
6     0.091633
11    0.075697
12    0.063745
4     0.059761
3     0.055777
5     0.035857
1     0.027888
2     0.023904
Name: property_valuation, dtype: float64
**************************************************


In [9]:
# Share of car owners vs. non-owners within every wealth segment.
# Observation: roughly half of the Affluent and High Net Worth customers do
# not own a car, which seems strange.
for segment in df['wealth_segment'].unique():
    in_segment = df['wealth_segment'] == segment
    ownership_counts = df.loc[in_segment, 'owns_car'].value_counts()
    print(segment)
    # Convert raw counts into fractions of the segment total.
    print(ownership_counts / ownership_counts.sum())
    print('*' * 50)

Mass Customer
Yes    0.5
No     0.5
Name: owns_car, dtype: float64
**************************************************
Affluent Customer
Yes    0.518672
No     0.481328
Name: owns_car, dtype: float64
**************************************************
High Net Worth
No     0.545817
Yes    0.454183
Name: owns_car, dtype: float64
**************************************************


In [14]:
# Distinct state codes present in the customer list.
df['state'].unique()

array(['QLD', 'NSW', 'VIC'], dtype=object)

In [20]:
# Build reference data of valid postcodes per state by scraping one postcode
# page for each state code found in the customer data. The results are kept in
# two parallel lists of equal length: post_code_list[k] belongs to state_list[k].
post_code_list = []
state_list = []
for state_code in df['state'].unique():
    base_site = 'https://www.matthewproctor.com/full_australian_postcodes_' + state_code.lower()
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops an HTTP error page from being parsed as data.
    r = requests.get(base_site, timeout=30)
    r.raise_for_status()
    # The first HTML table on the page is used; its first row holds the column names.
    table = pd.read_html(r.text, header=0)[0]
    post_code = list(table['Postcode'].unique())
    print(state_code, len(post_code))
    post_code_list += post_code
    # Repeat the state code once per postcode so both lists stay aligned.
    state_list += [state_code.upper()] * len(post_code)

QLD 462
NSW 924
VIC 746


In [21]:
# Combine the two parallel lists into a reference table with one
# (postcode, state) pair per row.
postcode_state_pairs = list(zip(post_code_list, state_list))
verification_table = pd.DataFrame(postcode_state_pairs, columns=['postcode', 'state'])
verification_table

Unnamed: 0,postcode,state
0,4000,QLD
1,4001,QLD
2,4002,QLD
3,4003,QLD
4,4004,QLD
...,...,...
2127,8107,VIC
2128,8111,VIC
2129,8120,VIC
2130,8205,VIC


In [22]:
# Distinct (postcode, state) combinations that occur in the customer data.
df3 = df.loc[:, ['postcode', 'state']].drop_duplicates()
df3

Unnamed: 0,postcode,state
0,4500,QLD
1,2113,NSW
2,3505,VIC
3,4814,QLD
4,2093,NSW
...,...,...
985,3277,VIC
993,2422,NSW
994,3079,VIC
997,4702,QLD


In [23]:
# Verify every customer (postcode, state) *pair* against the reference table.
# DataFrame.isin with a dict tests each column independently, so it would
# accept a postcode recorded under the wrong state as long as both values
# appear somewhere in the reference. Comparing the pairs as tuples checks the
# actual combination.
pair_columns = ['postcode', 'state']
valid_pairs = pd.MultiIndex.from_frame(verification_table[pair_columns])
customer_pairs = pd.MultiIndex.from_frame(df3[pair_columns])
# Rows displayed here have a pair that is absent from the reference table;
# an empty result means the state recorded for each postcode is correct.
df3[~customer_pairs.isin(valid_pairs)]

Unnamed: 0,postcode,state
