In [1]:
##Import Dependencies

from path import Path
import pandas as pd
import numpy as np

import re

from sklearn.impute import SimpleImputer

#### To load the data, we create a connection to the PostgreSQL database (managed through pgAdmin).

In [None]:
# Build the connection URL for the Postgres server on 127.0.0.1:5432.
# NOTE(review): `db_password` and `create_engine` are not defined or imported in
# any visible cell (presumably a config module and SQLAlchemy's create_engine) —
# confirm where they come from before running this on a fresh kernel.
db_string =f'postgresql://postgres:{db_password}@127.0.0.1:5432/Project Insights on the Beach'
engine = create_engine(db_string)
# Read the marketing table into a DataFrame.
# Original note: customer_Id was dropped because it does not bring value to the
# analysis. The query below selects every column, so that drop is presumably
# done when the table was built — TODO confirm.
df= pd.read_sql_query('''SELECT*FROM cust_marketing_table;''',engine)
# Show the loaded frame as the cell output.
df

In [2]:
# Load the raw travel dataset from its CSV file.
data = Path('../Resources/Travel.csv')
vacay_df = pd.read_csv(filepath_or_buffer=data)
# Preview the first five rows to confirm the columns parsed as expected.
vacay_df.head(n=5)

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [3]:
# Non-null value count for every column; a count below the row total
# means that column contains missing values.
vacay_df.count(axis=0)

CustomerID                  4888
ProdTaken                   4888
Age                         4662
TypeofContact               4863
CityTier                    4888
DurationOfPitch             4637
Occupation                  4888
Gender                      4888
NumberOfPersonVisiting      4888
NumberOfFollowups           4843
ProductPitched              4888
PreferredPropertyStar       4862
MaritalStatus               4888
NumberOfTrips               4748
Passport                    4888
PitchSatisfactionScore      4888
OwnCar                      4888
NumberOfChildrenVisiting    4822
Designation                 4888
MonthlyIncome               4655
dtype: int64

In [74]:
# Count of distinct (non-null) values in each column.
vacay_unique = vacay_df.nunique(axis=0, dropna=True)
vacay_unique

CustomerID                  4888
ProdTaken                      2
Age                           44
TypeofContact                  2
CityTier                       3
DurationOfPitch               34
Occupation                     4
Gender                         3
NumberOfPersonVisiting         5
NumberOfFollowups              6
ProductPitched                 5
PreferredPropertyStar          3
MaritalStatus                  4
NumberOfTrips                 12
Passport                       2
PitchSatisfactionScore         5
OwnCar                         2
NumberOfChildrenVisiting       4
Designation                    5
MonthlyIncome               2475
dtype: int64

In [75]:
# Per-column tally of missing (null) values.
vacay_null = vacay_df.isna().sum()
vacay_null

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [76]:
# Place the three per-column summaries (non-null counts, distinct values,
# null counts) side by side; the dict keys become the column headers.
eda_parts = {
    "ValueCounts": vacay_df.count(),
    "UniqueValues": vacay_unique,
    "Nulls": vacay_null,
}
vacay_eda_merged = pd.concat(eda_parts, axis=1, join="inner")
vacay_eda_merged


Unnamed: 0,ValueCounts,UniqueValues,Nulls
CustomerID,4888,4888,0
ProdTaken,4888,2,0
Age,4662,44,226
TypeofContact,4863,2,25
CityTier,4888,3,0
DurationOfPitch,4637,34,251
Occupation,4888,4,0
Gender,4888,3,0
NumberOfPersonVisiting,4888,5,0
NumberOfFollowups,4843,6,45


In [77]:
# Row count for each distinct Gender label (surfaces inconsistent spellings).
vacay_df.groupby(by='Gender').size()

Gender
Fe Male     155
Female     1817
Male       2916
dtype: int64

In [78]:
# Data cleaning: normalise the misspelled 'Fe Male' label to 'Female'.
# The replacement is applied to the Gender column only, so an identical
# string in any other column can never be rewritten by accident.
vacay_df = vacay_df.assign(Gender=vacay_df['Gender'].replace("Fe Male", "Female"))
# Re-check the label distribution after cleaning.
vacay_df.groupby('Gender').size()

Gender
Female    1972
Male      2916
dtype: int64

In [79]:
# Row count for each distinct MaritalStatus label.
vacay_df.groupby(by='MaritalStatus').size()

MaritalStatus
Divorced      950
Married      2340
Single        916
Unmarried     682
dtype: int64

In [80]:
# Collapse 'Divorced' and 'Unmarried' into 'Single'.
# The mapping is applied to the MaritalStatus column only, so the same
# strings appearing in any other column are left untouched.
marital_map = {"Divorced": "Single", "Unmarried": "Single"}
vacay_df = vacay_df.assign(MaritalStatus=vacay_df['MaritalStatus'].replace(marital_map))
# Check results: only 'Married' and 'Single' should remain.
vacay_df.groupby('MaritalStatus').size()

MaritalStatus
Married    2340
Single     2548
dtype: int64

In [81]:
# Drop the column believed to be unimportant to the analysis.
# A new frame is returned (no inplace mutation) and errors='ignore' makes the
# cell safe to re-run: a second execution is a no-op instead of a KeyError.
vacay_df = vacay_df.drop(columns='OwnCar', errors='ignore')
vacay_df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Single,2.0,0,3,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Single,2.0,1,5,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Single,1.0,0,5,0.0,Executive,18468.0


In [82]:
# Frequency of each Age value, to sanity-check the range.
vacay_df.groupby(by='Age').size()

Age
18.0     14
19.0     32
20.0     38
21.0     41
22.0     46
23.0     46
24.0     56
25.0     74
26.0    106
27.0    138
28.0    147
29.0    178
30.0    199
31.0    203
32.0    197
33.0    189
34.0    211
35.0    237
36.0    231
37.0    185
38.0    176
39.0    150
40.0    146
41.0    155
42.0    142
43.0    130
44.0    105
45.0    116
46.0    121
47.0     88
48.0     65
49.0     65
50.0     86
51.0     90
52.0     68
53.0     66
54.0     61
55.0     64
56.0     58
57.0     29
58.0     31
59.0     44
60.0     29
61.0      9
dtype: int64

In [83]:
# Frequency of each CityTier value, to sanity-check the categories.
vacay_df.groupby(by='CityTier').size()

CityTier
1    3190
2     198
3    1500
dtype: int64

In [84]:
# Frequency of each DurationOfPitch value, to sanity-check the range
# and spot outliers.
vacay_df.groupby(by='DurationOfPitch').size()

DurationOfPitch
5.0        6
6.0      307
7.0      342
8.0      333
9.0      483
10.0     244
11.0     205
12.0     195
13.0     223
14.0     253
15.0     269
16.0     274
17.0     172
18.0      75
19.0      57
20.0      65
21.0      73
22.0      89
23.0      79
24.0      70
25.0      73
26.0      72
27.0      72
28.0      61
29.0      74
30.0      95
31.0      83
32.0      74
33.0      57
34.0      50
35.0      66
36.0      44
126.0      1
127.0      1
dtype: int64

In [85]:
# Frequency of each Occupation label, to sanity-check the categories.
vacay_df.groupby(by='Occupation').size()

Occupation
Free Lancer          2
Large Business     434
Salaried          2368
Small Business    2084
dtype: int64

In [86]:
# Frequency of each NumberOfPersonVisiting value, to sanity-check the range.
vacay_df.groupby(by='NumberOfPersonVisiting').size()

NumberOfPersonVisiting
1      39
2    1418
3    2402
4    1026
5       3
dtype: int64

In [87]:
# Frequency of each NumberOfFollowups value, to sanity-check the range.
vacay_df.groupby(by='NumberOfFollowups').size()

NumberOfFollowups
1.0     176
2.0     229
3.0    1466
4.0    2068
5.0     768
6.0     136
dtype: int64

In [88]:
# Frequency of each ProductPitched label, to sanity-check the categories.
vacay_df.groupby(by='ProductPitched').size()

ProductPitched
Basic           1842
Deluxe          1732
King             230
Standard         742
Super Deluxe     342
dtype: int64

In [89]:
# Frequency of each PreferredPropertyStar value, to sanity-check the range.
vacay_df.groupby(by='PreferredPropertyStar').size()

PreferredPropertyStar
3.0    2993
4.0     913
5.0     956
dtype: int64

In [90]:
# Frequency of each NumberOfTrips value, to sanity-check the range
# and spot outliers.
vacay_df.groupby(by='NumberOfTrips').size()

NumberOfTrips
1.0      620
2.0     1464
3.0     1079
4.0      478
5.0      458
6.0      322
7.0      218
8.0      105
19.0       1
20.0       1
21.0       1
22.0       1
dtype: int64

In [91]:
# Frequency of each Passport flag value, to sanity-check the encoding.
vacay_df.groupby(by='Passport').size()

Passport
0    3466
1    1422
dtype: int64

In [92]:
# Frequency of each PitchSatisfactionScore value, to sanity-check the scale.
vacay_df.groupby(by='PitchSatisfactionScore').size()

PitchSatisfactionScore
1     942
2     586
3    1478
4     912
5     970
dtype: int64

In [93]:
# Frequency of each NumberOfChildrenVisiting value, to sanity-check the range.
vacay_df.groupby(by='NumberOfChildrenVisiting').size()

NumberOfChildrenVisiting
0.0    1082
1.0    2080
2.0    1335
3.0     325
dtype: int64

In [94]:
# Frequency of each Designation label, to sanity-check the categories.
vacay_df.groupby(by='Designation').size()

Designation
AVP                342
Executive         1842
Manager           1732
Senior Manager     742
VP                 230
dtype: int64

In [95]:
# Frequency of each MonthlyIncome value, to sanity-check the range
# and spot outliers at either end.
vacay_df.groupby(by='MonthlyIncome').size()

MonthlyIncome
1000.0     1
4678.0     1
16009.0    2
16051.0    2
16052.0    2
          ..
38621.0    2
38651.0    2
38677.0    2
95000.0    1
98678.0    1
Length: 2475, dtype: int64

In [96]:
# Column means, used to evaluate replacing nulls with the mean.
# numeric_only=True restricts the reduction to numeric columns: the frame also
# holds text columns (e.g. Gender, Occupation), for which older pandas emits a
# FutureWarning and pandas >= 2.0 raises TypeError when numeric_only is omitted.
vacay_df.mean(numeric_only=True)

  


CustomerID                  202443.500000
ProdTaken                        0.188216
Age                             37.622265
CityTier                         1.654255
DurationOfPitch                 15.490835
NumberOfPersonVisiting           2.905074
NumberOfFollowups                3.708445
PreferredPropertyStar            3.581037
NumberOfTrips                    3.236521
Passport                         0.290917
PitchSatisfactionScore           3.078151
NumberOfChildrenVisiting         1.187267
MonthlyIncome                23619.853491
dtype: float64

In [97]:
# Rebuild the per-column EDA summary from the current state of vacay_df.
# Distinct values per column.
vacay_unique = vacay_df.nunique(axis=0)
# Missing values per column.
vacay_null = vacay_df.isna().sum()
# Non-null counts, distinct values and null counts side by side;
# the dict keys become the column headers.
summary_columns = {
    "ValueCounts": vacay_df.count(),
    "UniqueValues": vacay_unique,
    "Nulls": vacay_null,
}
vacay_eda_merged = pd.concat(summary_columns, axis=1, join="inner")
vacay_eda_merged

Unnamed: 0,ValueCounts,UniqueValues,Nulls
CustomerID,4888,4888,0
ProdTaken,4888,2,0
Age,4662,44,226
TypeofContact,4863,2,25
CityTier,4888,3,0
DurationOfPitch,4637,34,251
Occupation,4888,4,0
Gender,4888,2,0
NumberOfPersonVisiting,4888,5,0
NumberOfFollowups,4843,6,45


In [98]:
# Keep only the rows that have no missing value in any column.
vacay_drop_null = vacay_df.dropna(axis=0, how='any')

In [99]:
# Non-null count per column after dropping incomplete rows
# (every column should now report the same number).
vacay_drop_null.count(axis=0)

CustomerID                  4128
ProdTaken                   4128
Age                         4128
TypeofContact               4128
CityTier                    4128
DurationOfPitch             4128
Occupation                  4128
Gender                      4128
NumberOfPersonVisiting      4128
NumberOfFollowups           4128
ProductPitched              4128
PreferredPropertyStar       4128
MaritalStatus               4128
NumberOfTrips               4128
Passport                    4128
PitchSatisfactionScore      4128
NumberOfChildrenVisiting    4128
Designation                 4128
MonthlyIncome               4128
dtype: int64

In [100]:
# Column means computed on complete rows only, for comparison with the
# means of the full frame.
# numeric_only=True restricts the reduction to numeric columns: without it the
# text columns trigger a FutureWarning on older pandas and a TypeError on
# pandas >= 2.0.
vacay_mean = vacay_drop_null.mean(numeric_only=True)
vacay_mean

  """Entry point for launching an IPython kernel.


CustomerID                  202527.763808
ProdTaken                        0.193072
Age                             37.231831
CityTier                         1.663275
DurationOfPitch                 15.584787
NumberOfPersonVisiting           2.949370
NumberOfFollowups                3.741521
PreferredPropertyStar            3.578488
NumberOfTrips                    3.295300
Passport                         0.295300
PitchSatisfactionScore           3.060804
NumberOfChildrenVisiting         1.223595
MonthlyIncome                23178.464147
dtype: float64

In [107]:
# Numeric columns whose missing entries are filled with the column mean.
mean_imputed_cols = [
    'Age',
    'DurationOfPitch',
    'NumberOfFollowups',
    'PreferredPropertyStar',
    'NumberOfTrips',
    'NumberOfChildrenVisiting',
    'MonthlyIncome',
]
# Learn each column's mean, then write the imputed values back into the frame.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(vacay_df[mean_imputed_cols])
vacay_df[mean_imputed_cols] = imputer.transform(vacay_df[mean_imputed_cols])
vacay_df

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.000000,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,0.0,Manager,20993.0
1,200001,0,49.000000,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Single,2.0,0,3,2.0,Manager,20130.0
2,200002,1,37.000000,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0.0,Executive,17090.0
3,200003,0,33.000000,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Single,2.0,1,5,1.0,Executive,17909.0
4,200004,0,37.622265,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Single,1.0,0,5,0.0,Executive,18468.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4883,204883,1,49.000000,Self Enquiry,3,9.0,Small Business,Male,3,5.0,Deluxe,4.0,Single,2.0,1,1,1.0,Manager,26576.0
4884,204884,1,28.000000,Company Invited,1,31.0,Salaried,Male,4,5.0,Basic,3.0,Single,3.0,1,3,2.0,Executive,21212.0
4885,204885,1,52.000000,Self Enquiry,3,17.0,Salaried,Female,4,4.0,Standard,4.0,Married,7.0,0,1,3.0,Senior Manager,31820.0
4886,204886,1,19.000000,Self Enquiry,3,16.0,Small Business,Male,3,4.0,Basic,3.0,Single,3.0,0,5,2.0,Executive,20289.0


In [109]:
# Non-null count per column after mean imputation, to see which
# columns still contain missing values.
vacay_df.count(axis=0)

CustomerID                  4888
ProdTaken                   4888
Age                         4888
TypeofContact               4863
CityTier                    4888
DurationOfPitch             4888
Occupation                  4888
Gender                      4888
NumberOfPersonVisiting      4888
NumberOfFollowups           4888
ProductPitched              4888
PreferredPropertyStar       4888
MaritalStatus               4888
NumberOfTrips               4888
Passport                    4888
PitchSatisfactionScore      4888
NumberOfChildrenVisiting    4888
Designation                 4888
MonthlyIncome               4888
dtype: int64

In [110]:
# Fill the missing TypeofContact entries with the column's most frequent value.
contact_cols = ['TypeofContact']
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(vacay_df[contact_cols])
vacay_df[contact_cols] = imputer.transform(vacay_df[contact_cols])
vacay_df

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.000000,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,0.0,Manager,20993.0
1,200001,0,49.000000,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Single,2.0,0,3,2.0,Manager,20130.0
2,200002,1,37.000000,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0.0,Executive,17090.0
3,200003,0,33.000000,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Single,2.0,1,5,1.0,Executive,17909.0
4,200004,0,37.622265,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Single,1.0,0,5,0.0,Executive,18468.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4883,204883,1,49.000000,Self Enquiry,3,9.0,Small Business,Male,3,5.0,Deluxe,4.0,Single,2.0,1,1,1.0,Manager,26576.0
4884,204884,1,28.000000,Company Invited,1,31.0,Salaried,Male,4,5.0,Basic,3.0,Single,3.0,1,3,2.0,Executive,21212.0
4885,204885,1,52.000000,Self Enquiry,3,17.0,Salaried,Female,4,4.0,Standard,4.0,Married,7.0,0,1,3.0,Senior Manager,31820.0
4886,204886,1,19.000000,Self Enquiry,3,16.0,Small Business,Male,3,4.0,Basic,3.0,Single,3.0,0,5,2.0,Executive,20289.0


In [111]:
# Persist the cleaned, imputed dataset; index=False leaves the row index
# out of the written file.
vacay_df.to_csv(path_or_buf='../Resources/TravelCleanedSkiLearnImputer.csv', index=False)