# Data cleaning and merging

- In this notebook, we will clean the data and merge the data from the different datasets. 

## Client Profiles (client_profiles.csv)

In [1]:
#import libraries to clean data and do the merging

import pandas as pd
import numpy as np

#read the data

client_profiles = "/Users/alexandreribeiro/Desktop/Ironhacks Booty/5th week/Project/Datasets/df_final_demo.csv"
digital_footprints_1 = "/Users/alexandreribeiro/Desktop/Ironhacks Booty/5th week/Project/Datasets/df_final_web_data_pt_1.csv"
digital_footprints_2 = "/Users/alexandreribeiro/Desktop/Ironhacks Booty/5th week/Project/Datasets/df_final_web_data_pt_2.csv"
experiment_roster = "/Users/alexandreribeiro/Desktop/Ironhacks Booty/5th week/Project/Datasets/df_final_experiment_clients.csv"

#read the data

df_client_profiles = pd.read_csv(client_profiles)

df_client_profiles.sample(5)


Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
46166,5775881,10.0,124.0,66.0,F,2.0,109408.77,0.0,4.0
19457,9477401,15.0,191.0,45.5,M,2.0,71620.07,1.0,4.0
49866,8164889,19.0,238.0,34.5,F,2.0,73464.11,3.0,6.0
69370,9024342,6.0,77.0,22.0,U,2.0,17365.74,1.0,1.0
932,8479157,9.0,118.0,63.5,M,3.0,24800.83,6.0,9.0


In [6]:
df_client_profiles['clnt_tenure_yr'] = df_client_profiles['clnt_tenure_yr'].astype(int)
df_client_profiles['clnt_tenure_mnth'] = df_client_profiles['clnt_tenure_mnth'].astype(int)
df_client_profiles['clnt_age'] = df_client_profiles['clnt_age'].astype(int)
df_client_profiles['calls_6_mnth'] = df_client_profiles['calls_6_mnth'].astype(int)
df_client_profiles['logons_6_mnth'] = df_client_profiles['logons_6_mnth'].astype(int)
df_client_profiles['num_accts'] = df_client_profiles['num_accts'].astype(int)

df_client_profiles.sample(5)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
49789,7766714,4,58,61,M,2,628460.61,6,9
197,8676893,9,116,60,F,2,88968.26,1,5
46652,2105237,4,58,24,U,2,44361.61,1,4
64253,466390,8,107,43,F,2,37738.06,1,1
57895,5455641,7,95,48,U,2,72661.49,6,6


In [12]:
# handling different values on gendr

df_client_profiles.gendr.value_counts()

# includint the X values in U

df_client_profiles['gendr'] = df_client_profiles['gendr'].replace('X', 'U')

df_client_profiles.gendr.value_counts()

gendr
U    24125
M    23724
F    22746
Name: count, dtype: int64

In [13]:
df_client_profiles.shape

(70609, 9)

In [2]:
df_client_profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70609 entries, 0 to 70608
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   client_id         70609 non-null  int64  
 1   clnt_tenure_yr    70595 non-null  float64
 2   clnt_tenure_mnth  70595 non-null  float64
 3   clnt_age          70594 non-null  float64
 4   gendr             70595 non-null  object 
 5   num_accts         70595 non-null  float64
 6   bal               70595 non-null  float64
 7   calls_6_mnth      70595 non-null  float64
 8   logons_6_mnth     70595 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 4.8+ MB


In [5]:
# checking for missing values

df_client_profiles.isnull().sum()

client_id           0
clnt_tenure_yr      0
clnt_tenure_mnth    0
clnt_age            0
gendr               0
num_accts           0
bal                 0
calls_6_mnth        0
logons_6_mnth       0
dtype: int64

In [4]:
# drop the missing values

df_client_profiles = df_client_profiles.dropna()

df_client_profiles.isnull().sum()

client_id           0
clnt_tenure_yr      0
clnt_tenure_mnth    0
clnt_age            0
gendr               0
num_accts           0
bal                 0
calls_6_mnth        0
logons_6_mnth       0
dtype: int64

In [9]:
# checking for duplicates   

duplicates = df_client_profiles.duplicated()

duplicates.sum()

0

### Merging the digital_footprints_1 and digital_footprints_2 datasets before cleaning

In [7]:
# Load the datasets

df_digital_footprint1 = pd.read_csv(digital_footprints_1)
df_digital_footprint2 = pd.read_csv(digital_footprints_2)

# Convert date_time columns to datetime format

df_digital_footprint1['date_time'] = pd.to_datetime(df_digital_footprint1['date_time'])
df_digital_footprint2['date_time'] = pd.to_datetime(df_digital_footprint2['date_time'])

# Merge the datasets

df_digital_footprint = pd.concat([df_digital_footprint1, df_digital_footprint2])

# Display a sample the merged dataset

df_digital_footprint.sample(5)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
43115,2465411,159864630_7968564641,108733913_69527591003_427656,step_1,2017-04-18 00:03:57
118844,7813768,837016382_9283639675,752187745_27373231581_812601,start,2017-05-17 20:50:35
367541,7652267,156762456_89862542886,504214978_85623911124_656377,step_3,2017-05-24 13:43:54
146750,124118,377371820_20980760755,849880564_65740980596_760185,confirm,2017-06-12 01:24:00
284231,9063578,124600075_48237089912,538616726_63355498517_437012,confirm,2017-03-29 19:39:21


In [8]:
# checking for missing values

df_digital_footprint.isnull().sum()

client_id       0
visitor_id      0
visit_id        0
process_step    0
date_time       0
dtype: int64

In [9]:
df_digital_footprint.shape

(755405, 5)

In [10]:
# duplicates

duplicates = df_digital_footprint.duplicated()

duplicates.sum()

10764

In [11]:
#show duplicates for the same client_id

df_digital_footprint[df_digital_footprint.duplicated(subset=['client_id', 'date_time'], keep=False)].sort_values(by=['client_id', 'date_time'])

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
288214,1531,934069404_34543643308,110350144_67985219545_904677,start,2017-06-02 22:32:08
288215,1531,934069404_34543643308,110350144_67985219545_904677,start,2017-06-02 22:32:08
288212,1531,934069404_34543643308,110350144_67985219545_904677,start,2017-06-02 22:32:28
288213,1531,934069404_34543643308,110350144_67985219545_904677,start,2017-06-02 22:32:28
296388,2078,585735301_21309149782,900598259_99565669243_552843,start,2017-05-06 08:29:00
...,...,...,...,...,...
119532,9998346,292425655_16607136645,189177304_69869411700_783154,step_3,2017-03-29 15:37:28
119533,9998346,292425655_16607136645,189177304_69869411700_783154,confirm,2017-03-29 15:37:28
119534,9998346,292425655_16607136645,189177304_69869411700_783154,step_3,2017-03-29 15:37:28
63673,9998447,747535871_38029188908,309540451_47388672022_606106,start,2017-06-03 19:17:15


In [12]:
# checking duplicates when all columns are equal

duplicates = df_digital_footprint.duplicated()

duplicates.sum()

10764

In [13]:
# Since we are dealing with genuine duplicates, we will drop them  

df_digital_footprint = df_digital_footprint.drop_duplicates()

df_digital_footprint.shape



(744641, 5)

### Experiment Roster (final_experiment_clients.csv)

In [28]:
#load the experiment roster

df_experiment_roster = pd.read_csv(experiment_roster)

df_experiment_roster.sample(5)

Unnamed: 0,client_id,Variation
32482,5296140,Control
48547,3928076,Control
38496,1936892,Test
9523,819230,Control
7013,5754482,Test


In [39]:
df_experiment_roster.shape

(50500, 2)

In [30]:
# checking for missing values

df_experiment_roster.isnull().sum()

client_id        0
Variation    20109
dtype: int64

In [32]:
# checking values for the column variation

df_experiment_roster.Variation.value_counts()

Variation
Test       26968
Control    23532
Name: count, dtype: int64

In [33]:
# checking for duplicates

duplicates = df_experiment_roster.duplicated()

duplicates.sum()

0

In [35]:
# checking if we have repeated client_ids

df_experiment_roster['client_id'].duplicated().sum()



0

In [38]:
# Dropping missing values

df_experiment_roster = df_experiment_roster.dropna()

df_experiment_roster.isnull().sum()


client_id    0
Variation    0
dtype: int64

## Merging the 3 datasets (client_profiles.csv, digital_footprints.csv, final_experiment_clients.csv)

In [None]:
# Merging the 3 datasets

df_client_profiles

df_digital_footprint

df_experiment_roster


In [40]:
# Merge df_experiment_clients_cleaned with df_demo_cleaned

df_merged = pd.merge(df_client_profiles, df_experiment_roster, on='client_id', how='inner')

In [42]:
# Merge the result with df_web_data_cleaned

df_merged_final = pd.merge(df_merged, df_digital_footprint, on='client_id', how='inner')

df_merged_final.sample(5)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation,visitor_id,visit_id,process_step,date_time
52967,3707510,5,64,27,U,3,66145.68,4,7,Test,557922452_4802234317,549102425_70713811638_244044,step_1,2017-03-31 23:06:23
165904,753431,4,58,35,U,2,46636.92,6,9,Test,266826656_69372724338,971623695_49167919982_71153,confirm,2017-04-12 12:02:16
226057,682400,12,151,55,F,2,38175.0,5,8,Test,111573208_2851624347,571932054_36809436211_992026,confirm,2017-05-28 18:02:22
216807,3833380,15,183,52,F,2,23936.78,2,5,Control,306186616_27853449480,399848468_48378719228_7879,start,2017-05-15 16:36:51
257083,6666495,9,112,30,M,2,78663.0,6,9,Test,752624787_70761041208,805312647_58120112371_367424,step_1,2017-04-05 11:16:31


In [48]:
df_merged_final.shape

(317123, 14)

In [49]:
# export the final dataset

df_merged_final.to_csv("/Users/alexandreribeiro/Desktop/Ironhacks Booty/5th week/Project/Datasets/df_merged_final.csv", index=False)
