# Introduction
In this section, the main goal is to explore the data and clean it so that it is ready for the rest of the work.

# Imports

In [None]:
# Standard library
import os
import sys

# Third-party
import pandas as pd

# Make the project's src/ directory importable; the relative path is resolved
# against the current working directory.
src_dir = os.path.abspath('../src')
sys.path.append(src_dir)

# Project-local helpers for reading and writing datasets
from data_loader import load_data, save_data

In [None]:
# Load the churn dataset through the project's data_loader helper.
# NOTE(review): "Raw" and False are positional arguments whose meaning is defined
# in data_loader and is not visible here — confirm against that module.
data = load_data("Churn_Modelling.csv", "Raw", False)

# Data cleaning

In [None]:
# Show each column's dtype and non-null count to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [None]:
# Drop the columns treated as irrelevant to the analysis (row counter, customer id, surname).
# errors="ignore" keeps this cell re-runnable: without it, running the cell a second
# time raises KeyError because the columns have already been removed.
data = data.drop(columns=["RowNumber", "CustomerId", "Surname"], errors="ignore")

In [None]:
# Preview the first two rows to check the structure after the column drop.
data.head(2)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


# Adding age range column

In [None]:
# Summary statistics of the numeric columns; used here to read off the
# minimum and maximum of Age before choosing the age bins.
data.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [None]:
# Edges of the age buckets, in years.
bins = [18, 24, 30, 40, 50, 60, 70, 80, 90, 100]
# One "<lower>-<upper>" label for every pair of consecutive edges.
labels = [f"{lower}-{upper}" for lower, upper in zip(bins[:-1], bins[1:])]

# right=False makes every bucket include its lower edge and exclude its upper edge,
# so e.g. an age of 24 lands in '24-30', not '18-24'.
data['Age_range'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)

In [None]:
# Preview the first two rows to confirm the Age_range column was added.
data.head(2)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Age_range
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1,40-50
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,40-50


# Encode gender values

In [None]:
# Encode gender as an integer code: Male -> 0, Female -> 1.
gender_codes = {'Male': 0, 'Female': 1}

# Series.map turns every value that is missing from the mapping into NaN, so running
# this cell a second time (when the column already holds 0/1) would wipe the column.
# Only map while the column is not yet fully encoded.
if not data['Gender'].isin(list(gender_codes.values())).all():
    gender_encoded = data['Gender'].map(gender_codes)
    # Fail loudly on labels the mapping does not cover instead of silently writing NaN;
    # values that were already missing are passed through unchanged.
    unknown = data.loc[gender_encoded.isna() & data['Gender'].notna(), 'Gender'].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unexpected Gender values: {list(unknown)}")
    data['Gender'] = gender_encoded

In [None]:
# Preview the first two rows to confirm the Gender column now holds numeric codes.
data.head(2)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Age_range
0,619,France,1,42,2,0.0,1,1,1,101348.88,1,40-50
1,608,Spain,1,41,1,83807.86,1,0,1,112542.58,0,40-50


# Save the new data frame

In [22]:
# Persist the cleaned frame through the project's data_loader helper.
# NOTE(review): the meaning of the third positional argument (False) is defined
# in data_loader and is not visible here — confirm against that module.
save_data(data, "processed.csv", False)