# In this notebook we will perform exploratory data analysis on our data (training and validation CSVs)
* Data Preprocessing (Loading Data using Pandas, Handle Missing Values, Encode Categorical Vars, Normalization/Scaling Features)

In [87]:
import pandas as pd

In [88]:
# Read the raw, comma-separated training CSV and preview its first rows.
raw_training_path = '../data/raw/training.csv'
train_data = pd.read_csv(raw_training_path, sep=',')
train_data.head()

Unnamed: 0,CUSTOMER_ID,COLLEGE,DATA,INCOME,OVERCHARGE,LEFTOVER,HOUSE,LESSTHAN600k,CHILD,JOB_CLASS,REVENUE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,TIME_CLIENT,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION,REPORTED_USAGE_LEVEL,CONSIDERING_CHANGE_OF_PLAN,CHURNED
0,C100000,zero,660.0,19995.0,0,0,897338.0,False,4,3,160.0,155,1,1.2,15,very_unsat,little,considering,STAY
1,C100001,one,317.647059,31477.0,155,15,393396.0,True,0,1,100.0,245,27,2.7,4,unsat,little,considering,LEAVE
2,C100006,zero,208.695652,66742.0,0,13,937197.0,False,4,2,127.0,493,20,2.6,4,avg,very_little,considering,STAY
3,C100008,zero,265.017668,40864.0,183,0,986430.0,False,3,3,86.0,390,13,2.5,12,unsat,very_high,considering,LEAVE
4,C100010,one,440.0,43321.5,200,0,394622.0,True,2,3,77.0,175,18,2.4,10,very_unsat,little,actively_looking_into_it,LEAVE


## Reviewing the data shape and columns types

In [89]:
# Report the frame's shape followed by a blank line, then the dtype of every column.
print("the shape of the train data is : ", train_data.shape, end="\n\n")
print(train_data.dtypes)

the shape of the train data is :  (11981, 19)

CUSTOMER_ID                     object
COLLEGE                         object
DATA                           float64
INCOME                         float64
OVERCHARGE                       int64
LEFTOVER                         int64
HOUSE                          float64
LESSTHAN600k                    object
CHILD                            int64
JOB_CLASS                        int64
REVENUE                        float64
HANDSET_PRICE                    int64
OVER_15MINS_CALLS_PER_MONTH      int64
TIME_CLIENT                    float64
AVERAGE_CALL_DURATION            int64
REPORTED_SATISFACTION           object
REPORTED_USAGE_LEVEL            object
CONSIDERING_CHANGE_OF_PLAN      object
CHURNED                         object
dtype: object


## Handling Missing Values

In [90]:
# Number of missing entries in each column.
missing_per_column = train_data.isna().sum()
missing_per_column

CUSTOMER_ID                      0
COLLEGE                          0
DATA                             0
INCOME                           0
OVERCHARGE                       0
LEFTOVER                         0
HOUSE                          635
LESSTHAN600k                   635
CHILD                            0
JOB_CLASS                        0
REVENUE                          0
HANDSET_PRICE                    0
OVER_15MINS_CALLS_PER_MONTH      0
TIME_CLIENT                      0
AVERAGE_CALL_DURATION            0
REPORTED_SATISFACTION            0
REPORTED_USAGE_LEVEL             0
CONSIDERING_CHANGE_OF_PLAN       0
CHURNED                          0
dtype: int64

In [91]:
# Checking the distribution and percentage of missing values
house_miss_perc = train_data['HOUSE'].isna().mean() * 100
lt6k_miss_perc = train_data['LESSTHAN600k'].isna().mean() * 100
print(f"Missing values in 'HOUSE': {house_miss_perc:.2f}%")
print(f"Missing values in 'LESSTHAN600k': {lt6k_miss_perc:.2f}%")

Missing values in 'HOUSE': 5.30%
Missing values in 'LESSTHAN600k': 5.30%


After handling the NaNs by dropping the affected rows, we will have a **labeled split of the data into train and test sets of these sizes:** 
* Training Set: Approximately 9,077 rows
* Test Set: Approximately 2,269 rows

In [92]:
# Remove every row where 'HOUSE' or 'LESSTHAN600k' is NaN.
# dropna() without inplace returns a new DataFrame, so train_data keeps all of its
# original rows; a plain `no_nan_data = train_data` would only alias the same object
# and an in-place drop would silently shrink train_data too.
no_nan_data = train_data.dropna(subset=['HOUSE', 'LESSTHAN600k'])

# Confirm that no missing values remain in any column.
no_nan_data.isna().sum()

CUSTOMER_ID                    0
COLLEGE                        0
DATA                           0
INCOME                         0
OVERCHARGE                     0
LEFTOVER                       0
HOUSE                          0
LESSTHAN600k                   0
CHILD                          0
JOB_CLASS                      0
REVENUE                        0
HANDSET_PRICE                  0
OVER_15MINS_CALLS_PER_MONTH    0
TIME_CLIENT                    0
AVERAGE_CALL_DURATION          0
REPORTED_SATISFACTION          0
REPORTED_USAGE_LEVEL           0
CONSIDERING_CHANGE_OF_PLAN     0
CHURNED                        0
dtype: int64

## Getting Desc of the Data

In [93]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
numeric_summary = no_nan_data.describe()
numeric_summary

Unnamed: 0,DATA,INCOME,OVERCHARGE,LEFTOVER,HOUSE,CHILD,JOB_CLASS,REVENUE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,TIME_CLIENT,AVERAGE_CALL_DURATION
count,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0
mean,503.381766,47554.643266,80.228627,23.857218,506521.68165,2.069364,2.509431,155.49718,382.952142,7.52556,3.080407,5.967654
std,457.877611,22301.121475,84.648846,26.72044,256808.319571,1.251756,1.1219,47.873219,210.254384,8.755728,1.651883,4.365952
min,0.0,10032.5,-2.0,0.0,150015.0,0.0,1.0,50.0,130.0,0.0,0.3,1.0
25%,181.442686,29344.0,0.0,0.0,268511.25,1.0,2.0,121.0,216.25,1.0,1.9,2.0
50%,360.564416,44874.0,56.0,15.0,469775.5,2.0,3.0,149.0,322.0,4.0,2.7,5.0
75%,684.563758,65027.625,171.0,41.0,722464.25,3.0,4.0,182.0,513.75,13.0,3.8,9.0
max,6600.0,105355.5,298.0,89.0,999996.0,6.0,4.0,521.0,899.0,29.0,21.0,15.0


## Saving the NaN-free version of the data for EDA before normalizing it

In [94]:
# Write the NaN-free frame to the interim folder, without the index column.
eda_csv_path = '../data/interim/eda_df.csv'
no_nan_data.to_csv(
    eda_csv_path,
    sep=',',
    encoding='utf-8',
    index=False,
    header=True,
)

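Normalizing the data by rescaling (standardizing) the values in these columns:
  
    - DATA
    - INCOME
    - OVERCHARGE
    - LEFTOVER
    - HOUSE
    - REVENUE
    - HANDSET_PRICE
    - OVER_15MINS_CALLS_PER_MONTH
    - TIME_CLIENT
    - AVERAGE_CALL_DURATION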
    

In [79]:
import sklearn as sk
from sklearn.preprocessing import StandardScaler

# Columns that get standardised; CHILD and JOB_CLASS are left on their original scale.
scaled_columns = [
    'DATA', 'INCOME', 'OVERCHARGE', 'LEFTOVER', 'HOUSE', 'REVENUE',
    'HANDSET_PRICE', 'OVER_15MINS_CALLS_PER_MONTH', 'TIME_CLIENT',
    'AVERAGE_CALL_DURATION',
]

scaler = StandardScaler()

# Work on an independent copy: `norm_data = no_nan_data` would only alias the same
# object, so the assignment below would overwrite the unscaled values in no_nan_data
# (the frame saved for EDA) and make this cell non-idempotent on re-run.
norm_data = no_nan_data.copy()
norm_data[scaled_columns] = scaler.fit_transform(norm_data[scaled_columns])

# The scaled columns should now show a mean of ~0 and a std of ~1.
norm_data.describe()

Unnamed: 0,DATA,INCOME,OVERCHARGE,LEFTOVER,HOUSE,CHILD,JOB_CLASS,REVENUE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,TIME_CLIENT,AVERAGE_CALL_DURATION
count,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0,11346.0
mean,1.3777490000000001e-17,-2.6302480000000004e-17,-3.131248e-18,-1.0019990000000001e-17,-4.775153e-17,2.069364,2.509431,-3.91406e-18,1.0019990000000001e-17,-5.323121e-18,-7.436713e-18,-6.262495e-18
std,1.000044,1.000044,1.000044,1.000044,1.000044,1.251756,1.1219,1.000044,1.000044,1.000044,1.000044,1.000044
min,-1.099429,-1.682597,-0.9714515,-0.8928846,-1.388282,0.0,1.0,-2.203776,-1.20313,-0.8595392,-1.683248,-1.137867
25%,-0.7031426,-0.8166157,-0.9478235,-0.8928846,-0.9268427,1.0,2.0,-0.7206263,-0.7928942,-0.7453232,-0.7146141,-0.9088118
50%,-0.3119254,-0.1202075,-0.2862377,-0.3314919,-0.1430943,2.0,3.0,-0.1357224,-0.2899099,-0.4026753,-0.2302971,-0.2216462
75%,0.395717,0.7835369,1.072376,0.6415889,0.8409077,3.0,4.0,0.5536287,0.6221208,0.6252686,0.4356388,0.6945747
max,13.31554,2.591951,2.572758,2.438046,1.921651,6.0,4.0,7.635144,2.454506,2.452724,10.84845,2.068906


## Saving normalized data in interim file 

In [83]:
# Write the standardised frame to the interim folder, without the index column.
normalized_csv_path = '../data/interim/normalized_df.csv'
norm_data.to_csv(
    normalized_csv_path,
    sep=',',
    encoding='utf-8',
    index=False,
    header=True,
)

In [85]:
# Read the saved normalized CSV back from disk and preview its first rows.
normalized_source_path = '../data/interim/normalized_df.csv'
norm_data = pd.read_csv(normalized_source_path, sep=',')
norm_data.head()

Unnamed: 0,CUSTOMER_ID,COLLEGE,DATA,INCOME,OVERCHARGE,LEFTOVER,HOUSE,LESSTHAN600k,CHILD,JOB_CLASS,REVENUE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,TIME_CLIENT,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION,REPORTED_USAGE_LEVEL,CONSIDERING_CHANGE_OF_PLAN,CHURNED
0,C100000,zero,0.342068,-1.235851,-0.947823,-0.892885,1.521888,False,4,3,0.094061,-1.084221,-0.745323,-1.138392,2.068906,very_unsat,little,considering,STAY
1,C100001,one,-0.405661,-0.720966,0.883351,-0.331492,-0.440526,True,0,1,-1.159304,-0.656149,2.224292,-0.230297,-0.450701,unsat,little,considering,LEAVE
2,C100006,zero,-0.64362,0.860414,-0.947823,-0.406344,1.677104,False,4,2,-0.59529,0.523426,1.424781,-0.290837,-0.450701,avg,very_little,considering,STAY
3,C100008,zero,-0.520608,-0.300027,1.214144,-0.892885,1.868824,False,3,3,-1.451756,0.033522,0.625269,-0.351376,1.38174,unsat,very_high,considering,LEAVE
4,C100010,one,-0.138431,-0.189826,1.414983,-0.892885,-0.435752,True,2,3,-1.639761,-0.989094,1.196349,-0.411916,0.92363,very_unsat,little,actively_looking_into_it,LEAVE
