In [9]:
import pandas as pd
import numpy as np
import klib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import KBinsDiscretizer


In [5]:
# Load the payment information dataset; the relative path is resolved against
# the notebook's current working directory.
df=pd.read_csv("Payment_Info.csv")

In [6]:
df.head(3)

Unnamed: 0,customer_id,contract,paperless_billing,payment_method,monthly_ charges,avg_monthly_long_distance_charges,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue
0,0002-ORFBO,One Year,Yes,Mailed check,65.6,42.39,593.3,0.0,0,381.51,974.81
1,0003-MKNFE,Month-to-Month,No,Mailed check,59.9,10.69,542.4,38.33,10,96.21,610.28
2,0004-TLHLJ,Month-to-Month,Yes,Electronic check,73.9,33.65,280.85,0.0,0,134.6,415.45


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 11 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   customer_id                        7043 non-null   object 
 1   contract                           7043 non-null   object 
 2   paperless_billing                  7043 non-null   object 
 3   payment_method                     7043 non-null   object 
 4   monthly_ charges                   7043 non-null   float64
 5   avg_monthly_long_distance_charges  7043 non-null   float64
 6   total_charges                      7043 non-null   float64
 7   total_refunds                      7043 non-null   float64
 8   total_extra_data_charges           7043 non-null   int64  
 9   total_long_distance_charges        7043 non-null   float64
 10  total_revenue                      7043 non-null   float64
dtypes: float64(6), int64(1), object(4)
memory usage: 605.4+ 

In [10]:
# Clean the frame with klib: standardize the column names, drop duplicated
# rows, and reduce the memory usage of the dataset.
df=klib.data_cleaning(df)



Shape of cleaned data: (7043, 11) - Remaining NAs: 0


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.34 MB (-57.63%)



In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 11 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   customer_id                        7043 non-null   string  
 1   contract                           7043 non-null   category
 2   paperless_billing                  7043 non-null   category
 3   payment_method                     7043 non-null   category
 4   monthly_charges                    7043 non-null   float32 
 5   avg_monthly_long_distance_charges  7043 non-null   float32 
 6   total_charges                      7043 non-null   float32 
 7   total_refunds                      7043 non-null   float32 
 8   total_extra_data_charges           7043 non-null   int16   
 9   total_long_distance_charges        7043 non-null   float32 
 10  total_revenue                      7043 non-null   float32 
dtypes: category(3), float32(6), int16(1), strin

# Feature Transformation

In [50]:
# Select the numerical columns.
#
# "number" matches every numeric dtype, so the selection does not depend on the
# exact widths (int16 / float32) the columns were downcast to during cleaning.
# NOTE(review): run this cell before the transformation cells below -- columns
# created later (sqrt_*, *_equal_width) are numeric too and would be picked up
# if this cell is re-executed afterwards.

numerical_col = df.select_dtypes(include="number").columns.tolist()
print(f"Numerical columns:\n{numerical_col}")
print(f"\nTotal Number of Numerical columns:\n{len(numerical_col)}")


Numerical columns:
['monthly_charges', 'avg_monthly_long_distance_charges', 'monthly_charges_equal_width', 'avg_monthly_long_distance_charges_equal_width', 'sqrt_total_refunds_equal_width', 'sqrt_total_extra_data_charges_equal_width', 'sqrt_total_long_distance_charges_equal_width', 'sqrt_total_charges_equal_width', 'sqrt_total_revenue_equal_width']

Total Number of Numerical columns:
9


# Mathematical transformation

In [14]:
# Compute the skewness of every numerical column and rank the columns from the
# most positively skewed to the most negatively skewed.

skew_by_column = df[numerical_col].skew()
skewness = skew_by_column.sort_values(ascending=False)
print(f"Skewness of numerical features:\n{skewness}")

Skewness of numerical features:
total_refunds                        4.328517
total_extra_data_charges             4.091209
total_long_distance_charges          1.238282
total_charges                        0.963791
total_revenue                        0.919410
avg_monthly_long_distance_charges    0.049176
monthly_charges                     -0.220524
dtype: float64


In [16]:
# Identify the features that need a mathematical transformation: those whose
# absolute skewness is greater than 0.5.

exceeds_threshold = skewness.abs() > 0.5
skewed_features = skewness[exceeds_threshold]
skewed_features

total_refunds                  4.328517
total_extra_data_charges       4.091209
total_long_distance_charges    1.238282
total_charges                  0.963791
total_revenue                  0.919410
dtype: float64

In [23]:
# Square-root technique: store the square root of each listed column in a new
# column named "sqrt_<original name>"; the source columns are left untouched.

for source_column in (
    "total_refunds",
    "total_extra_data_charges",
    "total_long_distance_charges",
    "total_charges",
    "total_revenue",
):
    df[f"sqrt_{source_column}"] = np.sqrt(df[source_column])


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   customer_id                        7043 non-null   string  
 1   contract                           7043 non-null   category
 2   paperless_billing                  7043 non-null   category
 3   payment_method                     7043 non-null   category
 4   monthly_charges                    7043 non-null   float32 
 5   avg_monthly_long_distance_charges  7043 non-null   float32 
 6   total_charges                      7043 non-null   float32 
 7   total_refunds                      7043 non-null   float32 
 8   total_extra_data_charges           7043 non-null   int16   
 9   total_long_distance_charges        7043 non-null   float32 
 10  total_revenue                      7043 non-null   float32 
 11  sqrt_total_refunds                 7043 non

In [28]:
df["total_charges"]

0        593.299988
1        542.400024
2        280.850006
3       1237.849976
4        267.399994
           ...     
7038     742.900024
7039    1873.699951
7040      92.750000
7041    4627.649902
7042    3707.600098
Name: total_charges, Length: 7043, dtype: float32

In [26]:
df["sqrt_total_charges"]

0       24.357750
1       23.289484
2       16.758579
3       35.183094
4       16.352369
          ...    
7038    27.256193
7039    43.286255
7040     9.630680
7041    68.026833
7042    60.890064
Name: sqrt_total_charges, Length: 7043, dtype: float32

In [29]:
# Drop the untransformed columns; their sqrt_* counterparts stay in the frame.

untransformed_columns = [
    "total_charges",
    "total_refunds",
    "total_extra_data_charges",
    "total_long_distance_charges",
    "total_revenue",
]
df.drop(columns=untransformed_columns, inplace=True)




In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 11 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   customer_id                        7043 non-null   string  
 1   contract                           7043 non-null   category
 2   paperless_billing                  7043 non-null   category
 3   payment_method                     7043 non-null   category
 4   monthly_charges                    7043 non-null   float32 
 5   avg_monthly_long_distance_charges  7043 non-null   float32 
 6   sqrt_total_refunds                 7043 non-null   float32 
 7   sqrt_total_extra_data_charges      7043 non-null   float32 
 8   sqrt_total_long_distance_charges   7043 non-null   float32 
 9   sqrt_total_charges                 7043 non-null   float32 
 10  sqrt_total_revenue                 7043 non-null   float32 
dtypes: category(3), float32(7), string(1)
memor

In [31]:
df.tail(3)

Unnamed: 0,customer_id,contract,paperless_billing,payment_method,monthly_charges,avg_monthly_long_distance_charges,sqrt_total_refunds,sqrt_total_extra_data_charges,sqrt_total_long_distance_charges,sqrt_total_charges,sqrt_total_revenue
7040,9992-UJOEL,Month-to-Month,Yes,Mailed check,50.299999,18.620001,0.0,0.0,6.102458,9.63068,11.401316
7041,9993-LHIEB,Two Year,No,Mailed check,67.849998,2.12,0.0,0.0,11.918054,68.026833,69.062943
7042,9995-HOTOH,Two Year,No,Electronic check,59.0,0.0,0.0,0.0,0.0,60.890064,60.890064


# Discretization transform

In [33]:
# Select the continuous features: every column stored as float32 is treated as
# a candidate for discretization.

float32_frame = df.select_dtypes(include=["float32"])
continue_features = list(float32_frame.columns)
print(f"Continues number features for potential discretization:\n{continue_features}")

Continues number features for potential discretization:
['monthly_charges', 'avg_monthly_long_distance_charges', 'sqrt_total_refunds', 'sqrt_total_extra_data_charges', 'sqrt_total_long_distance_charges', 'sqrt_total_charges', 'sqrt_total_revenue']


In [41]:
# Equal-width discretization: 5 uniform-strategy bins per feature, encoded as
# ordinal bin codes.
equal_width = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform")

# The discretizer is re-fitted for each feature in turn and the resulting codes
# are stored in a new "<feature>_equal_width" column.
for feature in continue_features:
    bin_codes = equal_width.fit_transform(df[[feature]])
    # fit_transform returns a single-column 2-D array; keep that column.
    df[f"{feature}_equal_width"] = bin_codes[:, 0]

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 18 columns):
 #   Column                                         Non-Null Count  Dtype   
---  ------                                         --------------  -----   
 0   customer_id                                    7043 non-null   string  
 1   contract                                       7043 non-null   category
 2   paperless_billing                              7043 non-null   category
 3   payment_method                                 7043 non-null   category
 4   monthly_charges                                7043 non-null   float32 
 5   avg_monthly_long_distance_charges              7043 non-null   float32 
 6   sqrt_total_refunds                             7043 non-null   float32 
 7   sqrt_total_extra_data_charges                  7043 non-null   float32 
 8   sqrt_total_long_distance_charges               7043 non-null   float32 
 9   sqrt_total_charges                       

In [47]:
# Print the sqrt-transformed refunds next to their equal-width bin codes so the
# two columns can be compared row by row.
print(df["sqrt_total_refunds"])
print(df["sqrt_total_refunds_equal_width"])

0       0.000000
1       6.191123
2       0.000000
3       0.000000
4       0.000000
          ...   
7038    0.000000
7039    0.000000
7040    0.000000
7041    0.000000
7042    0.000000
Name: sqrt_total_refunds, Length: 7043, dtype: float32
0       0.0
1       4.0
2       0.0
3       0.0
4       0.0
       ... 
7038    0.0
7039    0.0
7040    0.0
7041    0.0
7042    0.0
Name: sqrt_total_refunds_equal_width, Length: 7043, dtype: float32


In [44]:
df["sqrt_total_refunds"]

0       0.000000
1       6.191123
2       0.000000
3       0.000000
4       0.000000
          ...   
7038    0.000000
7039    0.000000
7040    0.000000
7041    0.000000
7042    0.000000
Name: sqrt_total_refunds, Length: 7043, dtype: float32

In [46]:
# Print the monthly charges next to their equal-width bin codes so the two
# columns can be compared row by row.
print(df["monthly_charges"])
print(df["monthly_charges_equal_width"])

0       65.599998
1       59.900002
2       73.900002
3       98.000000
4       83.900002
          ...    
7038    55.150002
7039    85.099998
7040    50.299999
7041    67.849998
7042    59.000000
Name: monthly_charges, Length: 7043, dtype: float32
0       2.0
1       2.0
2       2.0
3       3.0
4       3.0
       ... 
7038    1.0
7039    3.0
7040    1.0
7041    2.0
7042    2.0
Name: monthly_charges_equal_width, Length: 7043, dtype: float32


In [48]:
# Drop the sqrt_* columns; their *_equal_width counterparts stay in the frame.
binned_source_columns = [
    "sqrt_total_refunds",
    "sqrt_total_extra_data_charges",
    "sqrt_total_long_distance_charges",
    "sqrt_total_charges",
    "sqrt_total_revenue",
]
df.drop(columns=binned_source_columns, inplace=True)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 13 columns):
 #   Column                                         Non-Null Count  Dtype   
---  ------                                         --------------  -----   
 0   customer_id                                    7043 non-null   string  
 1   contract                                       7043 non-null   category
 2   paperless_billing                              7043 non-null   category
 3   payment_method                                 7043 non-null   category
 4   monthly_charges                                7043 non-null   float32 
 5   avg_monthly_long_distance_charges              7043 non-null   float32 
 6   monthly_charges_equal_width                    7043 non-null   float32 
 7   avg_monthly_long_distance_charges_equal_width  7043 non-null   float32 
 8   sqrt_total_refunds_equal_width                 7043 non-null   float32 
 9   sqrt_total_extra_data_charges_equal_width