In [117]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np

## DATA CLEANING AND TIDYING (For visualization and EDA )

1. First let's read the dataset using Pandas's read_csv function and see the shape of it (observations and features) and it's columns and potential problems like missing values, duplicates and check the data constistency(whether a numeric feature is indeed in an numeric datatype).

In [4]:
telecom_data = pd.read_csv("Telecom Customers Churn.csv")

In [5]:
telecom_data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [6]:
telecom_data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

2. I will transform these column names into snake_case for easier usage.

In [8]:
telecom_data.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

3. Apparently no null or missing values

In [10]:
telecom_data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

4. All looks okay except for TotalCharges which is of type object which makes me think there is a null value or something that is not encoded right so I will look into it

In [12]:
telecom_data.duplicated().sum()

0

5. No duplicates are apparent.
6. I will now create a functions to convert strings to snake_case and another function to aplly that conversion on all columns in the DataFrame

In [14]:
def convert_to_snake_case(column_name):
    """
    Convert a string to snake_case.

    Parameters:
    column_name (str): The original column name.

    Returns:
    str: The column name converted to snake_case.
    """
  
    snake_case_name = re.sub(r'[\W\s]+', '_', column_name)
    
   
    snake_case_name = re.sub(r'(?<!^)(?=[A-Z])', '_', snake_case_name)
    

    snake_case_name = snake_case_name.lower()
    
   
    snake_case_name = re.sub(r'_+', '_', snake_case_name)
    
   
    snake_case_name = snake_case_name.strip('_')
    
    return snake_case_name

def rename_columns_to_snake_case(df):
    """
    Rename all columns in a DataFrame to snake_case.

    Parameters:
    df (pd.DataFrame): The DataFrame whose columns need to be renamed.

    Returns:
    pd.DataFrame: A new DataFrame with columns renamed to snake_case.
    """
    
    column_mapping = {col: convert_to_snake_case(col) for col in df.columns}
    
    
    df_snake_case = df.rename(columns=column_mapping)
    
    return df_snake_case

In [15]:
telecom_customer_data_snake_case = rename_columns_to_snake_case(telecom_data)

7. Let's see the result

In [17]:
telecom_customer_data_snake_case

Unnamed: 0,customer_i_d,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,...,device_protection,tech_support,streaming_t_v,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [18]:
telecom_customer_data_snake_case.columns

Index(['customer_i_d', 'gender', 'senior_citizen', 'partner', 'dependents',
       'tenure', 'phone_service', 'multiple_lines', 'internet_service',
       'online_security', 'online_backup', 'device_protection', 'tech_support',
       'streaming_t_v', 'streaming_movies', 'contract', 'paperless_billing',
       'payment_method', 'monthly_charges', 'total_charges', 'churn'],
      dtype='object')

8. Let's see the unique values to gather some understanding why is the total_charges and object column and see the others
9. I will create a function to display the unique values and their count.

In [20]:
def display_unique_values_and_counts(df):
    """
    Display unique values and their counts for each column in a DataFrame.
    
    Parameters:
    df (pd.DataFrame): The DataFrame whose columns will be analyzed.
    
    Returns:
    None
    """
    for column in df.columns:
        print(f"Column: {column}")
        print(df[column].value_counts(dropna = False))
        print("-" * 50)  


display_unique_values_and_counts(telecom_customer_data_snake_case)

Column: customer_i_d
customer_i_d
7590-VHVEG    1
3791-LGQCY    1
6008-NAIXK    1
5956-YHHRX    1
5365-LLFYV    1
             ..
9796-MVYXX    1
2637-FKFSY    1
1552-AAGRX    1
4304-TSPVK    1
3186-AJIEK    1
Name: count, Length: 7043, dtype: int64
--------------------------------------------------
Column: gender
gender
Male      3555
Female    3488
Name: count, dtype: int64
--------------------------------------------------
Column: senior_citizen
senior_citizen
0    5901
1    1142
Name: count, dtype: int64
--------------------------------------------------
Column: partner
partner
No     3641
Yes    3402
Name: count, dtype: int64
--------------------------------------------------
Column: dependents
dependents
No     4933
Yes    2110
Name: count, dtype: int64
--------------------------------------------------
Column: tenure
tenure
1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: count, Length: 73, dtype: int64
---------

10. Here we can clearly see that 11 of the total_charge entries are blank that's why it doesn't count them as null, but they arent numbers either, so I shall remove them, but let's make sure 

In [22]:
telecom_customer_data_snake_case['total_charges'] = pd.to_numeric(telecom_customer_data_snake_case['total_charges'], errors='coerce')
total_charges_issues = telecom_customer_data_snake_case['total_charges'].isnull().sum()
print(f"Number of invalid 'TotalCharges' entries: {total_charges_issues}")

Number of invalid 'TotalCharges' entries: 11


11. Let's drop those entries

In [24]:
telecom_customer_data_cleaned = telecom_customer_data_snake_case.dropna(subset=['total_charges'])

In [25]:
telecom_customer_data_cleaned 

Unnamed: 0,customer_i_d,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,...,device_protection,tech_support,streaming_t_v,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


12. Let's remove the customer_i_d column as it doesn't provide any valuable information 

In [27]:
telecom_data_cleaned = telecom_customer_data_cleaned.drop(['customer_i_d'], axis=1)

13. Let's save the cleaned data in new csv 

In [29]:
telecom_data_cleaned.to_csv("test_cleaned.csv", index = False)

14. Last check to see if all is good in the new file 

In [31]:
new_data = pd.read_csv('test_cleaned.csv')

In [32]:
new_data

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_t_v,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7028,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7029,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7030,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


In [33]:
new_data.isnull().sum()

gender               0
senior_citizen       0
partner              0
dependents           0
tenure               0
phone_service        0
multiple_lines       0
internet_service     0
online_security      0
online_backup        0
device_protection    0
tech_support         0
streaming_t_v        0
streaming_movies     0
contract             0
paperless_billing    0
payment_method       0
monthly_charges      0
total_charges        0
churn                0
dtype: int64

In [34]:
new_data.dtypes

gender                object
senior_citizen         int64
partner               object
dependents            object
tenure                 int64
phone_service         object
multiple_lines        object
internet_service      object
online_security       object
online_backup         object
device_protection     object
tech_support          object
streaming_t_v         object
streaming_movies      object
contract              object
paperless_billing     object
payment_method        object
monthly_charges      float64
total_charges        float64
churn                 object
dtype: object

In [35]:
new_data.duplicated().sum()

22

15. Here we see duplicates so let's remove them

In [37]:
telecom_data_cleaned_no_duplicates = new_data.drop_duplicates()

In [38]:
telecom_data_cleaned_no_duplicates

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_t_v,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7028,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7029,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7030,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


In [39]:
telecom_data_cleaned_no_duplicates.reset_index(drop=True, inplace=True)

In [40]:
telecom_data_cleaned_no_duplicates

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_t_v,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7005,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7006,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7007,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7008,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


In [41]:
telecom_data_cleaned_no_duplicates.duplicated().sum()

0

16. This data is now clean and ready to be visualized and explored in detail. No null values, no duplicates, consistant data types and easily usable column names.
17. Let's now create a new csv file before we continue with preparing it for machine learning deployment

In [43]:
telecom_data_cleaned_no_duplicates.to_csv("cleaned_data_telecom.csv",index=False)

## Data preparing for ML algorithm

18. Okay now let's continue by preparing the data for machine learning deployment , beggining with one-hot encoding as it would work better than manually mapping the categories to values like 0,1,2 will introduce a FALSE sense of order in places where there ain't no order ("None" < "DSL" < "Fiber optic") which is not the case they are seperate things and do not follow any natural order and the model could assume false relationships.
19. One-hot encoding ensures that all categories are treated equally by the model, without implying any rank or order between categories. This is especially important for models like logistic regression or neural networks.
20. For that we will have to convert the categorical columns into dummies which in this case I assume would come with the addition of new features, as the encoding works with (True(1) and False(0))and in some columns we have 3 values.

In [46]:
telecom_data_cleaned_no_duplicates

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_t_v,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7005,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7006,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7007,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7008,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


21. Let's map the categorical columns so that we can easily use them in the conversion

In [48]:
categorical_columns = [
    'gender', 'partner', 'dependents', 'phone_service', 'multiple_lines',
    'internet_service', 'online_security', 'online_backup', 'device_protection',
    'tech_support', 'streaming_t_v', 'streaming_movies', 'contract',
    'paperless_billing', 'payment_method', 'churn'
]

22. We will get the dummies of these categorical columns which will give us a boolean of True and False or 1 and 0 

In [50]:
telecom_data_encoded = pd.get_dummies(telecom_data_cleaned_no_duplicates, columns=categorical_columns, drop_first=True)

In [51]:
telecom_data_encoded

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,multiple_lines_No phone service,multiple_lines_Yes,...,streaming_t_v_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_One year,contract_Two year,paperless_billing_Yes,payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check,churn_Yes
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False
1,0,34,56.95,1889.50,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,True,False,False,True,True
3,0,45,42.30,1840.75,True,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
4,0,2,70.70,151.65,False,False,False,True,False,False,...,False,False,False,False,False,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7005,0,24,84.80,1990.50,True,True,True,True,False,True,...,True,False,True,True,False,True,False,False,True,False
7006,0,72,103.20,7362.90,False,True,True,True,False,True,...,True,False,True,True,False,True,True,False,False,False
7007,0,11,29.60,346.45,False,True,True,False,True,False,...,False,False,False,False,False,True,False,True,False,False
7008,1,4,74.40,306.60,True,True,False,True,False,True,...,False,False,False,False,False,True,False,False,True,True


23. As we can see it clearly added new features and that's normal as there are multiple categorical variables with multiple categories.
24. For each categorical variable, one-hot encoding creates a new binary column for each unique category (minus one to avoid multicollinearity, if drop_first=True is used).
25. For example, if you have a categorical variable like contract with three categories: "Month-to-month," "One year," and "Two year," one-hot encoding will add two new columns (contract_One year and contract_Two year), and it will imply that if both are 0, the contract is "Month-to-month."
26. Yes increasing the number of features will lead to longer process of training, especially if it's an SVM(Support Vector Machine), but for that we can do Feature selection and not use the features that have low or no impact on the model or a posteriori do a Dimensionality Reduction. But we will get back to that when we start model training and validation.
27. Now another thing we should do beforehand is SCALING the numerical values as it's recommended to use for Gradient-based algorithms like logistic regression, neural networks, support vector machines. These algorithms are sensitive to feature magnitudes. Without scaling, features with larger ranges (e.g., total_charges) can dominate the optimization process, leading to poor model performance.

In [53]:
numerical_columns = ['tenure', 'monthly_charges', 'total_charges']

28. We will be using StandardScaler for scaling the numerical values

In [55]:
scaler = StandardScaler()

In [56]:
telecom_data_encoded[numerical_columns] = scaler.fit_transform(telecom_data_encoded[numerical_columns])

In [57]:
telecom_data_encoded

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,multiple_lines_No phone service,multiple_lines_Yes,...,streaming_t_v_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_One year,contract_Two year,paperless_billing_Yes,payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check,churn_Yes
0,0,-1.285566,-1.165523,-0.997284,False,True,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False
1,0,0.060346,-0.264071,-0.176848,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False
2,0,-1.244781,-0.367189,-0.962740,True,False,False,True,False,False,...,False,False,False,False,False,True,False,False,True,True
3,0,0.508983,-0.751387,-0.198355,True,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
4,0,-1.244781,0.193308,-0.943549,False,False,False,True,False,False,...,False,False,False,False,False,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7005,0,-0.347506,0.662329,-0.132289,True,True,True,True,False,True,...,True,False,True,True,False,True,False,False,True,False
7006,0,1.610184,1.274384,2.237896,False,True,True,True,False,True,...,True,False,True,True,False,True,True,False,False,False
7007,0,-0.877714,-1.173839,-0.857607,False,True,True,False,True,False,...,False,False,False,False,False,True,False,True,False,False
7008,1,-1.163210,0.316384,-0.875188,True,True,False,True,False,True,...,False,False,False,False,False,True,False,False,True,True


29. Now let's save this into a new csv and then proceed with feature selection validation and removing the churn as its TARGET FEATURE and so on..... IF i am gonna proceed with that project in general\

In [59]:
telecom_data_encoded.to_csv("telecom_data_cleaned_encoded.csv", index=False)

30. Let's split the data into test an train sets, I will be using cross-validation for performance evaluation so I wont be using a validation set.

In [100]:
X = telecom_data_encoded.drop('churn_Yes', axis=1)
y = telecom_data_encoded['churn_Yes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [121]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)


svm_clf = SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42)
svm_clf.fit(X_train, y_train)


y_pred_val = svm_clf.predict(X_val)
print(classification_report(y_val, y_pred_val))
print(f"SVM AUC-ROC on Validation Set: {roc_auc_score(y_val, y_pred_val):.4f}")


              precision    recall  f1-score   support

       False       0.91      0.76      0.83      1014
        True       0.56      0.81      0.66       388

    accuracy                           0.77      1402
   macro avg       0.73      0.78      0.74      1402
weighted avg       0.81      0.77      0.78      1402

SVM AUC-ROC on Validation Set: 0.7816


In [130]:
import pandas as pd

comparison_df = pd.DataFrame({
    'Actual': y_val,
    'Predicted': y_pred_val
})
print(comparison_df.head(20)) 


      Actual  Predicted
665     True       True
5841   False      False
5070   False       True
4012   False      False
1937   False      False
2953    True      False
6925   False      False
1612   False       True
5128   False      False
6230   False      False
5182   False      False
5358   False      False
3018    True       True
1259   False      False
4843    True       True
1040   False      False
2895    True       True
1354    True       True
2508   False       True
4158   False      False


## 16 out of 20/ I will be working on diff models and parameters tomorrow

In [128]:
from sklearn.model_selection import cross_val_score

svm_clf = SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42)

# Cross-validation on the training set
cv_scores = cross_val_score(svm_clf, X_train, y_train, cv=5, scoring='roc_auc')

# Print the cross-validation results
print(f"Cross-Validation ROC-AUC Scores: {cv_scores}")
print(f"Mean ROC-AUC from Cross-Validation: {cv_scores.mean():.4f}")


Cross-Validation ROC-AUC Scores: [0.8018258  0.79284399 0.81248751 0.85023838 0.85853554]
Mean ROC-AUC from Cross-Validation: 0.8232
