In [1]:
import pandas as pd
import numpy as np

In [2]:
X = pd.read_csv(r'C:\Users\9anan\Desktop\VsCode\Dataset\WA_Fn-UseC_-Telco-Customer-Churn.csv')
df = X.copy()

In [3]:
for cols in X.columns:
    print(f'{cols} : {X[cols].unique()}')

customerID : ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
gender : ['Female' 'Male']
SeniorCitizen : [0 1]
Partner : ['Yes' 'No']
Dependents : ['No' 'Yes']
tenure : [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService : ['No' 'Yes']
MultipleLines : ['No phone service' 'No' 'Yes']
InternetService : ['DSL' 'Fiber optic' 'No']
OnlineSecurity : ['No' 'Yes' 'No internet service']
OnlineBackup : ['Yes' 'No' 'No internet service']
DeviceProtection : ['No' 'Yes' 'No internet service']
TechSupport : ['No' 'Yes' 'No internet service']
StreamingTV : ['No' 'Yes' 'No internet service']
StreamingMovies : ['No' 'Yes' 'No internet service']
Contract : ['Month-to-month' 'One year' 'Two year']
PaperlessBilling : ['Yes' 'No']
PaymentMethod : ['Electronic check' 'Mailed check' 'Bank tr

In [4]:
df = df.drop('customerID', axis=1)

In [5]:
df = df.rename(columns={'gender': 'Gender'})
df = df.rename(columns={'tenure': 'Tenure'})

In [6]:
df['Gender'] = df['Gender'].replace({'Female': 1, 'Male': 0}).astype(np.float32)

In [7]:
df['MultipleLines'] = df['MultipleLines'].replace({'No phone service': 0})

In [8]:
df['Contract'] = df['Contract'].replace({'Month-to-month': 1, 'One year': 12, 'Two year': 24})

In [9]:
df = pd.get_dummies(df, columns=['PaymentMethod'])

In [10]:
df['SeniorCitizen'] = df['SeniorCitizen'].astype(np.float32)

In [11]:
df.dropna(inplace=True)

In [12]:
df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan)

In [13]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [14]:
df['TotalCharges'] = df['TotalCharges'].astype(np.float32)

In [15]:
df['MonthlyCharges'] = df['MonthlyCharges'].astype(np.float32)

In [16]:
df['Tenure'] = df['Tenure'].astype(np.float32)

In [17]:
col = ['PhoneService',
       'MultipleLines',
       'InternetService',
       'OnlineSecurity',
       'OnlineBackup',
       'DeviceProtection',
       'TechSupport',
       'StreamingTV',
       'StreamingMovies']
df[col] = df[col].replace('No internet service', 0)

In [18]:
col = ['PhoneService',
        'MultipleLines',
        'OnlineSecurity',
        'OnlineBackup',
        'DeviceProtection',
        'TechSupport',
        'StreamingTV',
        'StreamingMovies',
        'PaperlessBilling',]
mapping = {'Yes': 1, 'No': 0}
for i in col:
        df[i] = df[i].replace(mapping)

In [19]:
cols = ['PaymentMethod_Bank transfer (automatic)',
        'PaymentMethod_Credit card (automatic)',
        'PaymentMethod_Electronic check',
        'PaymentMethod_Mailed check']
mapping = {True: 1, False: 0}
for i in cols:
    df[i] = df[i].replace(mapping)

In [20]:
int_col = df.select_dtypes(include='int64').columns
for i in int_col:
    df[i] = df[i].astype(np.float32)

In [21]:
df['Churn'] = df['Churn'].replace({'Yes': 1, 'No': 0}).astype(np.float32)
df['Partner'] = df['Partner'].replace({'Yes': 1, 'No': 0}).astype(np.float32)
df['Dependents'] = df['Dependents'].replace({'Yes': 1, 'No': 0}).astype(np.float32)

In [22]:
df['InternetService'] = df['InternetService'].astype('category')

In [23]:
df = pd.get_dummies(df, columns=['InternetService']).astype(np.float32)

In [24]:
df = df.rename(columns={'InternetService_No': 'Internet Service(Yes/No)'})

In [25]:
df.dropna(inplace=True)

In [26]:
df['Churn'].value_counts()

Churn
0.0    5163
1.0    1869
Name: count, dtype: int64

In [27]:
from sklearn.model_selection import train_test_split
x = df.drop('Churn', axis=1)
y = df['Churn']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=7)

In [28]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=7)
x_resampled, y_resampled = smote.fit_resample(x, y)
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_resampled.value_counts())

Before SMOTE: Churn
0.0    3618
1.0    1304
Name: count, dtype: int64
After SMOTE: Churn
0.0    5163
1.0    5163
Name: count, dtype: int64


In [29]:
df_resampled = pd.DataFrame(data=x_resampled, columns=x.columns)
df_resampled['Churn'] = y_resampled

In [30]:
from sklearn.preprocessing import MinMaxScaler
cols_to_scale = ['Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'Tenure',
       'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'Internet Service(Yes/No)']
scaler = MinMaxScaler()
df_resampled[cols_to_scale] = scaler.fit_transform(df_resampled[cols_to_scale])

In [32]:
df_resampled.dtypes

Gender                                     float32
SeniorCitizen                              float32
Partner                                    float32
Dependents                                 float32
Tenure                                     float32
PhoneService                               float32
MultipleLines                              float32
OnlineSecurity                             float32
OnlineBackup                               float32
DeviceProtection                           float32
TechSupport                                float32
StreamingTV                                float32
StreamingMovies                            float32
Contract                                   float32
PaperlessBilling                           float32
MonthlyCharges                             float32
TotalCharges                               float32
PaymentMethod_Bank transfer (automatic)    float32
PaymentMethod_Credit card (automatic)      float32
PaymentMethod_Electronic check 

In [35]:
df_resampled.shape

(10326, 25)

In [31]:
# df_resampled.to_csv('resampled_data.csv', index=False)