In [197]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from joblib import dump, load

In [175]:
df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [176]:
df = df.drop(columns=['customerID'])

In [177]:
# df['TotalCharges'] = df['TotalCharges'].astype(float)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [178]:
df['Churn'] = df['Churn'].replace({'Yes': 1, 'No': 0})

In [179]:
df.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [180]:
# df[df.isna().any(axis=1)]
df = df.dropna()

In [181]:
X = df.drop(columns=['Churn'])
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [182]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [183]:
categorical_columns = []
for i in df.columns:
    if df[i].dtype == 'object':
        categorical_columns.append(i)

In [184]:
dump(categorical_columns, '../models/categorical_columns.joblib')

['../models/categorical_columns.joblib']

In [185]:
ordinal = OrdinalEncoder()

In [186]:
ordinal.fit(X_train[categorical_columns])

X_train[categorical_columns] = ordinal.transform(X_train[categorical_columns])

In [187]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)

In [188]:
model = LogisticRegression()

In [189]:
model.fit(X_train, y_train)

In [190]:
dump(ordinal, '../models/Ordinal_Encoder.joblib')
dump(scaler, '../models/Standard_Scaler.joblib')
dump(model, '../models/Logistic_Regression.joblib')

['../models/Logistic_Regression.joblib']

# Eval

In [191]:
scaler = load('../models/Standard_Scaler.joblib')
ordinal = load('../models/Ordinal_Encoder.joblib')
model = load('../models/Logistic_Regression.joblib')
categorical_columns = load('../models/categorical_columns.joblib')

In [192]:
X_test[categorical_columns] = ordinal.transform(X_test[categorical_columns])

In [193]:
X_test = scaler.transform(X_test)

In [194]:
y_pred = model.predict(X_test)

In [195]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7853589196872779


In [200]:
print("metrix:", confusion_matrix(y_test, y_pred))

metrix: [[920 113]
 [189 185]]


In [207]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [203]:
model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

In [204]:
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [205]:
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.78


In [206]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.83      0.87      0.85      1033
           1       0.60      0.51      0.55       374

    accuracy                           0.78      1407
   macro avg       0.71      0.69      0.70      1407
weighted avg       0.77      0.78      0.77      1407



In [215]:
param_grid = {
    'max_depth': [3, 4, 5, 7, 8],
    'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
    'n_estimators': [50, 100, 200, 300, 400, 500]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss'),
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
Best parameters found:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}


In [216]:
y_pred = grid_search.best_estimator_.predict(X_test)

In [217]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.80


# Refactorig

In [None]:
from typing import Tuple, Dict


MODEL_PATH = '../models/'

def read_prepare_df(PATH: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:

    df = pd.read_csv(PATH)

    df = df.drop(columns=['customerID'])
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df['Churn'] = df['Churn'].replace({'Yes': 1, 'No': 0})

    df = df.dropna()
    X = df.drop(columns=['Churn'])
    y = df['Churn']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

def data_preprocessing(X_train: pd.DataFrame):
    categorical_columns = []
    for i in df.columns:
        if df[i].dtype == 'object':
            categorical_columns.append(i)

    dump(categorical_columns, MODEL_PATH + 'categorical_columns.joblib')

    ordinal = OrdinalEncoder()
    ordinal.fit(X_train[categorical_columns])

    X_train[categorical_columns] = ordinal.transform(X_train[categorical_columns])

    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)

    return X_train

def model_train(X_train: , y_train)