In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from joblib import dump, load

In [40]:
# Load the raw Telco customer-churn CSV into a DataFrame.
df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [41]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# customerID is a per-customer identifier and is not used as a feature.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns=['customerID'], errors='ignore')

In [5]:
# Parse TotalCharges as numbers. With errors='coerce', entries that cannot be
# parsed become NaN instead of raising, so they can be handled as missing values.
df = df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))

In [6]:
# Encode the target as integers (Yes -> 1, No -> 0); any other value passes
# through unchanged, so re-running the cell on already-encoded data is harmless.
# Series.map is used instead of Series.replace because replace's implicit
# object -> int downcast is deprecated (FutureWarning in pandas 2.2).
churn_codes = {'Yes': 1, 'No': 0}
df['Churn'] = df['Churn'].map(lambda label: churn_codes.get(label, label))

In [7]:
# Count missing values per column.
df.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [8]:
# Discard every row that has a missing value in any column.
# (To inspect those rows first: df[df.isna().any(axis=1)])
df = df.dropna(axis=0, how='any')

In [9]:
# Separate the target from the features, then hold out 20% of the rows for
# evaluation. The fixed random_state makes the split reproducible.
y = df['Churn']
X = df.drop(columns='Churn')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
# Preview the cleaned frame (customerID removed, Churn encoded as 0/1).
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [11]:
# Names of all columns still stored with the generic 'object' dtype; these are
# the ones that get ordinal-encoded below.
categorical_columns = [name for name in df.columns if df[name].dtype == 'object']

In [12]:
# Persist the categorical column names so the evaluation section can reload them.
dump(categorical_columns, '../models/categorical_columns.joblib')

['../models/categorical_columns.joblib']

In [13]:
# Encoder that maps each category of a feature to an integer code.
ordinal = OrdinalEncoder()

In [14]:
# Work on an explicit copy: the frame returned by train_test_split is derived
# from X, and assigning columns into it can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()

# Learn the category -> integer mapping from the training split only.
ordinal.fit(X_train[categorical_columns])

X_train[categorical_columns] = ordinal.transform(X_train[categorical_columns])

In [15]:
# Fit the scaler on the training features only, then standardize them.
# transform() returns a NumPy array, so X_train is no longer a DataFrame afterwards.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [16]:
# Baseline classifier: logistic regression with scikit-learn's default settings.
model = LogisticRegression()

In [17]:
# Fit the baseline on the encoded, standardized training features.
model.fit(X_train, y_train)

In [18]:
# Persist the fitted encoder, scaler and baseline model so the evaluation
# section can reload them from disk.
dump(ordinal, '../models/Ordinal_Encoder.joblib')
dump(scaler, '../models/Standard_Scaler.joblib')
dump(model, '../models/Logistic_Regression.joblib')

['../models/Logistic_Regression.joblib']

In [19]:
# Confirm the container type of the training features after scaling.
type(X_train)

numpy.ndarray

# Eval

In [20]:
# Reload the persisted preprocessing objects and the baseline model from disk,
# listed in the order they are applied to the test data.
categorical_columns = load('../models/categorical_columns.joblib')
ordinal = load('../models/Ordinal_Encoder.joblib')
scaler = load('../models/Standard_Scaler.joblib')
model = load('../models/Logistic_Regression.joblib')

In [21]:
# Copy first so the column assignment targets an independent frame rather than
# one derived from X (avoids pandas' SettingWithCopyWarning).
X_test = X_test.copy()

# Apply the mapping learned on the training split; the encoder is not refitted.
X_test[categorical_columns] = ordinal.transform(X_test[categorical_columns])

In [22]:
# Standardize the test features with the reloaded scaler (no refitting on test data).
X_test = scaler.transform(X_test)

In [23]:
# Predict churn labels for the held-out test set.
y_pred = model.predict(X_test)

In [24]:
# Fraction of test rows whose predicted label matches the true label.
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7853589196872779


In [25]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
print("metrix:", confusion_matrix(y_test, y_pred))

metrix: [[920 113]
 [189 185]]


In [26]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [28]:
# XGBoost classifier with a binary logistic objective and log loss as its
# evaluation metric. This rebinds `model`, replacing the logistic regression object.
model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

In [29]:
# Train on the same preprocessed training data used for the baseline.
model.fit(X_train, y_train)

In [205]:
# Score the XGBoost model on the held-out split and report accuracy to two decimals.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.78


In [206]:
# Per-class precision, recall and F1 score on the test set.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.83      0.87      0.85      1033
           1       0.60      0.51      0.55       374

    accuracy                           0.78      1407
   macro avg       0.71      0.69      0.70      1407
weighted avg       0.77      0.78      0.77      1407



In [215]:
# Search space: 5 depths x 5 learning rates x 6 ensemble sizes = 150 candidates.
param_grid = dict(
    max_depth=[3, 4, 5, 7, 8],
    learning_rate=[0.01, 0.1, 0.2, 0.3, 0.4],
    n_estimators=[50, 100, 200, 300, 400, 500],
)

# Exhaustive search over the grid, ranking candidates by 3-fold cross-validated accuracy.
grid_search = GridSearchCV(
    xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss'),
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
)

# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

# Report the winning parameter combination.
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
Best parameters found:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}


In [216]:
# Predict on the test set with the estimator built from the best parameters found.
y_pred = grid_search.best_estimator_.predict(X_test)

In [217]:
# Test-set accuracy of the tuned model, printed to two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.80


# Refactoring

In [31]:
from typing import Tuple, Dict

In [32]:
def evaluation(model: xgb.XGBRegressor, X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, float]:
    """Score a fitted model on held-out data.

    Args:
        model: Any fitted object exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels aligned with ``X_test``.

    Returns:
        A dict with a single key, ``'accuracy'``, holding the accuracy score
        of the model's predictions against ``y_test``.
    """
    predictions = model.predict(X_test)

    metrics = {}
    metrics['accuracy'] = accuracy_score(y_test, predictions)
    return metrics

In [37]:
from typing import Tuple, Dict, Union


# Directory prefix for every joblib artifact written or read by the functions below.
MODEL_PATH = '../models/'

def read_prepare_df(PATH: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Load the churn CSV, clean it, and split it into train and test sets.

    Cleaning steps, in order:
      1. drop the ``customerID`` column;
      2. parse ``TotalCharges`` as numeric, turning unparseable entries into NaN;
      3. encode ``Churn`` as integers (``'Yes'`` -> 1, ``'No'`` -> 0);
      4. drop every row that contains a missing value.

    Args:
        PATH: Location of the CSV file to read.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` from an 80/20 split with
        ``random_state=42``. The feature sets are DataFrames and the targets
        are Series taken from the ``Churn`` column.
    """
    df = pd.read_csv(PATH)

    df = df.drop(columns=['customerID'])
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

    # Series.replace relied on an implicit object -> int downcast that pandas 2.2
    # deprecates (FutureWarning). Mapping through a lookup gives the same result:
    # Yes/No become 1/0 and any other value passes through unchanged.
    churn_codes = {'Yes': 1, 'No': 0}
    df['Churn'] = df['Churn'].map(lambda label: churn_codes.get(label, label))

    df = df.dropna()
    X = df.drop(columns=['Churn'])
    y = df['Churn']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test


# def data_preprocessing(X_train: pd.DataFrame) -> np.ndarray:
#     categorical_columns = X_train.select_dtypes(include=['object']).columns

#     dump(categorical_columns, MODEL_PATH + 'categorical_columns.joblib')

#     ordinal = OrdinalEncoder()
#     ordinal.fit(X_train[categorical_columns])

#     X_train[categorical_columns] = ordinal.transform(X_train[categorical_columns])

#     scaler = StandardScaler()
#     scaler.fit(X_train)

#     X_train = scaler.transform(X_train)

#     return X_train


def model_train(model: xgb.XGBClassifier, X_train: np.ndarray,
                X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray) -> Dict[str, float]:
    """Train a default XGBoost classifier, tune one by grid search, and score it.

    Two artifacts are written under ``MODEL_PATH``:
      * ``XGBoost_model.joblib`` - the classifier fitted with default settings;
      * ``XGBoost_classifier.joblib`` - the fitted ``GridSearchCV`` object.

    Args:
        model: Currently ignored. A fresh ``XGBClassifier`` is always built
            inside the function; the parameter is kept so existing callers
            keep working.
        X_train: Training features.
        X_test: Held-out features used for the final score.
        y_train: Training labels.
        y_test: Held-out labels.

    Returns:
        The metrics dict produced by ``evaluation`` for the fitted grid search
        on ``(X_test, y_test)``.
    """
    # The incoming `model` is discarded here, exactly as in the original code.
    model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')
    model.fit(X_train, y_train)

    # This used to be written to 'Logistic_Regression.joblib', which silently
    # overwrote the saved logistic-regression model with an XGBoost one.
    dump(model, MODEL_PATH + 'XGBoost_model.joblib')

    # 5 depths x 5 learning rates x 6 ensemble sizes = 150 candidates.
    param_grid = {
        'max_depth': [3, 4, 5, 7, 8],
        'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
        'n_estimators': [50, 100, 200, 300, 400, 500]
    }

    # Exhaustive search ranked by 3-fold cross-validated accuracy.
    grid_search = GridSearchCV(
        estimator=xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss'),
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
        verbose=1
    )

    # Fit the grid search on the training data only.
    grid_search.fit(X_train, y_train)

    print("Best parameters found: ", grid_search.best_params_)

    # The whole search object is persisted; load_joblibs reads it back under this name.
    dump(grid_search, MODEL_PATH + 'XGBoost_classifier.joblib')

    return evaluation(grid_search, X_test, y_test)

In [38]:
# Type alias: either scaler class is accepted wherever a ScalerType is expected.
ScalerType = Union[StandardScaler, MinMaxScaler]

# Directory prefix for the prediction CSV files written by make_predictions.
DATA_PATH = '../data/'

def ordinal_encoding(X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split the data 80/20 and ordinal-encode the object-dtype feature columns.

    The encoder is fitted on the training split only and then applied to both
    splits. Two artifacts are written under ``MODEL_PATH``: the fitted encoder
    (``Ordinal_Encoder.joblib``) and the original column index of ``X``
    (``columns.joblib``).

    Args:
        X: Feature frame; columns with ``object`` dtype are treated as categorical.
        y: Target values aligned with ``X``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` with the categorical columns of
        both feature frames replaced by their integer codes.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Copy before assigning columns: the split frames are derived from X, and
    # writing into them directly can trigger pandas' SettingWithCopyWarning.
    X_train = X_train.copy()
    X_test = X_test.copy()

    categorical_columns = X_train.select_dtypes(include=['object']).columns

    ordinal = OrdinalEncoder()
    ordinal.fit(X_train[categorical_columns])
    X_train[categorical_columns] = ordinal.transform(X_train[categorical_columns])
    X_test[categorical_columns] = ordinal.transform(X_test[categorical_columns])

    dump(ordinal, MODEL_PATH + 'Ordinal_Encoder.joblib')
    dump(X.columns, MODEL_PATH + 'columns.joblib')

    return X_train, X_test, y_train, y_test


def standardizing(X_train: pd.DataFrame, X_test: pd.DataFrame, scaler: ScalerType) -> Tuple[np.ndarray, np.ndarray]:
    """Fit ``scaler`` on the training features and scale both splits with it.

    The fitted scaler is saved as ``Standard_Scaler.joblib`` under
    ``MODEL_PATH``; that file name is used whichever scaler class is passed.

    Args:
        X_train: Training features; the scaler is fitted on these only.
        X_test: Test features, transformed with the training-set fit.
        scaler: Scaler instance to fit. It is modified in place by ``fit``.

    Returns:
        ``(scaled_train, scaled_test)`` as returned by ``scaler.transform``.
    """
    scaler.fit(X_train)

    scaled_train = scaler.transform(X_train)
    scaled_test = scaler.transform(X_test)

    dump(scaler, MODEL_PATH + 'Standard_Scaler.joblib')

    return scaled_train, scaled_test


def encode_and_update(data: pd.DataFrame, ordinal_path: str) -> OrdinalEncoder:
    """Extend a saved ordinal encoder with categories that appear only in ``data``.

    For each object-dtype column of ``data``, values missing from the encoder's
    known categories are appended to the end of that category list, so existing
    integer codes are unchanged and new values get the next free codes.

    The updated encoder is always written to
    ``MODEL_PATH + 'Ordinal_Encoder.joblib'``, regardless of ``ordinal_path``.

    NOTE(review): columns are matched to ``ordinal.categories_`` by position, so
    this assumes the object-dtype columns of ``data`` appear in the same order
    as the columns the encoder was fitted on - confirm against callers.

    Args:
        data: Frame whose object-dtype columns are checked for unseen values.
        ordinal_path: Path of the joblib file holding the fitted encoder.

    Returns:
        The loaded encoder with its ``categories_`` extended in place.
    """
    ordinal = load(ordinal_path)
    categorical_columns = data.select_dtypes(include=['object']).columns

    for index, col in enumerate(categorical_columns):
        known_items = set(ordinal.categories_[index])
        new_items = set(data[col]) - known_items

        if new_items:
            # Set iteration order is arbitrary, so appending list(new_items)
            # made the codes of new categories vary between runs. Sorting by
            # string form fixes the order and tolerates mixed value types.
            ordered_new_items = sorted(new_items, key=str)
            ordinal.categories_[index] = np.append(ordinal.categories_[index], ordered_new_items)

    dump(ordinal, MODEL_PATH + 'Ordinal_Encoder.joblib')
    return ordinal


def load_joblibs() -> Tuple[pd.Index, OrdinalEncoder, ScalerType, xgb.XGBRegressor]:
    """Load the persisted preprocessing objects and the tuned model.

    All files are read from ``MODEL_PATH``.

    Returns:
        ``(cols, ordinal, standard, model)``: the feature column index, the
        ordinal encoder, the fitted scaler, and the object stored as
        ``XGBoost_classifier.joblib`` (``model_train`` saves its fitted grid
        search under that name).
    """
    cols = load(MODEL_PATH + 'columns.joblib')
    ordinal = load(MODEL_PATH + 'Ordinal_Encoder.joblib')
    standard = load(MODEL_PATH + 'Standard_Scaler.joblib')
    # The file name previously read 'modXGBoost_classifierel.joblib', a typo that
    # matches no file written by this notebook.
    model = load(MODEL_PATH + 'XGBoost_classifier.joblib')

    return cols, ordinal, standard, model

In [42]:
from datetime import datetime


def make_predictions(data: pd.DataFrame) -> pd.DataFrame:
    """Predict churn for new customers and save the result as a timestamped CSV.

    The persisted column index, encoder, scaler and model are loaded; the
    encoder is extended with any unseen categories via ``encode_and_update``
    (which also re-saves it); the features are encoded and scaled; and the
    model's predictions are written to ``DATA_PATH``.

    Args:
        data: Frame containing a ``customerID`` column plus every feature
            column stored in ``columns.joblib``. It is not modified.

    Returns:
        A DataFrame with columns ``customerID`` and ``Churn`` (the predictions).
    """
    timestamp = datetime.today()
    # The encoder returned here is unused: encode_and_update reloads it below.
    cols, _, standard, model = load_joblibs()

    ids = data['customerID']

    # Explicit copy so the column assignment below cannot touch the caller's
    # frame and does not trigger pandas' SettingWithCopyWarning.
    df_test = data[cols].copy()

    ordinal = encode_and_update(df_test, MODEL_PATH + 'Ordinal_Encoder.joblib')
    object_columns = df_test.select_dtypes(include=['object']).columns
    df_test[object_columns] = ordinal.transform(df_test[object_columns])

    df_test = standard.transform(df_test)

    y_pred = model.predict(df_test)
    submission_df = pd.DataFrame({'customerID': ids, 'Churn': y_pred})

    # str(datetime) contains spaces and colons, and colons are not valid in
    # Windows file names. Use a portable stamp, keeping microseconds so
    # back-to-back runs still get distinct files.
    file_stamp = timestamp.strftime('%Y-%m-%d_%H-%M-%S-%f')
    submission_df.to_csv(DATA_PATH + f'prediction_{file_stamp}.csv', index=False)

    print(f"Submission file created successfully.\nDateTime: {timestamp}")

    return submission_df