In [1]:
!pip uninstall scikit-learn -y

Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1


In [2]:
!pip install scikit-learn==1.2.0

Collecting scikit-learn==1.2.0
  Downloading scikit_learn-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.33.0 requires scikit-learn>=1.2.2, but you have scikit-learn 1.2.0 which is incompatible.
mlxtend 0.23.3 requires scikit-learn>=1.3.1, but you have scikit-learn 1.2.0 which is incompatible.
imbalanced-learn 0.13.0 requires scikit-learn<2,>=1.3.2, but you have scikit-learn 1.2.0 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.2.0


In [3]:
!pip install --upgrade xgboost



In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report
import pickle

In [6]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [7]:
def load_data():
    data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
    data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
    data['TotalCharges'].fillna(0, inplace=True)
    categorical_columns = data.select_dtypes(include=['object']).columns.drop(['customerID', 'Churn'])
    data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
    label_encoder = LabelEncoder()
    data_encoded['Churn'] = label_encoder.fit_transform(data_encoded['Churn'])
    return data_encoded

In [8]:
# Feature scaling
def preprocess_data(data):
    numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
    scaler = StandardScaler()
    data[numeric_features] = scaler.fit_transform(data[numeric_features])
    return data, scaler

In [9]:
# Split the dataset
def split_data(data):
    X = data.drop(columns=['customerID', 'Churn'])
    y = data['Churn']
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
# Train models and find the best model
def train_models(X_train, y_train):
    # Logistic Regression
    logistic_params = {'C': [0.1, 1, 10], 'penalty': ['l2'], 'solver': ['lbfgs']}
    logistic_grid = GridSearchCV(
        LogisticRegression(random_state=42, max_iter=500),
        param_grid=logistic_params,
        scoring='roc_auc',
        cv=5
    )
    logistic_grid.fit(X_train, y_train)

    # Random Forest
    rf_params = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
    rf_grid = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid=rf_params,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1
    )
    rf_grid.fit(X_train, y_train)

    # XGBoost
    xgb_params = {
        'n_estimators': [100, 200],
        'learning_rate': [0.1, 0.01],
        'max_depth': [3, 6]
    }
    xgb_grid = GridSearchCV(
        XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        param_grid=xgb_params,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1
    )
    xgb_grid.fit(X_train, y_train)

    # Find the best model
    models = {
        'Logistic Regression': (logistic_grid.best_estimator_, logistic_grid.best_score_),
        'Random Forest': (rf_grid.best_estimator_, rf_grid.best_score_),
        'XGBoost': (xgb_grid.best_estimator_, xgb_grid.best_score_)
    }
    best_model_name = max(models, key=lambda x: models[x][1])
    best_model, best_score = models[best_model_name]

    return best_model, best_model_name, best_score

In [12]:
import pickle
feature_names = X_train.columns.tolist()  # X_train is the training dataset used
with open("feature_names.pkl", "wb") as file:
    pickle.dump(feature_names, file)

In [11]:
if __name__ == "__main__":
    # Load and preprocess data
    data = load_data()
    data, scaler = preprocess_data(data)
    X_train, X_test, y_train, y_test = split_data(data)

    # Train and select the best model
    best_model, best_model_name, best_score = train_models(X_train, y_train)
    print(f"The best model is {best_model_name} with an AUC score of {best_score:.2f}")

    # Save the best model
    with open("best_model.pkl", "wb") as file:
        pickle.dump(best_model, file)

    # Save the scaler for preprocessing user inputs
    with open("scaler.pkl", "wb") as file:
        pickle.dump(scaler, file)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['TotalCharges'].fillna(0, inplace=True)


The best model is XGBoost with an AUC score of 0.85


Parameters: { "use_label_encoder" } are not used.



In [13]:
! pip install streamlit -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [14]:
!wget -q -O - ipv4.icanhazip.com

34.125.125.102


In [15]:
!npm install -g localtunnel@2.0.2

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K
added 22 packages in 4s
[1G[0K⠴[1G[0K
[1G[0K⠴[1G[0K3 packages are looking for funding
[1G[0K⠴[1G[0K  run `npm fund` for details
[1G[0K⠴[1G[0K

In [16]:
! streamlit run streamlit_main.py & npx localtunnel --port 8501

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.125.102:8501[0m
[0m
your url is: https://warm-impalas-play.loca.lt
[34m  Stopping...[0m
^C
