### # 🚀 Load the train and test datasets and Libraries

In [3]:

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Import the data set
import pandas as pd

# Path to the CSV file
train_path = '/shared_data/Train_Churn.csv'
#test_path = '/shared_data/Churn_data.csv'

# Read the CSV file into a DataFrame
train = pd.read_csv(train_path)
#test = pd.read_csv(test_path)

y = train ['Churn']
train.drop(['customerID'], axis = 1, inplace = True)

#test_ids = test['customerID']

# Display the first few rows of the DataFrame
train.head()

Unnamed: 0,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Monthly,Yes,Manual,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Manual,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Monthly,Yes,Manual,53.85,108.15,Yes
3,Male,0,No,No,45,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Monthly,Yes,Manual,70.7,151.65,Yes


### Data Preprocessing

In [4]:
# Convert 'TotalCharges' to numeric, setting errors='coerce' to handle non-numeric values
train['TotalCharges'] = pd.to_numeric(train['TotalCharges'], errors='coerce')
# test['TotalCharges'] = pd.to_numeric(test['TotalCharges'], errors='coerce')

# Define the numerical and categorical features
numerical_features = ['Tenure', 'MonthlyCharges', 'TotalCharges']  
categorical_features = ['Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
                        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 
                        'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

# Create the preprocessor pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', KNNImputer(n_neighbors=2, weights='distance')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])
preprocessor

### 🏆 Model Building & Evaluation

In [9]:
### Create the Model pipeline, Split data and Train *********************************************
pipeline_class = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

# Fit the pipeline on training data
pipeline_class.fit(X_train, y_train)

### Model Evaluation ****************************************************************************
# Evaluate classifier on validation set
y_pred = pipeline_class.predict(X_test)
val_accuracy = accuracy_score(y_test, y_pred)

# Evaluate classifier on training set
y_train_pred = pipeline_class.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

evaluation_df = pd.DataFrame({
    'Set': ['Training', 'Validation'],
    'Metric': ['Accuracy', 'Accuracy'],
    'Model': ['Classifier', 'Classifier'],
    'Score': [train_accuracy, val_accuracy]
})

# Display the evaluation DataFrame
evaluation_df

Unnamed: 0,Set,Metric,Model,Score
0,Training,Accuracy,Classifier,0.996875
1,Validation,Accuracy,Classifier,0.79



### Final Model Training and Prediction


In [10]:
## Training (on Entire Training Dataset)
pipeline_class.fit(train, y)

### Prediction on Test Data
    ## Model is designed to take the prediction through the preproceesing pipeline

# classifier_predictions = pipeline_class.predict(test)
# output = pd.DataFrame({'customerID': test_ids, 'ChurnPredictions': classifier_predictions})
# output.head()


### Connect to MySQL Database


In [11]:
!pip install pymysql -q
import pymysql

In [12]:
# # connect to MySQL Database
# connection = pymysql.connect(
#     host='172.18.0.5',  # IP address of MySQL container
#     user='user',
#     password='password',
#     database='churn_db'
# )
# cursor = connection.cursor()
print('Connection Succesful my N*gga!')

Connection Succesful my N*gga!


In [13]:
# # Insert the Predictions Into the churn_predictions table
# for index, row in output.iterrows():
#     sql = """
#     UPDATE churn_predictions
#     SET ChurnPrediction = %s
#     WHERE customerID = %s
#     """
#     cursor.execute(sql, (row['ChurnPredictions'], row['customerID']))

# connection.commit()
# cursor.close()
# connection.close()
print('Succesful Inserted!!!')

Succesful Inserted!!!


******************************************************************************

In [None]:
# import pymysql
# import pandas as pd

# # Install pymysql if not already installed
# !pip install pymysql

# # Function to connect to the MySQL database
# def connect_to_db():
#     connection = pymysql.connect(
#         host='172.18.0.5',  # IP address of your MySQL container
#         user='user',
#         password='password',
#         database='churn_db'
#     )
#     return connection

# # Function to create or update the churn_predictions table
# def update_churn_predictions(df):
#     connection = connect_to_db()
#     cursor = connection.cursor()
    
#     # Create table if it doesn't exist
#     cursor.execute("""
#     CREATE TABLE IF NOT EXISTS churn_predictions (
#         customerID VARCHAR(50) PRIMARY KEY,
#         Gender VARCHAR(10),
#         SeniorCitizen BOOLEAN,
#         Partner VARCHAR(3),
#         Dependents VARCHAR(3),
#         Tenure INT,
#         PhoneService VARCHAR(3),
#         MultipleLines VARCHAR(20),
#         InternetService VARCHAR(20),
#         OnlineSecurity VARCHAR(3),
#         OnlineBackup VARCHAR(3),
#         DeviceProtection VARCHAR(3),
#         TechSupport VARCHAR(3),
#         StreamingTV VARCHAR(3),
#         StreamingMovies VARCHAR(3),
#         Contract VARCHAR(20),
#         PaperlessBilling BOOLEAN,
#         PaymentMethod VARCHAR(50),
#         MonthlyCharges DECIMAL(10, 2),
#         TotalCharges DECIMAL(10, 2),
#         ChurnPrediction VARCHAR(3)
#     )
#     """)
    
#     # Update table with predictions
#     for index, row in df.iterrows():
#         sql = """
#         INSERT INTO churn_predictions (customerID, ChurnPrediction)
#         VALUES (%s, %s)
#         ON DUPLICATE KEY UPDATE ChurnPrediction = VALUES(ChurnPrediction)
#         """
#         cursor.execute(sql, (row['customerID'], row['ChurnPredictions']))
    
#     connection.commit()
#     cursor.close()
#     connection.close()

# # Function to create or update the evaluation table
# def update_evaluation_table(df):
#     connection = connect_to_db()
#     cursor = connection.cursor()
    
#     # Create table if it doesn't exist
#     cursor.execute("""
#     CREATE TABLE IF NOT EXISTS evaluation (
#         Set VARCHAR(20),
#         Metric VARCHAR(20),
#         Model VARCHAR(20),
#         Score DECIMAL(10, 2)
#     )
#     """)
    
#     # Insert or update evaluation data
#     for index, row in df.iterrows():
#         sql = """
#         INSERT INTO evaluation (Set, Metric, Model, Score)
#         VALUES (%s, %s, %s, %s)
#         ON DUPLICATE KEY UPDATE Score = VALUES(Score)
#         """
#         cursor.execute(sql, (row['Set'], row['Metric'], row['Model'], row['Score']))
    
#     connection.commit()
#     cursor.close()
#     connection.close()

# # Prepare the churn predictions DataFrame
# classifier_predictions = best_classifier.predict(test)
# churn_predictions_df = pd.DataFrame({'customerID': test_ids, 'ChurnPredictions': classifier_predictions})

# # Update the churn_predictions table
# update_churn_predictions(churn_predictions_df)

# # Prepare the evaluation DataFrame
# evaluation_df = pd.DataFrame({
#     'Set': ['Training', 'Validation'],
#     'Metric': ['Accuracy', 'Accuracy'],
#     'Model': ['Classifier', 'Classifier'],
#     'Score': [train_accuracy, val_accuracy]
# })

# # Update the evaluation table
# update_evaluation_table(evaluation_df)

# # Display the evaluation DataFrame
# evaluation_df