Final Project - analyzing ML models

In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from pymongo import MongoClient
import joblib

# Connect to MongoDB and load data
client = MongoClient("mongodb://localhost:27017/")
db = client['BDA']
collection = db['Customers']

# Load data from MongoDB into a DataFrame
cursor = collection.find()
churn = pd.DataFrame(list(cursor))

# Drop MongoDB-specific ID column if present
if '_id' in churn.columns:
    churn = churn.drop(columns=['_id'])

# Normalize nested 'Services' field
services_churn = pd.json_normalize(churn['Services'])
churn = churn.drop(columns=['Services'])
churn = pd.concat([churn, services_churn], axis=1)

churn = churn.rename(columns=str.lower)

# Remove "id" from customerid column
churn['customerid'] = churn['customerid'].str[2:]

# Encode categorical variables
label_encoder_contract = LabelEncoder()
label_encoder_internetservice = LabelEncoder()
label_encoder_paymentmethod = LabelEncoder()

churn['contract_code'] = label_encoder_contract.fit_transform(churn['contract'])
churn['internetservice_code'] = label_encoder_internetservice.fit_transform(churn['internetservice'])
churn['paymentmethod_code'] = label_encoder_paymentmethod.fit_transform(churn['paymentmethod'])

churn['totalcharges'] = pd.to_numeric(churn['totalcharges'], errors='coerce')

def convert_yes_no_to_int(df, column_name):
    unique_values = set(value for sublist in df[column_name].dropna() for value in (sublist if isinstance(sublist, list) else [sublist]))
    mapping = {value: idx for idx, value in enumerate(unique_values)}
    df[column_name] = df[column_name].apply(lambda x: [mapping[item] for item in (x if isinstance(x, list) else [x])] if pd.notna(x) else [0])
    df[column_name] = df[column_name].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x)
    return df, mapping

columns_to_convert = [
    'partner', 'dependents', 'phoneservice', 'multiplelines',
    'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
    'streamingtv', 'streamingmovies', 'paperlessbilling', 'churn'
]

mapping_dict = {}
for column in columns_to_convert:
    if column in churn.columns:
        churn, mapping = convert_yes_no_to_int(churn, column)
        mapping_dict[column] = mapping

churn['gender'] = churn['gender'].map({'Male': 0, 'Female': 1}).astype(int)

# Prepare the DataFrame for prediction
df = churn.copy()
df = df.drop(columns=['contract', 'paymentmethod', 'internetservice'])

# Add a new column 'churn' with random values of 1 and 0
np.random.seed(42)  # Set seed for reproducibility
df['churn'] = np.random.randint(0, 2, size=len(df))

# Convert all columns to floats
df = df.astype(float)

# Ensure expected features are present
expected_features = [
    'gender', 'seniorcitizen', 'partner', 'dependents',
    'tenure', 'phoneservice', 'multiplelines', 'internetservice_code', 'onlinesecurity',
    'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
    'contract_code', 'paperlessbilling', 'paymentmethod_code', 'monthlycharges', 'totalcharges'
]

for feature in expected_features:
    if feature not in df.columns:
        print(f"Feature {feature} is missing from the data.")
        df[feature] = 0.0

df_filtered = df[expected_features]

# Impute missing values
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df_filtered), columns=df_filtered.columns)

# Convert the DataFrame to a NumPy array for prediction
df_pred_array = df_imputed.to_numpy()

# Load the trained model
model = joblib.load(r'C:\Users\emmag\Python\Final Project\churn_knn_model.sav')

# Make predictions
predictions = model.predict(df_pred_array)

# Create a DataFrame with predictions
churn_depthtree_output = pd.DataFrame({
    'customerid': churn['customerid'].repeat(len(predictions) // len(churn['customerid'])),
    'actually_churned': churn['churn'].repeat(len(predictions) // len(churn['churn'])),
    'churned_predicted_by_model': predictions
})

# Display the resulting DataFrame
churn_depthtree_output

# Optionally, save the output to a CSV file
churn_depthtree_output.to_csv('churn_predictions.csv', index=False)


KeyError: 'churn'