In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB

# Load the dataset
data = pd.read_csv(r'/Users/aceboogie/Desktop/churn_clean.csv')

# Step 1: Select relevant features
selected_features = ['Age', 'Income', 'Marital', 'Gender', 'Contract', 'Techie',
                     'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                     'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
                     'PaymentMethod', 'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year', 'Churn']
data = data[selected_features]

# Step 2: Encode categorical variables using one-hot encoding
encoded_data = pd.get_dummies(data, columns=['Marital', 'Gender', 'Contract', 'Techie', 'InternetService',
                                            'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                                            'TechSupport', 'StreamingTV', 'StreamingMovies',
                                            'PaperlessBilling', 'PaymentMethod'])

# Step 3: Scale continuous variables using MinMaxScaler
scaler = MinMaxScaler()
continuous_features = ['Age', 'Income', 'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year']
encoded_data[continuous_features] = scaler.fit_transform(encoded_data[continuous_features])

# Save the preprocessed data to an Excel file
encoded_data.to_excel('preprocessed_data.xlsx', index=False)

# Step 4: Split the dataset into training and testing sets
X = encoded_data.drop('Churn', axis=1)
y = encoded_data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train the Naive Bayes classifier and make predictions
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv(r'/Users/aceboogie/Desktop/churn_clean.csv')

# Split the dataset into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Save the training and test sets as separate CSV files
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB

# Load the dataset
data = pd.read_csv(r'/Users/aceboogie/Desktop/churn_clean.csv')

# Step 1: Select relevant features
selected_features = ['Age', 'Income', 'Marital', 'Gender', 'Contract', 'Techie',
                     'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                     'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
                     'PaymentMethod', 'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year', 'Churn']
data = data[selected_features]

# Step 2: Encode categorical variables using one-hot encoding
encoded_data = pd.get_dummies(data, columns=['Marital', 'Gender', 'Contract', 'Techie', 'InternetService',
                                            'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                                            'TechSupport', 'StreamingTV', 'StreamingMovies',
                                            'PaperlessBilling', 'PaymentMethod'])

print("Encoded Data:\n", encoded_data.head())

# Step 3: Scale continuous variables using MinMaxScaler
scaler = MinMaxScaler()
continuous_features = ['Age', 'Income', 'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year']
encoded_data[continuous_features] = scaler.fit_transform(encoded_data[continuous_features])

print("\nScaled Data:\n", encoded_data.head())

# Step 4: Split the dataset into training and testing sets
X = encoded_data.drop('Churn', axis=1)
y = encoded_data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nX_train:\n", X_train.head())
print("\nX_test:\n", X_test.head())
print("\ny_train:\n", y_train.head())
print("\ny_test:\n", y_test.head())

# Step 5: Train the Naive Bayes classifier and make predictions
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("\nPredicted Churn (y_pred):\n", y_pred[:10])


Encoded Data:
    Age    Income     Tenure  MonthlyCharge  Bandwidth_GB_Year Churn  \
0   68  28561.99   6.795513     172.455519         904.536110    No   
1   27  21704.77   1.156681     242.632554         800.982766   Yes   
2   50   9609.57  15.754144     159.947583        2054.706961    No   
3   48  18925.23  17.087227     119.956840        2164.579412    No   
4   83  40074.19   1.670972     149.948316         271.493436   Yes   

   Marital_Divorced  Marital_Married  Marital_Never Married  \
0                 0                0                      0   
1                 0                1                      0   
2                 0                0                      0   
3                 0                1                      0   
4                 0                0                      0   

   Marital_Separated  ...  StreamingTV_No  StreamingTV_Yes  \
0                  0  ...               1                0   
1                  0  ...               0              

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, roc_auc_score

# Load the dataset
data = pd.read_csv(r'/Users/aceboogie/Desktop/churn_clean.csv')

# Step 1: Select relevant features
selected_features = ['Age', 'Income', 'Marital', 'Gender', 'Contract', 'Techie',
                      'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                     'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
                     'PaymentMethod', 'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year', 'Churn']
data = data[selected_features]

# Convert 'Churn' column to numeric values
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

# Identify columns with missing or infinite values
print("Columns with missing or infinite values:")
for col in data.columns:
    if data[col].isnull().any() or data[col].isin([float("inf"), float("-inf")]).any():
        print(col)

# Handle missing values by filling them with the median of the respective column
data.fillna(data.median(), inplace=True)

# Check for any remaining missing or infinite values
print("Columns with remaining missing or infinite values:")
for col in data.columns:
    if data[col].isnull().any() or data[col].isin([float("inf"), float("-inf")]).any():
        print(col)

# Step 2: Encode categorical variables using one-hot encoding
encoded_data = pd.get_dummies(data, columns=['Marital', 'Gender', 'Contract', 'Techie', 
                                            'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                                            'TechSupport', 'StreamingTV', 'StreamingMovies',
                                            'PaperlessBilling', 'PaymentMethod'])

# Step 3: Scale continuous variables using MinMaxScaler
scaler = MinMaxScaler()
continuous_features = ['Age', 'Income', 'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year']
encoded_data[continuous_features] = scaler.fit_transform(encoded_data[continuous_features])

# Step 4: Split the dataset into training and testing sets
X = encoded_data.drop('Churn', axis=1)
y = encoded_data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train the Naive Bayes classifier and make predictions
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Calculate the accuracy and AUC
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")







Columns with missing or infinite values:
Columns with remaining missing or infinite values:
Accuracy: 0.8805
AUC: 0.8534461861667744


  data.fillna(data.median(), inplace=True)
