In [None]:
# Import necessary libraries
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


1. Business Understanding
Define Stakeholder and Business Problem: Identify SyriaTel as the stakeholder and the problem of predicting customer churn.
Justify why classification is suitable for the problem context: Explain why predicting customer churn is a classification problem.


2. Data Understanding
Choose a Dataset and Explain the Stakeholder Audience: Introduce the dataset and explain that the audience is the telecom business.
Explore and Describe the Dataset: Analyze the features, distribution, and any initial insights.

3. Data Preparation
Handle Missing Data: Address any missing values in the dataset.
Deal with Non-numeric Data: Convert categorical data into numeric format.
Prevent Data Leakage: Ensure proper separation of training and testing data.
Scale Data (if applicable): If using distance-based models, scale the data.
Feature Engineering (optional): Create new features if needed.

## Data Preparation and Preprocessing

In [None]:
# Read the churn CSV into a DataFrame.
# NOTE(review): file_path is a placeholder location; point it at the real CSV before running.
file_path = "path/to/your/file/bigml_59c28831336c6604c800002a.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Missing data: discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Convert the non-numeric columns to integers.
# The same encoder object is re-fitted for each plan column, so after this cell
# it only remembers the classes of the last column it was fitted on.
# NOTE(review): assumes the CSV header uses exactly these capitalised column names - confirm against the file.
label_encoder = LabelEncoder()
for plan_column in ('International plan', 'Voice mail plan'):
    df[plan_column] = label_encoder.fit_transform(df[plan_column])

# The target is cast to int so the models receive a numeric label.
df['Churn'] = df['Churn'].astype(int)

In [None]:
# Remove the columns that are not used as predictors.
df = df.drop(columns=['State', 'Phone number'])

In [None]:

# Split the data into training and testing sets
# Features are every column except the target; the target is the 'Churn' column.
X = df.drop('Churn', axis=1)
# `y` was never assigned in the original cell, so train_test_split raised NameError.
y = df['Churn']
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Building Models



# 1. Logistic Regression


In [None]:
# Fit logistic regression on the training split and predict the held-out test split.
log_reg_model = LogisticRegression(random_state=42)
log_reg_model.fit(X_train, y_train)
y_pred_log_reg = log_reg_model.predict(X_test)

# The metrics must read `y_pred_log_reg`, the name assigned above and reused by the
# comparison cells; the original referenced `y_pred_logreg`, which does not exist.
print("Logistic Regression Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log_reg)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_log_reg))


# 2. Random Forest


In [None]:
# Fit a random forest on the training split, then predict the held-out test split.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Report accuracy and the per-class precision/recall/F1 table.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("\nRandom Forest Metrics:")
print(f"Accuracy: {rf_accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))


# 3. Support Vector Machine (SVM)


In [None]:
# Fit a support vector classifier on the training split, then predict the held-out test split.
svm_model = SVC(random_state=42).fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Report accuracy and the per-class precision/recall/F1 table.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("\nSupport Vector Machine Metrics:")
print(f"Accuracy: {svm_accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))


# 4. K-Nearest Neighbors (KNN)


In [None]:
# Fit k-nearest neighbours (library defaults) on the training split, then predict the test split.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

# Report accuracy and the per-class precision/recall/F1 table.
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("\nK-Nearest Neighbors Metrics:")
print(f"Accuracy: {knn_accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred_knn))


# 5. Decision Tree


In [None]:
# Fit a decision tree on the training split, then predict the held-out test split.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Report accuracy and the per-class precision/recall/F1 table.
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("\nDecision Tree Metrics:")
print(f"Accuracy: {dt_accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))

# Evaluate Models


In [None]:
# Pair each display name with that model's test-set predictions, then split the
# pairs into the two parallel lists that the comparison cells index together.
named_predictions = [
    ('Logistic Regression', y_pred_log_reg),
    ('Random Forest', y_pred_rf),
    ('SVM', y_pred_svm),
    ('KNN', y_pred_knn),
    ('Decision Tree', y_pred_dt),
]
models = [name for name, _ in named_predictions]
predictions = [y_pred for _, y_pred in named_predictions]

# 1. Confusion Matrix Comparison


In [None]:
# Draw one confusion-matrix heatmap per model.
# The original 2x2 grid had four slots for five models, so the fifth subplot call failed.
# The grid is now sized from the number of models.
n_cols = 3
n_rows = -(-len(models) // n_cols)  # ceiling division without importing math
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 8), squeeze=False)
flat_axes = axes.ravel()

for ax, model_name, y_pred in zip(flat_axes, models, predictions):
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    # confusion_matrix puts true labels on rows and predicted labels on columns.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

# Hide any grid slots left over when the model count does not fill the grid.
for unused_ax in flat_axes[len(models):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

# 2. ROC Curve Comparison


In [None]:
# Fitted estimators, listed in the same order as the display names in `models`.
# ROC curves need continuous scores from the estimators themselves; the original
# called predict_proba on the arrays of predicted labels, which have no such method.
fitted_models = [log_reg_model, rf_model, svm_model, knn_model, dt_model]

fig, ax = plt.subplots(figsize=(8, 6))
for model_name, estimator in zip(models, fitted_models):
    if hasattr(estimator, "predict_proba"):
        # Column 1 holds the probability of the positive class (Churn == 1).
        y_score = estimator.predict_proba(X_test)[:, 1]
    else:
        # Estimators without predict_proba (e.g. an SVC built without
        # probability=True) are scored by their signed decision values instead.
        y_score = estimator.decision_function(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')

# Diagonal reference line: the ROC of a classifier that guesses at random.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend(loc="lower right")
plt.show()

# 3. Feature Importance Comparison


In [None]:
# Plot the ten largest feature importances of the fitted random forest.
# The original looped over `models`, which holds display-name strings, and read
# `.feature_importances_` from a string; the fitted estimator `rf_model` is used instead.
fig, ax = plt.subplots(figsize=(10, 6))
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
# Sort ascending so the horizontal bar chart shows the most important feature at the top.
top_features = feature_importances.nlargest(10).sort_values()
top_features.plot(kind='barh', ax=ax, label='Random Forest')

ax.set_title('Top 10 Feature Importance Comparison - Random Forest')
ax.set_xlabel('Importance')
ax.legend()
plt.show()

# 4. Precision-Recall Curve Comparison


In [None]:
# Fitted estimators, listed in the same order as the display names in `models`.
# Precision-recall curves need continuous scores from the estimators themselves; the
# original called predict_proba on the arrays of predicted labels, which have no such method.
fitted_models = [log_reg_model, rf_model, svm_model, knn_model, dt_model]

fig, ax = plt.subplots(figsize=(8, 6))
for model_name, estimator in zip(models, fitted_models):
    if hasattr(estimator, "predict_proba"):
        # Column 1 holds the probability of the positive class (Churn == 1).
        y_score = estimator.predict_proba(X_test)[:, 1]
    else:
        # Estimators without predict_proba (e.g. an SVC built without
        # probability=True) are scored by their signed decision values instead.
        y_score = estimator.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    ax.plot(recall, precision, label=model_name)

ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve Comparison')
ax.legend()
plt.show()

# 5. Model Comparison - Accuracy


In [None]:
# Test-set accuracy for each model, in the same order as `models`.
accuracies = []
for y_pred in predictions:
    accuracies.append(accuracy_score(y_test, y_pred))

# One bar per model, each in its own colour.
bar_colors = ['blue', 'green', 'red', 'purple', 'orange']
plt.figure(figsize=(12, 6))
plt.bar(models, accuracies, color=bar_colors)
plt.title('Model Comparison - Accuracy')
plt.ylabel('Accuracy')
plt.show()

In [None]:

# Save the model
# The original call passed `classifier`, a name that is never defined in this notebook.
# The fitted random forest is persisted instead.
# NOTE(review): swap in a different fitted estimator if another model is chosen as the final one.
joblib.dump(rf_model, 'churn_classifier_model.pkl')
