<a href="https://colab.research.google.com/github/CurtisLuu/RIT_Churn_Predictor/blob/main/week_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Read the one-hot encoded export. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk, which avoids
# the mixed-type DtypeWarning.
data_path = '/content/week3_Cleaned_OHE_Dataset.csv'  # Update this path if needed
df = pd.read_csv(data_path, low_memory=False).dropna(subset=['Churn'])
# ^ rows with a missing 'Churn' label are discarded: they cannot be used for
#   supervised training or evaluation.

# Target is the 'Churn' column; every other column is treated as a feature.
y = df['Churn']
X = df.drop(columns=['Churn'])

# Expand date-like text columns into numeric year / month / day features.
#
# Only object-dtype columns are candidates. Each candidate is parsed into a
# local variable first and X is modified only after every derived part has been
# computed, so a column that turns out not to be a date is left exactly as it
# was (no half-converted state).
for col in list(X.columns):
    if X[col].dtype != 'object':
        continue

    try:
        parsed = pd.to_datetime(X[col])
        date_parts = {
            f'{col}_year': parsed.dt.year,
            f'{col}_month': parsed.dt.month,
            f'{col}_day': parsed.dt.day,
        }
    except (ValueError, TypeError, OverflowError, AttributeError):
        # Not parseable as dates (or the result is not datetime-like).
        # Deliberately best-effort: leave the column for the numeric coercion
        # below. Only parse-related errors are caught, so KeyboardInterrupt,
        # SystemExit and genuine bugs are no longer silently swallowed.
        continue

    # Append the derived parts, then drop the original text column.
    for part_name, part_values in date_parts.items():
        X[part_name] = part_values
    X = X.drop(columns=[col])

# Ensure all data is numeric. Values that cannot be converted (including any
# remaining non-date text) become NaN and are then replaced with 0, as are the
# year/month/day parts of unparseable or missing dates.
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

# --- Train / test split -----------------------------------------------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Model ------------------------------------------------------------------
# Logistic regression with the liblinear solver; fit() returns the estimator
# itself, so construction and training are chained.
log_reg = LogisticRegression(max_iter=1000, solver='liblinear').fit(X_train, y_train)

# --- Coefficient leaderboard ------------------------------------------------
# coef_[0] is the first (for a binary target, the only) row of coefficients.
# NOTE(review): features are not standardized before fitting in this cell, so
# coefficient magnitudes are only directly comparable between features on the
# same scale - confirm this is acceptable for the ranking below.
coefficients = log_reg.coef_[0]
feature_names = X.columns

coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
coef_df['Odds Ratio'] = np.exp(coef_df['Coefficient'])
coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()  # sort key only

# Keep the 50 features with the largest absolute coefficient.
ranked_by_magnitude = coef_df.sort_values(by='Abs_Coefficient', ascending=False)
top_50_features = ranked_by_magnitude.head(50)

print("Top 50 Factors Influencing Churn:")
print(top_50_features[['Feature', 'Coefficient', 'Odds Ratio']])

# --- Evaluation on the held-out set -----------------------------------------
y_pred = log_reg.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")


Top 50 Factors Influencing Churn:
                                                Feature  Coefficient  \
3735                  Status Description_Team Allocated    -4.815850   
3732                        Status Description_Rejected     4.611860   
3734                         Status Description_Started    -4.295401   
3731                     Status Description_Dropped Out     3.696315   
3737                        Status Description_Withdraw     1.826699   
3733                   Status Description_Rewards Award    -0.774099   
3492              Current/Intended Major_DATA ANALYTICS     0.270246   
3386            Current Student Status_Not in Education     0.258010   
1641                                      Country_INDIA     0.241886   
1681                                         Country_US     0.210678   
3025                    Institution Name_SRM UNIVERSITY     0.207881   
3591         Current/Intended Major_INFORMATION SYSTEMS     0.165646   
1660                          