In [None]:
#1. Import Libraries and Load Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv('https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/002/492/original/ola_driver_scaler.csv')

# Check dataset structure
print(df.head())
print(df.info())


In [None]:
# Rename the columns
df.rename(columns={
    'MMM-YY': 'Reporting Date',
    'Dateofjoining': 'Date of Joining',
    'LastWorkingDate': 'Last Working Date'
}, inplace=True)

# Convert the columns to datetime
df['Reporting Date'] = pd.to_datetime(df['Reporting Date'], format='%d/%m/%y')
df['Date of Joining'] = pd.to_datetime(df['Date of Joining'], format='%d/%m/%y')
df['Last Working Date'] = pd.to_datetime(df['Last Working Date'], format='%d/%m/%y')



In [None]:
# Aggregating the data by Driver_ID
df_agg = df.groupby('Driver_ID').agg({
    'Age': 'max',
    'Gender': 'max',
    'City': 'first',
    'Education_Level': 'first',
    'Income': 'mean',
    'Date of Joining': 'first',
    'Last Working Date': 'first',
    'Joining Designation': 'first',
    'Grade': 'mean',
    'Total Business Value': 'sum',
    'Quarterly Rating': 'mean'
}).reset_index()

# Check the aggregated DataFrame
print(df_agg.head())



In [None]:
# Step 2: KNN Imputation
# First, create a subset of numerical columns for KNN imputation
numerical_columns = ['Age', 'Income', 'Total Business Value', 'Quarterly Rating']
knn_imputer = KNNImputer(n_neighbors=5)
df[numerical_columns] = knn_imputer.fit_transform(df[numerical_columns])

# Step 3: Feature Engineering
# Quarterly Rating Increase
df['Quarterly_Rating_Change'] = df.groupby('Driver_ID')['Quarterly Rating'].diff().fillna(0)
df['Rating_Increase'] = np.where(df['Quarterly_Rating_Change'] > 0, 1, 0)

# Target variable creation (1 if driver left the company, 0 otherwise)
df['Target'] = np.where(df['Last Working Date'].notna(), 1, 0)

# Income Increase
df['Income_Change'] = df.groupby('Driver_ID')['Income'].diff().fillna(0)
df['Income_Increase'] = np.where(df['Income_Change'] > 0, 1, 0)

# Tenure calculation (difference in days)
df['Tenure'] = (df['Last Working Date'] - df['Date of Joining']).dt.days
df['Tenure'].fillna((df['Reporting Date'] - df['Date of Joining']).dt.days, inplace=True)

# Business Value Per Day
df['Business_Value_Per_Day'] = df['Total Business Value'] / df['Reporting Date'].dt.days_in_month

# Step 4: Class Imbalance Treatment using SMOTE
# First, split data into features and target
X = df.drop(columns=['Target', 'Last Working Date', 'Date of Joining', 'Reporting Date', 'Driver_ID'])
y = df['Target']

# Encoding categorical variables using One-Hot Encoding
X = pd.get_dummies(X, columns=['Gender', 'City', 'Education_Level', 'Joining Designation', 'Grade'], drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Step 5: Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

# Step 6: Model Building using RandomForest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train_smote)

# Predictions
y_pred = rf_model.predict(X_test_scaled)
y_proba = rf_model.predict_proba(X_test_scaled)[:, 1]

# Step 7: Evaluation
# Classification Report
print(classification_report(y_test, y_pred))

# ROC AUC Score
roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC AUC Score: {roc_auc}')

# Plotting ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='orange', label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


In [None]:
# Create a column indicating whether the Quarterly Rating has increased
df['Quarterly_Rating_Change'] = df.groupby('Driver_ID')['Quarterly Rating'].diff().fillna(0).apply(lambda x: 1 if x > 0 else 0)

# Create a column indicating whether the Monthly Income has increased
df['Income_Change'] = df.groupby('Driver_ID')['Income'].diff().fillna(0).apply(lambda x: 1 if x > 0 else 0)

# Create target variable (1 if driver has left, 0 otherwise)
df['Target'] = df['Last Working Date'].apply(lambda x: 1 if pd.notnull(x) else 0)

# Aggregating the new features by Driver_ID
df_features = df.groupby('Driver_ID').agg({
    'Quarterly_Rating_Change': 'max',
    'Income_Change': 'max',
    'Target': 'max'
}).reset_index()

# Merge the new features with the aggregated data
df_agg = df_agg.merge(df_features, on='Driver_ID', how='left')

# Check the final aggregated DataFrame
print(df_agg.head())


In [None]:
# Univariate Analysis
sns.histplot(df_agg['Age'], kde=True)
plt.show()

sns.countplot(x='Gender', data=df_agg)
plt.show()

# Bivariate Analysis
if 'Target' not in df_agg.columns:
    df_features = df.groupby('Driver_ID').agg({
        'Quarterly_Rating_Change': 'max',
        'Income_Change': 'max',
        'Target': 'max'
    }).reset_index()
    df_agg = df_agg.merge(df_features[['Driver_ID', 'Target']], on='Driver_ID', how='left')

sns.scatterplot(x='Age', y='Income', hue='Target', data=df_agg)
plt.show()

In [None]:
# Generate correlation matrix
plt.figure(figsize=(12,8))
# Selecting only numerical features for correlation analysis
numerical_data = df_agg.select_dtypes(include=np.number)
corr_matrix = numerical_data.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Feature Correlation Matrix')
plt.show()

In [None]:
# One-Hot Encoding for categorical variables
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = encoder.fit_transform(df_agg[['City', 'Education_Level']])

# Standardization of numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_agg[['Age', 'Income', 'Total Business Value', 'Grade']])

# Combine scaled numerical features and encoded categorical features
X = np.hstack((scaled_features, encoded_features))

# Addressing class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, df_agg['Target'])

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Bagging - Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Boosting - Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

# Hyperparameter tuning (example with Random Forest)
param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)


In [None]:
# Classification Report
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("Gradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

# ROC AUC Score and Curve
rf_auc = roc_auc_score(y_test, y_pred_rf)
gb_auc = roc_auc_score(y_test, y_pred_gb)

fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
fpr_gb, tpr_gb, _ = roc_curve(y_test, y_pred_gb)

plt.figure(figsize=(10, 6))
plt.plot(fpr_rf, tpr_rf, label=f"Random Forest (AUC = {rf_auc:.2f})")
plt.plot(fpr_gb, tpr_gb, label=f"Gradient Boosting (AUC = {gb_auc:.2f})")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()


# **ROC Curve Analysis:**

**Random Forest:**   
AUC = 0.78

* This indicates a better overall performance compared to Gradient Boosting.

* The curve is closer to the top-left corner, meaning a better trade-off between true positive and false positive rates.

**Gradient Boosting:**

AUC = 0.73


* Slightly lower AUC compared to Random Forest.

* The curve is less optimal, indicating a poorer trade-off compared to Random Forest.





# **Classification Report Summary**:

**Random Forest:**

* Precision: 0.79 for class 0, 0.77 for class 1.

* Recall: 0.76 for class 0, 0.79 for class 1.

* F1-Score: 0.78 for both classes.

**Gradient Boosting:**

* Precision: 0.75 for class 0, 0.71 for class 1.

* Recall: 0.68 for class 0, 0.77 for class 1.

* F1-Score: 0.72 for class 0, 0.74 for class 1.


#  **Actionable Insights & Recommendations**
1. Model Performance Comparison
* Random Forest outperforms Gradient Boosting in this task, as indicated by its higher AUC (0.78 vs. 0.73). The Random Forest model shows a better trade-off between true positive and false positive rates, making it a more reliable model for predicting driver attrition.

* Precision and recall metrics suggest that Random Forest is slightly more balanced in handling both classes (drivers who stay and those who leave) compared to Gradient Boosting. The F1-Score is equal for both classes in Random Forest, showing consistency in performance.
2. Model Selection Recommendation

* Given the performance metrics, Random Forest should be the primary model used for predicting driver attrition. It provides a better balance between correctly identifying drivers who will leave (class 1) and those who will stay (class 0).
3. Targeted Retention Strategies

* **Identify at-risk drivers:** Utilize the predictions from the Random Forest model to identify drivers at high risk of leaving. Focus retention strategies on these drivers.
* **Personalized incentives:** For drivers predicted to leave, consider personalized incentives such as bonuses, flexible working hours, or tailored engagement programs to improve their satisfaction and reduce attrition.
4. Enhancing Model Features

* **Quarterly Rating Improvement:** Since the quarterly rating is an important feature, focus on interventions that can improve driver ratings. For example, providing additional training or resources to help drivers perform better could lead to higher retention rates.
* **Income Monitoring:** Continuously monitor the income trends of drivers. Drivers with declining income might be more likely to leave. Creating programs to stabilize or increase income for these drivers could reduce attrition.
5. Operational Insights

*  **City-Specific Retention Programs:** If the analysis shows city-specific differences in attrition rates, tailor retention programs to address the unique challenges faced by drivers in those cities.

* **Demographic Tailoring:** Analyze demographic factors like age and education level to identify segments that might be more prone to leaving. Tailor engagement and retention strategies accordingly.
6. Further Model Improvements
* **Hyperparameter Tuning:** While Random Forest is performing well, there might be further room for improvement through hyperparameter tuning. This can potentially enhance model accuracy and the ability to generalize to unseen data.
* **Ensemble Methods:** Explore stacking or other advanced ensemble methods that combine Random Forest and Gradient Boosting, potentially leveraging the strengths of both models.
7. Driver Engagement
* **Feedback Loop:** Implement a feedback mechanism where drivers can voice their concerns or suggestions. This data can be used to enhance the model and improve retention strategies.

* **Regular Monitoring:** Establish a regular monitoring system using the model predictions to track and address attrition risks before they materialize.

In [None]:
# Save feature names after preprocessing
feature_columns = df_agg.columns.tolist()  # Use X from your training data

# Create input widgets
import ipywidgets as widgets
from IPython.display import display
from datetime import date

# Widgets for features with city names
city_mapping = [
    ('Pune', 'C23'),
    ('Mumbai', 'C7'),
    ('Varanasi', 'C13'),
    ('Goa', 'C9'),
    ('Dubai', 'C11')
]

age = widgets.FloatText(description="Age:", value=30)
gender = widgets.Dropdown(options=[('Female', 0), ('Male', 1)], description="Gender:", value=0)
city = widgets.Dropdown(options=city_mapping, description="City:", value='C23')
education = widgets.Dropdown(options=[('High School', 0), ('Bachelor', 1), ('Master', 2)],
                           description="Education Level:", value=1)
income = widgets.FloatText(description="Income:", value=50000)
joining_designation = widgets.Dropdown(options=[('Junior', 1), ('Mid', 2), ('Senior', 3)],
                                     description="Joining Designation:", value=2)
grade = widgets.Dropdown(options=[('I', 1), ('II', 2), ('III', 3)], description="Grade:", value=2)
total_business_value = widgets.FloatText(description="Business Value:", value=100000)
quarterly_rating = widgets.FloatText(description="Quarterly Rating:", value=3.0)
date_of_joining = widgets.DatePicker(description="Joining Date:", value=date(2020, 1, 1))
reporting_date = widgets.DatePicker(description="Reporting Date:", value=date(2023, 1, 1))

# Prediction button and output
button = widgets.Button(description="Predict", button_style='success')
output = widgets.Output()

# Display widgets
display(widgets.VBox([
    widgets.HBox([age, gender]),
    widgets.HBox([city, education]),
    widgets.HBox([income, total_business_value]),
    widgets.HBox([joining_designation, grade]),
    quarterly_rating,
    widgets.HBox([date_of_joining, reporting_date]),
    button,
    output
]))

def on_button_click(b):
    with output:
        output.clear_output()
        try:
            # Calculate derived features
            tenure = (reporting_date.value - date_of_joining.value).days
            business_value_per_day = total_business_value.value / reporting_date.value.day

            # Create input DataFrame
            input_data = pd.DataFrame([{
                'Age': age.value,
                'Gender': gender.value,
                'City': city.value,
                'Education_Level': education.value,
                'Income': income.value,
                'Joining Designation': joining_designation.value,
                'Grade': grade.value,
                'Total Business Value': total_business_value.value,
                'Quarterly Rating': quarterly_rating.value,
                'Tenure': tenure,
                'Business_Value_Per_Day': business_value_per_day,
                'Quarterly_Rating_Change': 0,  # Default value
                'Income_Change': 0             # Default value
            }])

            # Preprocess input
            input_scaled = preprocess_input(input_data, feature_columns, scaler)

            # Make prediction
            proba = rf_model.predict_proba(input_scaled)[0][1]
            prediction = "High Risk" if proba >= 0.5 else "Low Risk"

            # Display results
            print("🚨 Driver Attrition Prediction Report 🚨")
            print("="*45)
            print(f"Prediction: {prediction} ({proba*100:.1f}% probability)")
            print("\n🔍 Key Risk Factors:")
            print(f"- City: {dict(city_mapping)[city.value]} (Code: {city.value})")
            print(f"- Quarterly Rating: {quarterly_rating.value}/5")
            print(f"- Tenure: {tenure} days ({tenure//365} years)")
            print(f"- Income: ₹{income.value:,.0f}")
            print(f"- Education Level: {dict(education.options)[education.value]}")
            print(f"- Business Value/Day: ₹{business_value_per_day:,.0f}")

        except Exception as e:
            print(f"❌ Error: {str(e)}")

# Connect button to function
button.on_click(on_button_click)