In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

import pickle

In [2]:
churn_df = pd.read_csv("../data/task_data_training.csv")


In [3]:
churn_df.head()

Unnamed: 0.1,Unnamed: 0,value_number_of_active_months,revenue,value_days_to_purchase,action_create_project,value_transactions_number,ws_users_activated,action_export_report,action_create_invoice,value_regular_seats,action_project_budget,action_time_entries_via_tracker,action_screenshots,action_create_expense,action_lock_entries,ws_users_deactivated,action_start_trial,churned_status
0,0,0,184.925,2,5,6,3,8,0,3,0,0,0.0,0,0,2,0,No
1,1,3,395.122,98,3,12,2,3,0,3,9,0,1.0,0,10,0,0,No
2,2,1,25.974,0,0,2,1,0,0,1,0,0,1.0,0,0,0,0,Yes
3,3,1,406.068,53,3,12,3,0,0,3,0,0,1.0,0,0,0,0,No
4,4,2,25.974,1,5,2,1,8,6,1,0,0,0.0,0,0,0,0,No


In [4]:
X = churn_df.drop(['churned_status'], axis=1)
y = churn_df['churned_status']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Define the hyperparameter grid to search
param_grid = {
    'C': [0.1, 1, 10],                # Regularization parameter
    'kernel': ['linear', 'sigmoid'],      # Kernel type
}

# Create an SVM classifier
svm_classifier = SVC(random_state=42)

# Create a GridSearchCV object to perform hyperparameter tuning
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, scoring='balanced_accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Using the best params to create final classifier
best_params = grid_search.best_params_
best_svm_classifier = SVC(random_state=42, **best_params)
best_svm_classifier.fit(X_train, y_train)

# Measuring model performance
y_pred_svm = best_svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
confusion_svm = confusion_matrix(y_test, y_pred_svm)
classification_rep_svm = classification_report(y_test, y_pred_svm)

print("Best SVM Model after Hyperparameter Tuning:")
print(f"Best Hyperparameters: {best_params}")
print(f"Accuracy: {accuracy_svm}")
print(f"Confusion Matrix:\n{confusion_svm}")
print(f"Classification Report:\n{classification_rep_svm}")

Best SVM Model after Hyperparameter Tuning:
Best Hyperparameters: {'C': 0.1, 'kernel': 'linear'}
Accuracy: 0.8833819241982507
Confusion Matrix:
[[202  18]
 [ 22 101]]
Classification Report:
              precision    recall  f1-score   support

          No       0.90      0.92      0.91       220
         Yes       0.85      0.82      0.83       123

    accuracy                           0.88       343
   macro avg       0.88      0.87      0.87       343
weighted avg       0.88      0.88      0.88       343



As we can see here there is lot less FP and FN compared to TP and TN. In the future work, we would like to reduce this rate of FN even more. Because, if the model says that someone is negative and is actually positive, company would lose a client without even trying to prevent that.


### Exporting the best model along with model features

In [10]:
with open('../artifacts/churn_model.pkl', 'wb') as model_file:
    pickle.dump(best_svm_classifier, model_file)

In [11]:
with open('../artifacts/churn_model_features.pkl', 'wb') as model_file:
    feature_names = list(best_svm_classifier.feature_names_in_)
    pickle.dump(feature_names, model_file)
