In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.over_sampling import RandomOverSampler

In [2]:
churn_df = pd.read_csv("../data/task_data_training.csv")


In [3]:
churn_df.head()

Unnamed: 0,value_number_of_active_months,revenue,value_days_to_purchase,action_create_project,value_transactions_number,ws_users_activated,action_export_report,action_create_invoice,value_regular_seats,action_project_budget,action_time_entries_via_tracker,action_screenshots,churned_status
0,0,184.925,2,5,6,3,8,0,3,0,0,0.0,No
1,3,395.122,98,3,12,2,3,0,3,9,0,1.0,No
2,1,25.974,0,0,2,1,0,0,1,0,0,1.0,Yes
3,1,406.068,53,3,12,3,0,0,3,0,0,1.0,No
4,2,25.974,1,5,2,1,8,6,1,0,0,0.0,No


In [4]:
X = churn_df.drop(['churned_status'], axis=1)
y = churn_df['churned_status']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Define the hyperparameter grid to search
param_grid = {
    'C': [0.1, 1, 10],                # Regularization parameter
    'kernel': ['linear', 'rbf', 'sigmoid'],      # Kernel type
    'gamma': ['scale', 'auto', 0.1],  # Kernel coefficient (only for 'rbf' kernel)
}

# Create an SVM classifier
svm_classifier = SVC(random_state=42)

# Create a GridSearchCV object to perform hyperparameter tuning
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, scoring='balanced_accuracy', cv=5)

# Fit the grid search to your data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Use the best hyperparameters to create a new SVM classifier
best_svm_classifier = SVC(random_state=42, **best_params)

# Fit the new classifier to the training data
best_svm_classifier.fit(X_train, y_train)

# Make predictions on the test data using the best model
y_pred_svm = best_svm_classifier.predict(X_test)

# Evaluate the best SVM model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
confusion_svm = confusion_matrix(y_test, y_pred_svm)
classification_rep_svm = classification_report(y_test, y_pred_svm)

print("Best SVM Model after Hyperparameter Tuning:")
print(f"Best Hyperparameters: {best_params}")
print(f"Accuracy: {accuracy_svm}")
print(f"Confusion Matrix:\n{confusion_svm}")
print(f"Classification Report:\n{classification_rep_svm}")

Best SVM Model after Hyperparameter Tuning:
Best Hyperparameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy: 0.7288629737609329
Confusion Matrix:
[[173  47]
 [ 46  77]]
Classification Report:
              precision    recall  f1-score   support

          No       0.79      0.79      0.79       220
         Yes       0.62      0.63      0.62       123

    accuracy                           0.73       343
   macro avg       0.71      0.71      0.71       343
weighted avg       0.73      0.73      0.73       343



As we see here, our traditional fine tuning didn't work because we still have a lot of false positives and a lot of false negatives.  So will try now with over/undersampling techniques.