In [10]:
#  import necessary modules & libraries
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

In [11]:
# read data from file
df = pd.read_csv("../cleaned_df.csv", index_col=0)

In [12]:
df.head(2)

Unnamed: 0,available_extra_rooms_in_hospital,bed_grade,severity_of_illness,visitors_with_patient,admission_deposit,hospital_code_1,hospital_code_2,hospital_code_3,hospital_code_4,hospital_code_5,...,age_11-20,age_21-30,age_31-40,age_41-50,age_51-60,age_61-70,age_71-80,age_81-90,age_91-100,stay
0,3,1,0,2,4911.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0-10
1,2,1,0,2,5954.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,41-50


SVM Algorithm takes a really long time to train on a big dataset; therefore, I would reduce the size but maintain the proportion of classes in the target variable.

In [13]:
# Define the sample size
sample_size = 10000

# Define the desired sample size
sample_size = 10000

# Get the class distribution
class_counts = df["stay"].value_counts().sort_values(ascending=False)

# Calculate the sample size for each stratum (proportional to class distribution)
sample_size_per_class = (sample_size * class_counts / class_counts.sum()).astype(int)

# Create a stratified sample
stratified_sample = pd.DataFrame()
for class_label, sample_count in sample_size_per_class.items():
# Check if there are enough elements in the class to avoid sampling errors
  if sample_count > 0:
    stratified_sample = pd.concat([stratified_sample, df[df["stay"] == class_label].sample(sample_count)])
 
stratified_sample.head(2)



Unnamed: 0,available_extra_rooms_in_hospital,bed_grade,severity_of_illness,visitors_with_patient,admission_deposit,hospital_code_1,hospital_code_2,hospital_code_3,hospital_code_4,hospital_code_5,...,age_11-20,age_21-30,age_31-40,age_41-50,age_51-60,age_61-70,age_71-80,age_81-90,age_91-100,stay
307063,3,1,1,2,5648.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,21-30
119059,4,1,1,2,5847.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,21-30


In [14]:
X = stratified_sample.drop(columns=["stay"])
y = stratified_sample["stay"]

In [15]:


# split data with standard 80%/10%/10% split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


In [16]:
# Check the shape of X_train, X_val and X_test
print(X_train.shape, X_val.shape, X_test.shape)

(7995, 125) (999, 125) (1000, 125)


## Model Development

In [17]:
# instantiate classifier with default hyperparameters
# default hyperparameters are: kernel='rbf', C=1 and gamma='scale'
svc=SVC(random_state=42)

# fit classifier to training set
svc.fit(X_train, y_train)

# make predictions on validation set
y_pred = svc.predict(X_val)

# calculate and print the accuracy
accuracy = accuracy_score(y_val, y_pred)
print('Accuracy: ',accuracy)


Accuracy:  0.2972972972972973


The accuracy above is produced with default hyperparameters which are kernel='rbf', C=1 and gamma='scale'. Due to time constraint, it is impossible to Gridsearch with a variety of other values. 

My goal is to develop a training model that produces better accuracy with time constraint. Therefore, I will use my judgement to choose the appropriate hyperparameters.

In [18]:
param_grid = {"C": [0.1, 1, 10, 100],
            "gamma": [1, 0.1, 0.01, 0.001],
            "kernel": ["linear", "rbf", "poly"],
            "degree": [2, 3, 4, 5],
            "decision_function_shape": ["ovo", "ovr"]}

In [22]:
# Experiment with different values of the C parameter; 
# try the linear, rbf (with different choices of gamma) 
# and polynomial kernels (with different degrees);
# try both options for decision_function_shape

model_1 = SVC(random_state=42, kernel="linear", decision_function_shape="ovr")
model_2 = SVC(random_state=42, kernel="poly", degree=3, decision_function_shape="ovr")

# model_3 = SVC(random_state=42, kernel="rbf", gamma=0.1, decision_function_shape="ovr")
# # print 2 best hyperparameters
# print(grid_search.best_params_)
# print(grid_search.best_score_)



In [23]:
model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)

In [20]:
# Use classification_report and confusion_matrix to evaluate the performance of the model

In [21]:
# display_tree in utils.py to display each tree

## Conclusion

Discuss the last hyperparameters based on the training & validation metrics.

## Comparison

Provide a written comparison of the training & validation metrics for SVM & decision tree