In [None]:
# import statements at the top for readability
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

In [None]:
# Read the survey extract from the CSV file next to the notebook and preview the first rows
csv_path = 'nhis_2022.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Show the column labels of the loaded frame
data.columns

In [None]:
# Show the (rows, columns) size of the loaded frame
data.shape

## Data Preprocessing

In [None]:
# Modeling plan: one target (CANCEREV, "cancer ever") and three SVMs,
# each fed by its own group of predictors:
#   model 1 - linear kernel     -> demographic / health columns
#   model 2 - radial kernel     -> activity / alcohol / smoking columns
#   model 3 - polynomial kernel -> food columns
target = 'CANCEREV'

demo_health = [
    'AGE', 'SEX', 'BMICALC', 'EDUC', 'HINOTCOVE',
]
activity = [
    'ALCANYNO', 'CIGDAYMO', 'MOD10DMIN', 'VIG10DMIN', 'HRSLEEP', 'HOURSWRK',
]
food = [
    'FRUTNO', 'VEGENO', 'JUICEMNO', 'SALADSNO', 'BEANNO',
    'SALSAMNO', 'TOMSAUCEMNO', 'SODAPNO', 'FRIESPNO', 'SPORDRMNO',
    'FRTDRINKMNO', 'COFETEAMNO', 'POTATONO', 'PIZZANO',
]

# Keep only the target plus the three predictor groups, in that order
selected_columns = [target, *demo_health, *activity, *food]
data = data[selected_columns]
data.columns

In [None]:
# Count NaN entries per column
data.isna().sum()
# No NaN values are reported here; invalid answers appear to be stored as numeric
# codes instead (the cleaning cell below recodes 996-999 to NaN)

In [None]:
# Print the frequency table of every column to spot unusual codes
for _, column_values in data.items():
    print(column_values.value_counts())

In [None]:
# Recode the invalid-value codes to NaN, keep only rows whose CANCEREV, SEX and
# HINOTCOVE are 1 or 2, then drop every row that still has a NaN.
# NOTE(review): only these four codes are recoded, and they are recoded in every
# column - confirm against the codebook that no column uses other sentinel values
# or a legitimate value in the 996-999 range.
invalid_codes = [999, 998, 997, 996]
one_or_two_columns = ['CANCEREV', 'SEX', 'HINOTCOVE']

data = data.replace(invalid_codes, np.nan)
rows_to_keep = data[one_or_two_columns].isin([1, 2]).all(axis = 1)
data = data[rows_to_keep].dropna()
data.head()

In [None]:
# Re-print the per-column frequency tables to check the result of the cleaning step
for column_name in data:
    column_counts = data[column_name].value_counts()
    print(column_counts)

## Modeling - Linear SVM

In [None]:
# Predictors for model 1 (demographic / health columns) and the shared target
X_linear = data[demo_health]
y_linear = data[target]

# Split BEFORE scaling: fitting the scaler on all rows would let the test rows
# influence the mean/std used to transform the training data (data leakage).
# The split is stratified so both parts keep the same class proportions.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    X_linear,
    y_linear,
    test_size = 0.2,
    random_state = 5322,
    stratify = y_linear
)

# Learn the scaling statistics from the training rows only, then reuse them
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Kept so any later cell that reads this name still works; note it is now
# scaled with the training-set statistics
X_linear_scaled = scaler.transform(X_linear)

In [None]:
# Grid-search the regularisation strength C of a linear-kernel SVM with 5-fold
# cross-validation, scored by weighted F1; class weights are balanced
param_grid = dict(C = [0.1, 1, 10])
model = SVC(kernel = 'linear', class_weight = 'balanced', random_state = 5322)

linear_grid = GridSearchCV(model, param_grid, scoring = 'f1_weighted', cv = 5, n_jobs = -1, verbose = 1)
linear_grid.fit(x_train, y_train)

# Best setting and its mean cross-validated score
print(linear_grid.best_params_)
print(linear_grid.best_score_)

# Per-class precision / recall / F1 on the held-out test rows
linear_test_predictions = linear_grid.predict(x_test)
print(classification_report(y_test, linear_test_predictions, zero_division = 0))

Some things to note:

Precision is high for non-cancer individuals, but recall is low\
Precision is low for cancer individuals, but recall is high\
Clear class imbalance: a lot more non-cancer individuals in our dataset versus cancer individuals

Accuracy is not that good either. 70% train, 64% test.

## Modeling - Radial

In [None]:
# Model 2 is meant to use the activity / alcohol / smoking columns, so build a
# dedicated split from them instead of reusing the linear model's x_train/x_test
# (which hold the demographic / health columns).
X_radial = data[activity]
y_radial = data[target]

# Same test share and seed as the linear split; stratified to keep class proportions
x_train_radial_raw, x_test_radial_raw, y_train_radial, y_test_radial = train_test_split(
    X_radial,
    y_radial,
    test_size = 0.2,
    random_state = 5322,
    stratify = y_radial
)

# Fit the scaler on the training rows only so the test rows do not leak into it
radial_scaler = StandardScaler()
x_train_radial = radial_scaler.fit_transform(x_train_radial_raw)
x_test_radial = radial_scaler.transform(x_test_radial_raw)

# Search over the regularisation strength C and the RBF kernel width gamma
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'gamma': [0.1, 1, 10]
}

model = SVC(kernel = 'rbf', class_weight = 'balanced', random_state = 5322)

radial_grid = GridSearchCV(
    estimator = model,
    param_grid = param_grid,
    scoring = 'f1_weighted',
    cv = 5,
    n_jobs = -1,
    verbose = 1
)

radial_grid.fit(x_train_radial, y_train_radial)

# Best setting, its mean cross-validated score, and the held-out test report
print(radial_grid.best_params_)
print(radial_grid.best_score_)
print(classification_report(y_test_radial, radial_grid.predict(x_test_radial), zero_division = 0))

Some things to note:

The radial model did better than the linear one\
We did use different variables, so maybe activity, alcohol, and/or smoking have more of an influence on cancer (not surprising/pretty intuitive)\
Precision is again high for non-cancer individuals, and recall was a bit higher, but still a little low\
Precision is still low for cancer individuals, but recall is high\
Clear class imbalance: a lot more non-cancer individuals in our dataset versus cancer individuals

Accuracy is ~ 5% better than linear. 75% train, 70% test.

## Modeling - Polynomial SVM

In [None]:
# Model 3 is meant to use the food columns, so build a dedicated split from them
# instead of reusing the linear model's x_train/x_test.
X_poly = data[food]
y_poly = data[target]

# Same test share and seed as the other models; stratified to keep class proportions
x_train_poly_raw, x_test_poly_raw, y_train_poly, y_test_poly = train_test_split(
    X_poly,
    y_poly,
    test_size = 0.2,
    random_state = 5322,
    stratify = y_poly
)

# Fit the scaler on the training rows only so the test rows do not leak into it
poly_scaler = StandardScaler()
x_train_poly = poly_scaler.fit_transform(x_train_poly_raw)
x_test_poly = poly_scaler.transform(x_test_poly_raw)

# gamma is deliberately NOT searched: with coef0 = 0 (the SVC default) the
# polynomial kernel is (gamma * <x, x'>) ** degree, so gamma only rescales the
# kernel by gamma ** degree, which has the same effect as rescaling C. Searching
# both tripled the number of fits and included very stiff combinations
# (e.g. gamma = 10, degree = 3) that are slow to solve. gamma stays at
# the SVC default ('scale').
param_grid = {
    'C': [0.1, 1, 10],
    'degree': [2, 3]
}

model = SVC(kernel = 'poly', class_weight = 'balanced', random_state = 5322)

poly_grid = GridSearchCV(
    estimator = model,
    param_grid = param_grid,
    scoring = 'f1_weighted',
    cv = 5,
    n_jobs = -1,
    verbose = 1
)

poly_grid.fit(x_train_poly, y_train_poly)

# Best setting, its mean cross-validated score, and the held-out test report
print(poly_grid.best_params_)
print(poly_grid.best_score_)
print(classification_report(y_test_poly, poly_grid.predict(x_test_poly), zero_division = 0))

The polynomial model was taking over 40 minutes to fit, so I stopped it. The search needs to be changed somehow (for example a smaller parameter grid) before it can be run to completion.