In [1]:
# import statements at the top for readability
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('nhis_2022.csv')
data.head()

Unnamed: 0,YEAR,SERIAL,STRATA,PSU,NHISHID,REGION,PERNUM,NHISPID,HHX,SAMPWEIGHT,...,TOMSAUCEMNO,SODAPNO,FRIESPNO,SPORDRMNO,FRTDRINKMNO,COFETEAMNO,POTATONO,PIZZANO,HRSLEEP,CVDSHT
0,2022,1,143,16,0002022H000001,4,1,0002022H00000110,H000001,8018.0,...,2,0,110,3,0,0,3,2,8,1
1,2022,2,106,53,0002022H000003,3,1,0002022H00000310,H000003,10117.0,...,1,0,1,0,0,1,1,1,6,2
2,2022,2,106,53,0002022H000003,3,2,0002022H00000320,H000003,7933.0,...,996,996,996,996,996,996,996,996,0,2
3,2022,3,134,13,0002022H000006,2,1,0002022H00000610,H000006,2681.0,...,1,1,1,0,2,0,1,1,6,2
4,2022,4,106,53,0002022H000007,3,1,0002022H00000710,H000007,10233.0,...,3,30,5,1,0,30,6,2,8,2


In [3]:
data.columns

Index(['YEAR', 'SERIAL', 'STRATA', 'PSU', 'NHISHID', 'REGION', 'PERNUM',
       'NHISPID', 'HHX', 'SAMPWEIGHT', 'ASTATFLG', 'CSTATFLG', 'AGE', 'SEX',
       'MARSTCUR', 'EDUC', 'HOURSWRK', 'POVERTY', 'HEIGHT', 'WEIGHT',
       'BMICALC', 'HINOTCOVE', 'CANCEREV', 'CHEARTDIEV', 'DIABETICEV',
       'HEARTATTEV', 'STROKEV', 'ALCANYNO', 'ALCDAYSYR', 'CIGDAYMO',
       'MOD10DMIN', 'VIG10DMIN', 'FRUTNO', 'VEGENO', 'JUICEMNO', 'SALADSNO',
       'BEANNO', 'SALSAMNO', 'TOMSAUCEMNO', 'SODAPNO', 'FRIESPNO', 'SPORDRMNO',
       'FRTDRINKMNO', 'COFETEAMNO', 'POTATONO', 'PIZZANO', 'HRSLEEP',
       'CVDSHT'],
      dtype='object')

In [4]:
data.shape

(35115, 48)

## Data Preprocessing

In [5]:
# variables i want to use
# i want three models where
# target = cancer ever (CANCEREV)
# model 1: linear svm --> demo/health
# model 2: radial svm --> activity
# model 3: poly svm --> consumption
target = 'CANCEREV'
demo_health = ['AGE', 'SEX', 'BMICALC', 'EDUC', 'HINOTCOVE']
activity = ['MOD10DMIN', 'HRSLEEP', 'HOURSWRK']
consumption = ['FRUTNO', 'VEGENO', 'SODAPNO', 'COFETEAMNO', 'ALCANYNO', 'CIGDAYMO']
data = data[[target] + demo_health + activity + consumption]
data.columns

Index(['CANCEREV', 'AGE', 'SEX', 'BMICALC', 'EDUC', 'HINOTCOVE', 'MOD10DMIN',
       'HRSLEEP', 'HOURSWRK', 'FRUTNO', 'VEGENO', 'SODAPNO', 'COFETEAMNO',
       'ALCANYNO', 'CIGDAYMO'],
      dtype='object')

In [6]:
data.isna().sum()
# no missing values. interesting

CANCEREV      0
AGE           0
SEX           0
BMICALC       0
EDUC          0
HINOTCOVE     0
MOD10DMIN     0
HRSLEEP       0
HOURSWRK      0
FRUTNO        0
VEGENO        0
SODAPNO       0
COFETEAMNO    0
ALCANYNO      0
CIGDAYMO      0
dtype: int64

In [7]:
for col in data.columns:
    print(data[col].value_counts())

CANCEREV
1    24184
0     7464
2     3430
7       28
9        9
Name: count, dtype: int64
AGE
85     1002
64      559
67      557
68      555
66      551
       ... 
19      204
83      197
84      188
997      59
999       5
Name: count, Length: 88, dtype: int64
SEX
2    18653
1    16456
9        4
7        2
Name: count, dtype: int64
BMICALC
996.0    6833
25.8      712
26.6      690
25.1      613
24.4      481
         ... 
47.6        1
12.8        1
45.9        1
45.8        1
14.9        1
Name: count, Length: 366, dtype: int64
EDUC
0      7464
201    6351
400    6307
301    4112
501    3025
303    2561
103    1881
505    1080
302    1065
202     632
116     488
999     101
997      48
Name: count, dtype: int64
HINOTCOVE
1    32618
2     2388
9      109
Name: count, dtype: int64
MOD10DMIN
0      16159
60      5439
30      4871
45      1932
120     1399
       ...  
48         1
13         1
47         1
145        1
27         1
Name: count, Length: 70, dtype: int64
HRSLEEP
7     

In [8]:
# clean/filter the invalid values like 999, 998, etc.
data = data.replace([999, 998, 997, 996], np.nan)
data = data[data['CANCEREV'].isin([1, 2])]
data = data[data['HINOTCOVE'].isin([1, 2])]
data = data.dropna()
data.head()

Unnamed: 0,CANCEREV,AGE,SEX,BMICALC,EDUC,HINOTCOVE,MOD10DMIN,HRSLEEP,HOURSWRK,FRUTNO,VEGENO,SODAPNO,COFETEAMNO,ALCANYNO,CIGDAYMO
0,1,61.0,1,38.4,201.0,1,0.0,8,45,5.0,15.0,0.0,0.0,2.0,96
1,1,43.0,1,27.3,301.0,1,20.0,6,45,1.0,1.0,0.0,1.0,1.0,96
3,1,68.0,1,25.0,505.0,1,60.0,6,0,3.0,1.0,1.0,0.0,7.0,96
4,1,73.0,1,24.0,201.0,1,690.0,8,0,2.0,4.0,30.0,30.0,0.0,96
6,1,73.0,1,26.5,201.0,1,60.0,6,0,1.0,2.0,5.0,0.0,2.0,96


In [9]:
for col in data.columns:
    print(data[col].value_counts())

CANCEREV
1    18372
2     2752
Name: count, dtype: int64
AGE
85.0    659
68.0    450
64.0    450
67.0    437
66.0    431
       ... 
83.0    139
84.0    125
20.0    109
19.0     86
18.0     68
Name: count, Length: 68, dtype: int64
SEX
2    11087
1    10037
Name: count, dtype: int64
BMICALC
25.8    571
26.6    538
25.1    493
24.4    391
28.3    347
       ... 
46.5      1
45.4      1
16.2      1
47.7      1
44.5      1
Name: count, Length: 328, dtype: int64
EDUC
400.0    5187
201.0    4555
301.0    3165
501.0    2519
303.0    2043
103.0    1140
505.0     908
302.0     827
202.0     476
116.0     304
Name: count, dtype: int64
HINOTCOVE
1    19697
2     1427
Name: count, dtype: int64
MOD10DMIN
0.0      5736
60.0     4500
30.0     3870
45.0     1655
120.0    1147
         ... 
200.0       1
36.0        1
11.0        1
62.0        1
27.0        1
Name: count, Length: 64, dtype: int64
HRSLEEP
7     6793
8     6235
6     4345
5     1300
9     1064
10     487
4      465
12     141
3       86


In [10]:
groups = {
    'Demo/Health': (data[demo_health], data[target]),
    'Activity': (data[activity], data[target]),
    'Consumption': (data[consumption], data[target])
}

## Modeling - Linear SVM

In [11]:
param_linear = {'C': [0.001, 0.01, 0.1, 1, 10]}
linear = SVC(kernel = 'linear', class_weight = 'balanced', random_state = 5322)

for name, (X, y) in groups.items():
    print(f'\nRunning Linear SVM for {name} variables...')
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    x_train, x_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state = 5322)
    
    grid_linear = GridSearchCV(
        estimator = linear,
        param_grid = param_linear,
        scoring = 'f1_weighted',
        cv = 3,
        n_jobs = -1,
        verbose = 1 
        # change verbose to 3 to see [CV 3/3] END ............................C=0.01;, score=0.700 total time=  11.5s
        # to check whether the grid search params are reasonable and changing the scores
    )
    
    grid_linear.fit(x_train, y_train)
    
    print(f"Best Params for {name}: {grid_linear.best_params_}")
    print(f"Best CV Score for {name}: {grid_linear.best_score_}")
    print(classification_report(y_test, grid_linear.predict(x_test), zero_division = 0))
    print('='*50)


Running Linear SVM for Demo/Health variables...
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Params for Demo/Health: {'C': 1}
Best CV Score for Demo/Health: 0.7041431680078948
              precision    recall  f1-score   support

           1       0.96      0.63      0.76      3680
           2       0.25      0.82      0.38       545

    accuracy                           0.65      4225
   macro avg       0.60      0.73      0.57      4225
weighted avg       0.87      0.65      0.71      4225


Running Linear SVM for Activity variables...
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Params for Activity: {'C': 10}
Best CV Score for Activity: 0.6740188866958728
              precision    recall  f1-score   support

           1       0.93      0.59      0.72      3680
           2       0.20      0.72      0.32       545

    accuracy                           0.60      4225
   macro avg       0.57      0.65      0.52      4225
weighted avg   

Some things to note:

Precision is high for non-cancer individuals, but recall is low\
Precision is low for cancer individuals, but recall is high (lots of false positives)\
Clear class imbalance: a lot more non-cancer individuals in our dataset versus cancer individuals\

Accuracy is not that good either. Best was 70% train, 65% test\
Demographic and Activity variables seem to be decent predictors of cancer, but consumption is not (no linear patterns I guess).

In [12]:
# for plots

## Modeling - Radial

In [13]:
param_radial = {
    'C': [0.01, 0.1, 1, 10],
    'gamma': [0.01, 1, 10, 100, 500, 1000]
}

radial = SVC(kernel = 'rbf', class_weight = 'balanced', random_state = 5322)

for name, (X, y) in groups.items():
    print(f'\nRunning RBF SVM for {name} variables...')
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    x_train, x_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state = 5322)
    
    grid_radial = GridSearchCV(
        estimator = radial,
        param_grid = param_radial,
        scoring = 'f1_weighted',
        cv = 3,
        n_jobs = -1,
        verbose = 1
        # change verbose to 3 to see [CV 3/3] END ............................C=0.01;, score=0.700 total time=  11.5s
        # to check whether the grid search params are reasonable and changing the scores
    )
    
    grid_radial.fit(x_train, y_train)
    
    print(f"Best Params for {name}: {grid_radial.best_params_}")
    print(f"Best CV Score for {name}: {grid_radial.best_score_}")
    print(classification_report(y_test, grid_radial.predict(x_test), zero_division = 0))
    print('='*50)


Running RBF SVM for Demo/Health variables...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Params for Demo/Health: {'C': 0.1, 'gamma': 100}
Best CV Score for Demo/Health: 0.82061161487657
              precision    recall  f1-score   support

           1       0.89      0.91      0.90      3680
           2       0.30      0.25      0.27       545

    accuracy                           0.83      4225
   macro avg       0.59      0.58      0.59      4225
weighted avg       0.81      0.83      0.82      4225


Running RBF SVM for Activity variables...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Params for Activity: {'C': 0.01, 'gamma': 1000}
Best CV Score for Activity: 0.7686026768521925
              precision    recall  f1-score   support

           1       0.91      0.76      0.83      3680
           2       0.22      0.47      0.30       545

    accuracy                           0.72      4225
   macro avg       0.56      0.61      0.5

Some things to note:

Overall, radial model did much better than linear.\
Precision is again high for non-cancer individuals, and recall was a bit higher, but still a little low (not a lot of false positives)\
Precision is still low for cancer individuals, but recall is high (still a lot of false positives)\
Again clear class imbalance: a lot more non-cancer individuals in our dataset versus cancer individuals\

Accuracy is better than linear. Best was 82% train, 83% test.\
Radial model did siginificantly better than linear model regarding consumption variables (maybe there is a decent non linear predictor(s) of cancer here).
Demographic variables seem to be more predictive of cancer than activity or consumption, but they are close.

In [14]:
# for plots

## Modeling - Polynomial SVM

In [17]:
param_poly = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1],
    'gamma': [0.0001, 0.001, 0.01, 0.1, 1],
    'coef0': [0, 1],
    'degree': [2]
}

poly = SVC(kernel = 'poly', class_weight = 'balanced', random_state = 5322)

for name, (X, y) in groups.items():
    print(f'\nRunning Polynomial SVM for {name} variables...')
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    x_train, x_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state = 5322)
    
    grid_poly = GridSearchCV(
        estimator = poly,
        param_grid = param_poly,
        scoring = 'f1_weighted',
        cv = 3,
        n_jobs = -1,
        verbose = 1
        # change verbose to 3 to see [CV 3/3] END ............................C=0.01;, score=0.700 total time=  11.5s
        # to check whether the grid search params are reasonable and changing the scores
    )
    
    grid_poly.fit(x_train, y_train)
    
    print(f"Best Params for {name}: {grid_poly.best_params_}")
    print(f"Best CV Score for {name}: {grid_poly.best_score_}")
    print(classification_report(y_test, grid_poly.predict(x_test), zero_division = 0))
    print('='*50)


Running Polynomial SVM for Demo/Health variables...
Fitting 3 folds for each of 50 candidates, totalling 150 fits


Best Params for Demo/Health: {'C': 0.001, 'coef0': 0, 'degree': 2, 'gamma': 1}
Best CV Score for Demo/Health: 0.7898377746221704
              precision    recall  f1-score   support

           1       0.92      0.80      0.85      3680
           2       0.28      0.52      0.36       545

    accuracy                           0.76      4225
   macro avg       0.60      0.66      0.61      4225
weighted avg       0.84      0.76      0.79      4225


Running Polynomial SVM for Activity variables...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Params for Activity: {'C': 0.0001, 'coef0': 0, 'degree': 2, 'gamma': 0.1}
Best CV Score for Activity: 0.8090045402425227
              precision    recall  f1-score   support

           1       0.87      1.00      0.93      3680
           2       0.14      0.00      0.01       545

    accuracy                           0.87      4225
   macro avg       0.51      0.50      0.47      4225
weighted avg       0.78      0.87 