In [48]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


In [49]:
df = pd.read_csv('/content/training_set_features.csv')
Y = pd.read_csv('/content/training_set_labels.csv')
Y.drop('respondent_id', axis=1, inplace=True)
y_xyz = Y['xyz_vaccine']
y_seas = Y['seasonal_vaccine']

In [50]:
df.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [51]:
from sklearn.preprocessing import LabelEncoder
def encode_missing_columns(df, col):
    le = LabelEncoder()

    unique_without_nan = pd.Series([i for i in df[col].unique() if type(i) == str])
    le.fit(unique_without_nan)

    df[col] = df[col].apply(lambda x: le.transform([x])[0] if type(x) == str else x)

In [52]:
txt_cat_col = ['education', 'marital_status', 'income_poverty', 'employment_status', 'rent_or_own']
for col in txt_cat_col:
    encode_missing_columns(df, col)

In [53]:
df = df.drop(['employment_industry', 'employment_occupation'], axis=1)

In [54]:
df.shape

(26707, 34)

In [55]:
df = df.drop(['respondent_id'], axis=1)

In [56]:
text_cols = df.select_dtypes(include=['object']).columns
text_cols

Index(['age_group', 'race', 'sex', 'hhs_geo_region', 'census_msa'], dtype='object')

In [57]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
num_cols

Index(['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'education',
       'income_poverty', 'marital_status', 'rent_or_own', 'employment_status',
       'household_adults', 'household_children'],
      dtype='object')

In [58]:
imputer = SimpleImputer(strategy='most_frequent')
df[num_cols] = imputer.fit_transform(df[num_cols])

In [59]:
for col in df.columns:
    print(col, df[col].unique())

xyz_concern [1. 3. 2. 0.]
xyz_knowledge [0. 2. 1.]
behavioral_antiviral_meds [0. 1.]
behavioral_avoidance [0. 1.]
behavioral_face_mask [0. 1.]
behavioral_wash_hands [0. 1.]
behavioral_large_gatherings [0. 1.]
behavioral_outside_home [1. 0.]
behavioral_touch_face [1. 0.]
doctor_recc_xyz [0. 1.]
doctor_recc_seasonal [0. 1.]
chronic_med_condition [0. 1.]
child_under_6_months [0. 1.]
health_worker [0. 1.]
health_insurance [1. 0.]
opinion_xyz_vacc_effective [3. 5. 4. 2. 1.]
opinion_xyz_risk [1. 4. 3. 2. 5.]
opinion_xyz_sick_from_vacc [2. 4. 1. 5. 3.]
opinion_seas_vacc_effective [2. 4. 5. 3. 1.]
opinion_seas_risk [1. 2. 4. 3. 5.]
opinion_seas_sick_from_vacc [2. 4. 1. 5. 3.]
age_group ['55 - 64 Years' '35 - 44 Years' '18 - 34 Years' '65+ Years'
 '45 - 54 Years']
education [1. 0. 2. 3.]
race ['White' 'Black' 'Other or Multiple' 'Hispanic']
sex ['Female' 'Male']
income_poverty [2. 0. 1.]
marital_status [1. 0.]
rent_or_own [0. 1.]
employment_status [1. 0. 2.]
hhs_geo_region ['oxchjgsf' 'bhuqouqj

In [60]:
df=pd.get_dummies(df, text_cols)

In [61]:
def convert_bool_to_binary(df):
  for col in df.select_dtypes(include='bool').columns:
    df[col] = df[col].astype(float)
  return df

df = convert_bool_to_binary(df)


In [62]:
df.dtypes

xyz_concern                            float64
xyz_knowledge                          float64
behavioral_antiviral_meds              float64
behavioral_avoidance                   float64
behavioral_face_mask                   float64
behavioral_wash_hands                  float64
behavioral_large_gatherings            float64
behavioral_outside_home                float64
behavioral_touch_face                  float64
doctor_recc_xyz                        float64
doctor_recc_seasonal                   float64
chronic_med_condition                  float64
child_under_6_months                   float64
health_worker                          float64
health_insurance                       float64
opinion_xyz_vacc_effective             float64
opinion_xyz_risk                       float64
opinion_xyz_sick_from_vacc             float64
opinion_seas_vacc_effective            float64
opinion_seas_risk                      float64
opinion_seas_sick_from_vacc            float64
education    

In [63]:
df.isna().sum()

xyz_concern                            0
xyz_knowledge                          0
behavioral_antiviral_meds              0
behavioral_avoidance                   0
behavioral_face_mask                   0
behavioral_wash_hands                  0
behavioral_large_gatherings            0
behavioral_outside_home                0
behavioral_touch_face                  0
doctor_recc_xyz                        0
doctor_recc_seasonal                   0
chronic_med_condition                  0
child_under_6_months                   0
health_worker                          0
health_insurance                       0
opinion_xyz_vacc_effective             0
opinion_xyz_risk                       0
opinion_xyz_sick_from_vacc             0
opinion_seas_vacc_effective            0
opinion_seas_risk                      0
opinion_seas_sick_from_vacc            0
education                              0
income_poverty                         0
marital_status                         0
rent_or_own     

In [64]:
df.shape

(26707, 52)

In [65]:
Y = pd.read_csv('/content/training_set_labels.csv')
Y.drop('respondent_id', axis=1, inplace=True)
y_xyz = Y['xyz_vaccine']
y_seas = Y['seasonal_vaccine']

In [66]:
X_train, X_test, y_train_xyz, y_test_xyz = train_test_split(df, y_xyz, test_size = 0.3)
_, _, y_train_seas, y_test_seas = train_test_split(df, y_seas, test_size = 0.3)

In [67]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [68]:
svm_xyz = SVC(probability=True)
svm_xyz.fit(X_train_scaled, y_train_xyz)

In [69]:
y_pred_xyz = svm_xyz.predict(X_test_scaled)

print(roc_auc_score(y_test_xyz, y_pred_xyz))

print(classification_report(y_test_xyz, y_pred_xyz))

0.6904391376089629
              precision    recall  f1-score   support

           0       0.86      0.95      0.90      6276
           1       0.71      0.43      0.54      1737

    accuracy                           0.84      8013
   macro avg       0.78      0.69      0.72      8013
weighted avg       0.83      0.84      0.82      8013



In [70]:
svm_seas = SVC(probability=True)
svm_seas.fit(X_train_scaled, y_train_seas)

In [71]:
y_pred_seas = svm_seas.predict(X_test_scaled)

print(roc_auc_score(y_test_seas, y_pred_seas))

print(classification_report(y_test_seas, y_pred_seas))

0.5031290955808503
              precision    recall  f1-score   support

           0       0.53      0.76      0.62      4201
           1       0.48      0.24      0.32      3812

    accuracy                           0.52      8013
   macro avg       0.50      0.50      0.47      8013
weighted avg       0.51      0.52      0.48      8013



In [72]:
X_testset = pd.read_csv('/content/test_set_features.csv')

In [73]:
txt_cat_col = ['education', 'marital_status', 'income_poverty', 'employment_status', 'rent_or_own']
for col in txt_cat_col:
    encode_missing_columns(X_testset, col)

In [74]:
X_testset = X_testset.drop(['employment_industry', 'employment_occupation'], axis=1)

In [75]:
Id = X_testset['respondent_id']
X_testset = X_testset.drop(['respondent_id'], axis=1)

In [76]:
text_cols = X_testset.select_dtypes(include=['object']).columns
num_cols = X_testset.select_dtypes(include=['int64', 'float64']).columns


In [77]:
imputer = SimpleImputer(strategy='most_frequent')
X_testset[num_cols] = imputer.fit_transform(X_testset[num_cols])

In [78]:
X_testset=pd.get_dummies(X_testset, text_cols)

In [79]:
def convert_bool_to_binary(X_testset):
  for col in X_testset.select_dtypes(include='bool').columns:
    X_testset[col] = X_testset[col].astype(float)
  return X_testset

X_testset = convert_bool_to_binary(X_testset)


In [80]:
X_testset_scaled = scaler.transform(X_testset)

In [81]:
y_pred_prob_xyz = svm_xyz.predict_proba(X_testset_scaled)[:, 1]
y_pred_prob_xyz

array([0.11813804, 0.11347391, 0.28859645, ..., 0.16242124, 0.05224217,
       0.47285147])

In [82]:
y_pred_prob_seas = svm_seas.predict_proba(X_testset_scaled)[:, 1]
y_pred_prob_seas

array([0.46869831, 0.45844204, 0.46321442, ..., 0.45568192, 0.46122158,
       0.45741454])

In [83]:
result = pd.DataFrame({
    'respondent_id': Id,
    'h1n1_vaccine': y_pred_prob_xyz,
    'seasonal_vaccine': y_pred_prob_seas
})

In [84]:
result

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.118138,0.468698
1,26708,0.113474,0.458442
2,26709,0.288596,0.463214
3,26710,0.715865,0.463339
4,26711,0.254810,0.463836
...,...,...,...
26703,53410,0.382815,0.457776
26704,53411,0.153417,0.469962
26705,53412,0.162421,0.455682
26706,53413,0.052242,0.461222


In [85]:
result.to_csv('submission.csv', index=False)

In [86]:
from google.colab import files
files.download('/content/submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>