In [204]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Reading datasets

In [206]:
df = pd.read_csv('training_set_features.csv')
df.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [208]:
y = pd.read_csv('training_set_labels.csv')
y.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


In [210]:
y.isna().sum()

respondent_id       0
xyz_vaccine         0
seasonal_vaccine    0
dtype: int64

In [212]:
# Check total number of null values in each column
df.isna().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

# Data Analysis for xyz vaccine

In [214]:
# Use relevant columns
cols_for_xyz = ['xyz_concern', 'xyz_knowledge', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings',
               'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_xyz', 'chronic_med_condition', 'child_under_6_months', 
               'health_worker', 'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk', 'opinion_xyz_sick_from_vacc', 
               'age_group', 'income_poverty', 'hhs_geo_region', 'census_msa', 'household_adults', 'household_children']
x_xyz = df[cols_for_xyz]
x_xyz.isna().sum()

xyz_concern                       92
xyz_knowledge                    116
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
age_group                          0
income_poverty                  4423
hhs_geo_region                     0
census_msa                         0
household_adults                 249
household_children               249
dtype: int64

In [216]:
x_xyz.shape

(26707, 22)

In [218]:
# Replace null values of certain columns with 0
cols_to_use = ['behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 
               'behavioral_avoidance', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'doctor_recc_xyz', 
                'household_adults', 'household_children', 'xyz_concern', 'xyz_knowledge']
x_xyz[cols_to_use] = x_xyz[cols_to_use].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_xyz[cols_to_use] = x_xyz[cols_to_use].fillna(0)


In [220]:
# Drop certain columns because most of the values are null
x_xyz.drop(['health_insurance', 'income_poverty'], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_xyz.drop(['health_insurance', 'income_poverty'], axis = 1, inplace = True)


In [222]:
x_xyz.isna().sum()

xyz_concern                      0
xyz_knowledge                    0
behavioral_avoidance             0
behavioral_face_mask             0
behavioral_wash_hands            0
behavioral_large_gatherings      0
behavioral_outside_home          0
behavioral_touch_face            0
doctor_recc_xyz                  0
chronic_med_condition            0
child_under_6_months             0
health_worker                    0
opinion_xyz_vacc_effective     391
opinion_xyz_risk               388
opinion_xyz_sick_from_vacc     395
age_group                        0
hhs_geo_region                   0
census_msa                       0
household_adults                 0
household_children               0
dtype: int64

In [224]:
x_xyz.dtypes

xyz_concern                    float64
xyz_knowledge                  float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_xyz                float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
opinion_xyz_vacc_effective     float64
opinion_xyz_risk               float64
opinion_xyz_sick_from_vacc     float64
age_group                       object
hhs_geo_region                  object
census_msa                      object
household_adults               float64
household_children             float64
dtype: object

In [226]:
# Dealing with categorical variables using ONE HOT CODING
x_xyz = pd.get_dummies(x_xyz, drop_first = True)
x_xyz.isna().sum()

xyz_concern                         0
xyz_knowledge                       0
behavioral_avoidance                0
behavioral_face_mask                0
behavioral_wash_hands               0
behavioral_large_gatherings         0
behavioral_outside_home             0
behavioral_touch_face               0
doctor_recc_xyz                     0
chronic_med_condition               0
child_under_6_months                0
health_worker                       0
opinion_xyz_vacc_effective        391
opinion_xyz_risk                  388
opinion_xyz_sick_from_vacc        395
household_adults                    0
household_children                  0
age_group_35 - 44 Years             0
age_group_45 - 54 Years             0
age_group_55 - 64 Years             0
age_group_65+ Years                 0
hhs_geo_region_bhuqouqj             0
hhs_geo_region_dqpwygqj             0
hhs_geo_region_fpwskwrf             0
hhs_geo_region_kbazzjca             0
hhs_geo_region_lrircsnp             0
hhs_geo_regi

In [228]:
# Replacing null values of certain columns with their mean values
x_xyz['opinion_xyz_vacc_effective'] = x_xyz.opinion_xyz_vacc_effective.fillna(x_xyz.opinion_xyz_vacc_effective.mean())
x_xyz['opinion_xyz_risk'] = x_xyz.opinion_xyz_risk.fillna(x_xyz.opinion_xyz_risk.mean())
x_xyz['opinion_xyz_sick_from_vacc'] = x_xyz.opinion_xyz_sick_from_vacc.fillna(x_xyz.opinion_xyz_sick_from_vacc.mean())

In [230]:
x_xyz.isna().sum()

xyz_concern                       0
xyz_knowledge                     0
behavioral_avoidance              0
behavioral_face_mask              0
behavioral_wash_hands             0
behavioral_large_gatherings       0
behavioral_outside_home           0
behavioral_touch_face             0
doctor_recc_xyz                   0
chronic_med_condition             0
child_under_6_months              0
health_worker                     0
opinion_xyz_vacc_effective        0
opinion_xyz_risk                  0
opinion_xyz_sick_from_vacc        0
household_adults                  0
household_children                0
age_group_35 - 44 Years           0
age_group_45 - 54 Years           0
age_group_55 - 64 Years           0
age_group_65+ Years               0
hhs_geo_region_bhuqouqj           0
hhs_geo_region_dqpwygqj           0
hhs_geo_region_fpwskwrf           0
hhs_geo_region_kbazzjca           0
hhs_geo_region_lrircsnp           0
hhs_geo_region_lzgpxyit           0
hhs_geo_region_mlyzmhmf     

# Logistic Regression for xyz vaccine

In [232]:
train_x_1, test_x_1, train_y_1, test_y_1 = train_test_split(x_xyz, y['xyz_vaccine'], test_size = 0.33, random_state = 42)

In [234]:
logicreg = LogisticRegression()
logicreg.fit(train_x_1, train_y_1)

In [236]:
logicreg.score(test_x_1,test_y_1)

0.8343544361243477

In [238]:
xyz_vaccine = logicreg.predict_proba(x_xyz)[:, 1] 

# Data Analysis for seasonal vaccine

In [167]:
cols_for_seas = ['behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings',
               'behavioral_outside_home', 'behavioral_touch_face', 'chronic_med_condition', 'child_under_6_months', 
               'health_worker', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 
               'age_group', 'hhs_geo_region', 'census_msa', 'household_adults', 'household_children']
x_seas = df[cols_for_seas]
x_seas.isna().sum()

behavioral_avoidance           208
behavioral_face_mask            19
behavioral_wash_hands           42
behavioral_large_gatherings     87
behavioral_outside_home         82
behavioral_touch_face          128
chronic_med_condition          971
child_under_6_months           820
health_worker                  804
opinion_seas_vacc_effective    462
opinion_seas_risk              514
opinion_seas_sick_from_vacc    537
age_group                        0
hhs_geo_region                   0
census_msa                       0
household_adults               249
household_children             249
dtype: int64

In [169]:
cols_to_use_2 = ['behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 
               'behavioral_avoidance', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 
                'household_adults', 'household_children']
x_seas[cols_to_use_2] = x_seas[cols_to_use_2].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_seas[cols_to_use_2] = x_seas[cols_to_use_2].fillna(0)


In [171]:
x_seas = pd.get_dummies(x_seas, drop_first = True)

In [173]:
x_seas.isna().sum()

behavioral_avoidance                0
behavioral_face_mask                0
behavioral_wash_hands               0
behavioral_large_gatherings         0
behavioral_outside_home             0
behavioral_touch_face               0
chronic_med_condition               0
child_under_6_months                0
health_worker                       0
opinion_seas_vacc_effective       462
opinion_seas_risk                 514
opinion_seas_sick_from_vacc       537
household_adults                    0
household_children                  0
age_group_35 - 44 Years             0
age_group_45 - 54 Years             0
age_group_55 - 64 Years             0
age_group_65+ Years                 0
hhs_geo_region_bhuqouqj             0
hhs_geo_region_dqpwygqj             0
hhs_geo_region_fpwskwrf             0
hhs_geo_region_kbazzjca             0
hhs_geo_region_lrircsnp             0
hhs_geo_region_lzgpxyit             0
hhs_geo_region_mlyzmhmf             0
hhs_geo_region_oxchjgsf             0
hhs_geo_regi

In [175]:
x_seas['opinion_seas_vacc_effective'] = x_seas.opinion_seas_vacc_effective.fillna(x_seas.opinion_seas_vacc_effective.mean())
x_seas['opinion_seas_risk'] = x_seas.opinion_seas_risk.fillna(x_seas.opinion_seas_risk.mean())
x_seas['opinion_seas_sick_from_vacc'] = x_seas.opinion_seas_sick_from_vacc.fillna(x_seas.opinion_seas_sick_from_vacc.mean())

In [177]:
x_seas.isna().sum()

behavioral_avoidance              0
behavioral_face_mask              0
behavioral_wash_hands             0
behavioral_large_gatherings       0
behavioral_outside_home           0
behavioral_touch_face             0
chronic_med_condition             0
child_under_6_months              0
health_worker                     0
opinion_seas_vacc_effective       0
opinion_seas_risk                 0
opinion_seas_sick_from_vacc       0
household_adults                  0
household_children                0
age_group_35 - 44 Years           0
age_group_45 - 54 Years           0
age_group_55 - 64 Years           0
age_group_65+ Years               0
hhs_geo_region_bhuqouqj           0
hhs_geo_region_dqpwygqj           0
hhs_geo_region_fpwskwrf           0
hhs_geo_region_kbazzjca           0
hhs_geo_region_lrircsnp           0
hhs_geo_region_lzgpxyit           0
hhs_geo_region_mlyzmhmf           0
hhs_geo_region_oxchjgsf           0
hhs_geo_region_qufhixun           0
census_msa_MSA, Principle Ci

# Logistic Regression for seasonal vaccine

In [180]:
train_x_2, test_x_2, train_y_2, test_y_2 = train_test_split(x_seas, y['seasonal_vaccine'], test_size = 0.33, random_state = 42)

In [182]:
logicreg.fit(train_x_2, train_y_2)

In [188]:
logicreg.score(test_x_2, test_y_2)

0.7612888586339914

In [190]:
seasonal_vaccine = logicreg.predict_proba(x_seas)[:,1]

# Create submission DataFrame

In [197]:
submit = pd.DataFrame({'respondent_id': df['respondent_id'],
                           'xyz_vaccine': xyz_vaccine,
                           'seasonal_vaccine': seasonal_vaccine})

In [199]:
submit.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0.03307,0.061804
1,1,0.287266,0.177134
2,2,0.036582,0.127578
3,3,0.090702,0.902824
4,4,0.059482,0.080855


In [201]:
submit.to_csv('submit.csv', index = False)