# Hackathon

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df=pd.read_csv('./Datasets/training_set_features.csv')
train_output_df=pd.read_csv('./Datasets/training_set_labels.csv')
test_df=pd.read_csv('./Datasets/test_set_features.csv')
submission_format=pd.read_csv('./Datasets/submission_format.csv')

# Understanding Data and Cleaning Data, Standardization Train data

In [3]:
train_df.head()
#Print Numbers of Rows and Columns in the Train Dataset
print("Rows,Columns:",train_df.shape)

#Finding the NUll Columns for Data Cleaning
print(train_df.isnull().sum())

Rows,Columns: (26707, 36)
respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_pover

In [4]:
#Prints Datatype of each column
train_df.dtypes

respondent_id                    int64
xyz_concern                    float64
xyz_knowledge                  float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_xyz                float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_xyz_vacc_effective     float64
opinion_xyz_risk               float64
opinion_xyz_sick_from_vacc     float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                      

In [5]:
#From Data and Null Values We decided to remove employment_industry and employment_education
columns_to_drop = ['employment_industry', 'employment_occupation']
train_df.drop(columns=columns_to_drop, inplace=True)
test_df.drop(columns=columns_to_drop,inplace=True)

In [6]:
#Columns of Dataframe
remaining_col=train_df.columns
for i in remaining_col:
    print(i,":",train_df[i].unique())
print(train_df.shape)

respondent_id : [    0     1     2 ... 26704 26705 26706]
xyz_concern : [ 1.  3.  2.  0. nan]
xyz_knowledge : [ 0.  2.  1. nan]
behavioral_antiviral_meds : [ 0.  1. nan]
behavioral_avoidance : [ 0.  1. nan]
behavioral_face_mask : [ 0.  1. nan]
behavioral_wash_hands : [ 0.  1. nan]
behavioral_large_gatherings : [ 0.  1. nan]
behavioral_outside_home : [ 1.  0. nan]
behavioral_touch_face : [ 1.  0. nan]
doctor_recc_xyz : [ 0. nan  1.]
doctor_recc_seasonal : [ 0. nan  1.]
chronic_med_condition : [ 0.  1. nan]
child_under_6_months : [ 0.  1. nan]
health_worker : [ 0.  1. nan]
health_insurance : [ 1. nan  0.]
opinion_xyz_vacc_effective : [ 3.  5.  4.  2.  1. nan]
opinion_xyz_risk : [ 1.  4.  3.  2.  5. nan]
opinion_xyz_sick_from_vacc : [ 2.  4.  1.  5.  3. nan]
opinion_seas_vacc_effective : [ 2.  4.  5.  3.  1. nan]
opinion_seas_risk : [ 1.  2.  4.  3.  5. nan]
opinion_seas_sick_from_vacc : [ 2.  4.  1.  5. nan  3.]
age_group : ['55 - 64 Years' '35 - 44 Years' '18 - 34 Years' '65+ Years'
 '4

In [7]:
from sklearn.impute import SimpleImputer

columns = train_df.columns
print(columns)

# Fill NaN values in categorical columns with a placeholder i.e. Mode Value
for col in columns:
    mode_value = train_df[col].mode()[0]
    imputer = SimpleImputer(strategy='constant', fill_value= mode_value)
    train_df[[col]] = imputer.fit_transform(train_df[[col]])

for col in columns:
    mode_value = test_df[col].mode()[0]
    imputer = SimpleImputer(strategy='constant', fill_value= mode_value)
    test_df[[col]] = imputer.fit_transform(test_df[[col]])

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children'],
      dtype='object')


In [8]:
object_columns = train_df.select_dtypes(include=['object']).columns
print(object_columns)

#Using One Hot Encoder to Encode the Object Data Columns
from sklearn.preprocessing import OneHotEncoder

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the OneHotEncoder on the categorical columns
onehot_encoded = onehot_encoder.fit_transform(train_df[object_columns])
onehot_encoded_test = onehot_encoder.fit_transform(test_df[object_columns])

# Create a DataFrame to view the encoded data
encoded_df = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names_out(object_columns))
encoded_test_df = pd.DataFrame(onehot_encoded_test, columns=onehot_encoder.get_feature_names_out(object_columns))

# Concatenate with the original DataFrame
train_df = pd.concat([train_df.drop(columns=object_columns), encoded_df], axis=1)
test_df = pd.concat([test_df.drop(columns=object_columns), encoded_test_df], axis=1)

# Print the DataFrame to verify
print(train_df.shape)
train_df.dtypes


Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa'],
      dtype='object')
(26707, 62)


respondent_id                            int64
xyz_concern                            float64
xyz_knowledge                          float64
behavioral_antiviral_meds              float64
behavioral_avoidance                   float64
                                        ...   
hhs_geo_region_oxchjgsf                float64
hhs_geo_region_qufhixun                float64
census_msa_MSA, Not Principle  City    float64
census_msa_MSA, Principle City         float64
census_msa_Non-MSA                     float64
Length: 62, dtype: object

In [9]:
nnull_cols=train_df.isnull().sum()
for col, count in nnull_cols.items():
    print(f"{col}: {count}")
train_df.head()

respondent_id: 0
xyz_concern: 0
xyz_knowledge: 0
behavioral_antiviral_meds: 0
behavioral_avoidance: 0
behavioral_face_mask: 0
behavioral_wash_hands: 0
behavioral_large_gatherings: 0
behavioral_outside_home: 0
behavioral_touch_face: 0
doctor_recc_xyz: 0
doctor_recc_seasonal: 0
chronic_med_condition: 0
child_under_6_months: 0
health_worker: 0
health_insurance: 0
opinion_xyz_vacc_effective: 0
opinion_xyz_risk: 0
opinion_xyz_sick_from_vacc: 0
opinion_seas_vacc_effective: 0
opinion_seas_risk: 0
opinion_seas_sick_from_vacc: 0
household_adults: 0
household_children: 0
age_group_18 - 34 Years: 0
age_group_35 - 44 Years: 0
age_group_45 - 54 Years: 0
age_group_55 - 64 Years: 0
age_group_65+ Years: 0
education_12 Years: 0
education_< 12 Years: 0
education_College Graduate: 0
education_Some College: 0
race_Black: 0
race_Hispanic: 0
race_Other or Multiple: 0
race_White: 0
sex_Female: 0
sex_Male: 0
income_poverty_<= $75,000, Above Poverty: 0
income_poverty_> $75,000: 0
income_poverty_Below Poverty: 

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,hhs_geo_region_fpwskwrf,hhs_geo_region_kbazzjca,hhs_geo_region_lrircsnp,hhs_geo_region_lzgpxyit,hhs_geo_region_mlyzmhmf,hhs_geo_region_oxchjgsf,hhs_geo_region_qufhixun,"census_msa_MSA, Not Principle City","census_msa_MSA, Principle City",census_msa_Non-MSA
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [10]:
nnull_cols_op=train_output_df.isnull().sum()
for col, count in nnull_cols_op.items():
    print(f"{col}: {count}")
train_df.head()
#Train Data Output have all the values

respondent_id: 0
xyz_vaccine: 0
seasonal_vaccine: 0


Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,hhs_geo_region_fpwskwrf,hhs_geo_region_kbazzjca,hhs_geo_region_lrircsnp,hhs_geo_region_lzgpxyit,hhs_geo_region_mlyzmhmf,hhs_geo_region_oxchjgsf,hhs_geo_region_qufhixun,"census_msa_MSA, Not Principle City","census_msa_MSA, Principle City",census_msa_Non-MSA
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [11]:
nnull_test_cols=test_df.isnull().sum()
for col, count in nnull_test_cols.items():
    print(f"{col}: {count}")
test_df.head()

print(test_df.shape)


respondent_id: 0
xyz_concern: 0
xyz_knowledge: 0
behavioral_antiviral_meds: 0
behavioral_avoidance: 0
behavioral_face_mask: 0
behavioral_wash_hands: 0
behavioral_large_gatherings: 0
behavioral_outside_home: 0
behavioral_touch_face: 0
doctor_recc_xyz: 0
doctor_recc_seasonal: 0
chronic_med_condition: 0
child_under_6_months: 0
health_worker: 0
health_insurance: 0
opinion_xyz_vacc_effective: 0
opinion_xyz_risk: 0
opinion_xyz_sick_from_vacc: 0
opinion_seas_vacc_effective: 0
opinion_seas_risk: 0
opinion_seas_sick_from_vacc: 0
household_adults: 0
household_children: 0
age_group_18 - 34 Years: 0
age_group_35 - 44 Years: 0
age_group_45 - 54 Years: 0
age_group_55 - 64 Years: 0
age_group_65+ Years: 0
education_12 Years: 0
education_< 12 Years: 0
education_College Graduate: 0
education_Some College: 0
race_Black: 0
race_Hispanic: 0
race_Other or Multiple: 0
race_White: 0
sex_Female: 0
sex_Male: 0
income_poverty_<= $75,000, Above Poverty: 0
income_poverty_> $75,000: 0
income_poverty_Below Poverty: 

In [12]:
for i in train_output_df.columns:
    print(train_output_df[i].unique())

[    0     1     2 ... 26704 26705 26706]
[0 1]
[0 1]


In [13]:
#Removing Highly Corelated Columns for Less Impact on Results

# Calculate the correlation matrix
corr_matrix = train_df.corr()

# We are interested in absolute correlation values above a certain threshold
threshold = 0.95
high_corr_pairs = np.where(np.abs(corr_matrix) > threshold)
high_corr_pairs = [(corr_matrix.index[x], corr_matrix.columns[y]) for x, y in zip(*high_corr_pairs) if x != y and x < y]

# Remove one column from each pair
columns_to_remove = set()
for col1, col2 in high_corr_pairs:
    columns_to_remove.add(col2)

# Remove the identified columns
train_df = train_df.drop(columns=list(columns_to_remove))
test_df = test_df.drop(columns=list(columns_to_remove))

# Display the resulting DataFrame
print("Columns  related:",high_corr_pairs)
print("Remaining columns:", train_df.columns)
print(train_df.head())

#But Columns were just complement of each other so removed 

Columns  related: [('sex_Female', 'sex_Male'), ('marital_status_Married', 'marital_status_Not Married'), ('rent_or_own_Own', 'rent_or_own_Rent')]
Remaining columns: Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
       'household_children', 'age_group_18 - 34 Years',
       'age_group_35 - 44 Years', 'age_group_45 - 54 Years',
       'age_group_55 - 64 Years', 'age_group_65+ Years', 'education_12 Years',
       'education_< 12 Years',

In [19]:
#Using Naive Bayes For Probabilities
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.metrics import accuracy_score

model = GaussianNB()

#xyz vaccine
model.fit(train_df, train_output_df['xyz_vaccine'])

probability_xyz = model.predict_proba(test_df)[:, 1]
submission_format['h1n1_vaccine']=probability_xyz.round(4)


#seasonal_vaccine

model_sv = GaussianNB()
model_sv.fit(train_df,train_output_df['seasonal_vaccine'])

probability_sv = model_sv.predict_proba(test_df)[:, 1]
submission_format['seasonal_vaccine']=probability_sv.round(4)


In [20]:
submission_format.head(20)

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.0587,0.0597
1,26708,0.0004,0.0001
2,26709,0.9219,0.974
3,26710,0.3989,0.9919
4,26711,0.3714,0.6717
5,26712,0.9352,0.9865
6,26713,0.4546,0.986
7,26714,0.0192,0.1452
8,26715,0.0023,0.1656
9,26716,0.3381,0.9892


In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
from sklearn.svm import SVC

In [18]:
from sklearn.ensemble import RandomForestClassifier