In [6]:
import numpy as np
import pandas as pd

In [7]:
import pickle

In [12]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Load the feature table of the test set from its CSV file.
test_features_path = "test_set_features.csv"
df_test = pd.read_csv(test_features_path)

In [3]:
# Apply the same transformations as for the train set: start by removing
# the three columns that are not kept as features.
columns_to_drop = ['health_insurance', 'employment_industry', 'employment_occupation']
df_test = df_test.drop(columns=columns_to_drop)

In [4]:
# Fill the missing values of every column with that column's most frequent value.
# NOTE(review): the mode is computed from the test set itself; confirm whether
# the modes used on the train set should be reused here instead.
for column in df_test.columns:
    if df_test[column].isnull().any():  # only touch columns that have missing values
        mode_value = df_test[column].mode()[0]  # first (most frequent) value of the column
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on the `df_test[column]` selection is a chained
        # in-place operation: it raises a FutureWarning on pandas 2.x and does
        # not modify df_test once Copy-on-Write is enabled.
        df_test[column] = df_test[column].fillna(mode_value)

In [5]:
# Names of all non-numeric columns; these are the ones passed to the encoder.
non_numeric_frame = df_test.select_dtypes(exclude=['number'])
columns_to_encode = list(non_numeric_frame.columns)

In [8]:
# Restore the pickled encoder object from disk.
# NOTE: unpickling executes code embedded in the file, so 'encoder.pkl'
# must come from a trusted source.
file_path = 'encoder.pkl'

with open(file_path, mode='rb') as encoder_file:
    encoder = pickle.load(encoder_file)

In [25]:
# Encode the non-numeric columns with the encoder that was loaded from disk.
# NOTE(review): presumably the pickled encoder was fitted on the train set
# and produces sparse output (hence .toarray()) - confirm.

# Subset the columns to be encoded
df_to_encode = df_test[columns_to_encode]

# Only transform: calling fit_transform here would refit the loaded encoder
# on the test data and discard the categories it was loaded with, so the
# resulting columns could differ from the ones it produced before.
encoded_data = encoder.transform(df_to_encode)

# Convert the transformed data to a DataFrame. Reusing df_test's index keeps
# the rows aligned with df_test in the concatenation below.
encoded_df = pd.DataFrame(
    encoded_data.toarray(),
    columns=encoder.get_feature_names_out(columns_to_encode),
    index=df_test.index,
)

# Concatenate the encoded DataFrame with the original DataFrame
df_encoded = pd.concat([df_test, encoded_df], axis=1)

In [26]:
# Remove the original non-numeric columns now that their encoded
# counterparts have been appended.
df_encoded = df_encoded.drop(columns=columns_to_encode)

In [27]:
# Sanity check: list any column that is still non-numeric
# (an empty Index means every remaining feature is numeric).
df_encoded.select_dtypes(exclude='number').columns

Index([], dtype='object')

In [28]:
# Restore the two pickled prediction objects, one per target.
# NOTE: unpickling executes code embedded in the file, so both files
# must come from a trusted source.
with open("h1n1_classifier", mode='rb') as model_file:
    h1n1_classifier = pickle.load(model_file)

with open("seasonal_classifier", mode='rb') as model_file:
    seasonal_classifier = pickle.load(model_file)

In [29]:
# Predict h1n1_vaccine for every row and store it as a new column.
# The identifier is not a feature, so it is dropped first. Prediction columns
# are dropped as well (when present) so that re-running this cell after the
# predictions were added does not feed them to the classifier.
h1n1_features = (
    df_encoded
    .drop(columns=['respondent_id'])
    .drop(columns=['h1n1_vaccine', 'seasonal_vaccine'], errors='ignore')
)
# Build the Series on df_encoded's index so the assignment below is aligned
# row by row regardless of what that index looks like.
h1n1_vaccine = pd.Series(h1n1_classifier.predict(h1n1_features), index=df_encoded.index)
df_encoded["h1n1_vaccine"] = h1n1_vaccine

In [30]:
# Predict seasonal_vaccine for every row and store it as a new column.
# The identifier and the h1n1_vaccine prediction column are not features, so
# they are dropped first. A seasonal_vaccine column left over from a previous
# run of this cell is dropped as well, which keeps the cell re-runnable.
seasonal_features = (
    df_encoded
    .drop(columns=['respondent_id', 'h1n1_vaccine'])
    .drop(columns=['seasonal_vaccine'], errors='ignore')
)
# Build the Series on df_encoded's index so the assignment below is aligned
# row by row regardless of what that index looks like.
seasonal_vaccine = pd.Series(seasonal_classifier.predict(seasonal_features), index=df_encoded.index)
df_encoded["seasonal_vaccine"] = seasonal_vaccine

In [34]:
# Write the identifier and both prediction columns, cast to float64,
# to the submission CSV without the index column.
submission_columns = ['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']
submission = df_encoded[submission_columns].astype('float64')
submission.to_csv("submission1.csv", index=False)