In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import numpy as np
# Lift pandas' display limits so wide frames are rendered in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Read seasonal.csv from the working directory into the main DataFrame.
data = pd.read_csv(filepath_or_buffer="seasonal.csv")

In [3]:
# Peek at the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,respondent_id,medicine_taken,reccomendation,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,Target
0,0,0.0,0.0,2.0,1.0,2.0,0
1,1,0.0,0.0,4.0,2.0,4.0,1
2,2,0.0,0.0,4.0,1.0,2.0,0
3,3,0.0,1.0,5.0,4.0,1.0,1
4,4,0.0,0.0,3.0,1.0,4.0,0


In [4]:
# Take chronic_med_condition from the raw training features and store it on
# `data` as "chronic". The assignment aligns on the row index, so this
# presumably relies on both files sharing the same row order — TODO confirm.
training_features = pd.read_csv("training_set_features.csv")
data["chronic"] = training_features["chronic_med_condition"]

In [5]:
# Replace missing "chronic" entries with the column's most frequent value.
most_frequent_chronic = data["chronic"].mode()[0]
data["chronic"] = data["chronic"].fillna(most_frequent_chronic)

In [6]:
# Count the missing values left in "chronic" after imputation.
data["chronic"].isna().sum()

0

In [7]:
# Model inputs: the five survey columns from seasonal.csv plus "chronic".
feature_columns = [
    'medicine_taken',
    'reccomendation',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    "chronic",
]
features = data[feature_columns]

# Label column to predict.
target = data["Target"]

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=42,
    test_size=0.2,
)

In [9]:
# Preview only the first rows: display.max_rows is set to None in the setup
# cell, so a bare `X_test` would render every row of the hold-out split.
X_test.head(10)

Unnamed: 0,medicine_taken,reccomendation,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,chronic
3578,0.0,0.0,4.0,1.0,1.0,0.0
17410,0.0,0.0,1.0,1.0,1.0,1.0
17863,0.0,0.0,4.0,1.0,1.0,1.0
26051,0.0,0.0,5.0,4.0,5.0,0.0
18868,0.0,0.0,5.0,4.0,4.0,0.0
17148,0.0,1.0,4.0,2.0,1.0,1.0
6425,0.0,0.0,5.0,5.0,2.0,0.0
22989,0.0,0.0,4.0,4.0,1.0,1.0
11500,0.0,0.0,5.0,5.0,1.0,0.0
2383,0.0,1.0,2.0,1.0,4.0,1.0


In [10]:
# Logistic regression on the training split; max_iter is raised to 1000 for
# the lbfgs solver.
model = LogisticRegression(max_iter=1000, solver="lbfgs")
model.fit(X=X_train, y=y_train)

In [11]:
# Hold-out predictions: the probability in column 1 of predict_proba, and the
# hard class labels.
holdout_probabilities = model.predict_proba(X=X_test)
y_pred_prob = holdout_probabilities[:, 1]
y_pred = model.predict(X=X_test)

In [12]:
# Fraction of hold-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7463276836158192


In [13]:
# Precision of the predicted labels on the hold-out split.
precision = precision_score(y_true=y_test, y_pred=y_pred)
print(f'Precision: {precision}')

Precision: 0.7412410299704517


In [14]:
# ROC AUC is computed from the predicted probabilities, not the hard labels.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print(f'ROC AUC: {roc_auc}')

ROC AUC: 0.8239883648067448


In [15]:
# Display the hold-out probabilities (column 1 of model.predict_proba on
# X_test, computed in an earlier cell).
y_pred_prob

array([0.22304638, 0.04406783, 0.22624986, ..., 0.41181377, 0.82067155,
       0.67892631])

In [16]:
# Read the submission feature file from the working directory.
test = pd.read_csv(filepath_or_buffer="test_set_features.csv")

In [17]:
# Peek at the first five rows of the raw test features.
test.head(n=5)

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_xyz_vacc_effective,opinion_xyz_risk,opinion_xyz_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,1.0,1.0,5.0,1.0,1.0,35 - 44 Years,College Graduate,Hispanic,Female,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,1.0,4.0,1.0,1.0,18 - 34 Years,12 Years,White,Male,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,5.0,4.0,2.0,5.0,4.0,4.0,55 - 64 Years,College Graduate,White,Male,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,4.0,2.0,2.0,4.0,4.0,2.0,65+ Years,12 Years,White,Female,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,2.0,4.0,4.0,4.0,2.0,35 - 44 Years,12 Years,Black,Female,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird


In [18]:
# Keep only the raw survey columns that feed the model; they are renamed to
# the training feature names in a later cell.
test_feature_columns = [
    'behavioral_antiviral_meds',
    'doctor_recc_seasonal',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    'chronic_med_condition',
]
test = test.loc[:, test_feature_columns]

In [19]:
# Peek at the first five rows after column selection.
test.head(n=5)

Unnamed: 0,behavioral_antiviral_meds,doctor_recc_seasonal,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,chronic_med_condition
0,0.0,0.0,5.0,1.0,1.0,0.0
1,0.0,0.0,4.0,1.0,1.0,0.0
2,0.0,0.0,5.0,4.0,4.0,0.0
3,0.0,1.0,4.0,4.0,2.0,1.0
4,1.0,0.0,4.0,4.0,2.0,0.0


In [20]:
# Missing-value count for each remaining test column.
test.isna().sum()

behavioral_antiviral_meds        79
doctor_recc_seasonal           2160
opinion_seas_vacc_effective     452
opinion_seas_risk               499
opinion_seas_sick_from_vacc     521
chronic_med_condition           932
dtype: int64

In [21]:
# Map the raw survey column names onto the feature names the model was
# fitted with.
column_aliases = {
    'behavioral_antiviral_meds': 'medicine_taken',
    'doctor_recc_seasonal': 'reccomendation',
    'chronic_med_condition': 'chronic',
}
test = test.rename(columns=column_aliases)

In [23]:
# Impute missing answers with the most frequent value of each feature in the
# training split. Using training statistics (rather than the test set's own
# modes) keeps preprocessing consistent with the data the model was fitted on
# and makes each row's features independent of the rest of the scored batch.
train_modes = X_train.mode().iloc[0].to_dict()
test = test.fillna(train_modes)

# Fail fast if any column was missed, instead of letting NaN reach the model.
assert not test.isna().any().any(), "test features still contain missing values"

In [24]:
# Score the submission rows: hard labels, plus the probability in column 1 of
# predict_proba.
y_ = model.predict(X=test)
submission_probabilities = model.predict_proba(X=test)
y_prob = submission_probabilities[:, 1]

In [25]:
# Number of predicted probabilities; y_prob is a one-dimensional slice, so its
# first dimension is its total size.
y_prob.shape[0]

26708

In [27]:
# Read the existing submission file whose seasonal_vaccine column gets set.
abcd = pd.read_csv(filepath_or_buffer='submission_file.csv')

In [28]:
# Assign the probability array positionally. Wrapping it in pd.Series would
# align on index labels and silently insert NaN (or drop values) on any
# mismatch, whereas a raw array must match the frame's length or pandas
# raises ValueError.
abcd['seasonal_vaccine'] = y_prob

In [29]:
# Check the first ten rows of the submission frame.
abcd.head(n=10)

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.161887,0.347008
1,26708,0.077481,0.223046
2,26709,0.403502,0.534904
3,26710,0.41959,0.82392
4,26711,0.140895,0.452457
5,26712,0.272764,0.808761
6,26713,0.40054,0.343737
7,26714,0.119965,0.223046
8,26715,0.041355,0.134263
9,26716,0.135718,0.663878


In [None]:
# Write the submission frame to disk without the row index.
abcd.to_csv(path_or_buf='submission_format.csv', index=False)