In [None]:
# import packages
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Load the three competition CSV files: training features, training labels
# and the features of the submission test set.
training_features_data = pd.read_csv(
    "../input/flu-shot-learning-h1n1-seasonal-flu-vaccines/training_set_features.csv",
    sep=",",
)

training_set_labels = pd.read_csv(
    "../input/flu-shot-learning-h1n1-seasonal-flu-vaccines/training_set_labels.csv",
    sep=",",
)

test_features_data = pd.read_csv(
    "../input/flu-shot-learning-h1n1-seasonal-flu-vaccines/test_set_features.csv",
    sep=",",
)


In [None]:
# Show (rows, columns) of the test features, then of the training labels.
for frame in (test_features_data, training_set_labels):
    print(frame.shape)

# **Preprocessing for the training dataset**

In [None]:
# Eliminate null values in the training features.

# Numeric columns: replace NaN with the column mean.
# numeric_only=True restricts the mean to numeric columns; without it, newer
# pandas versions raise a TypeError when the frame also holds string columns.
training_features_data = training_features_data.fillna(
    training_features_data.mean(numeric_only=True)
)

# Remaining (non-numeric) columns: fill the gaps with an explicit placeholder category.
training_features_data = training_features_data.fillna('out-of-category')

In [None]:
# Count the remaining missing values per column (all counts should be zero after imputation).
training_features_data.isna().sum(axis=0)

In [None]:
# Encode the training features as ordinal codes (str --> float).
# NOTE(review): the encoder is fitted on the whole frame, so the numeric columns
# are replaced by category codes as well, not only the string columns —
# confirm this is intended.

from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()
training_features_data_arr = enc.fit_transform(training_features_data)

# The encoder returns a bare array; rebuild a DataFrame with the original column names.
col_names_list = training_features_data.columns
encoded_categorical_df = pd.DataFrame(training_features_data_arr, columns=col_names_list)

In [None]:
# Standardise the encoded training features.
# StandardScaler centres each column on zero mean and scales it to unit variance;
# it does not map values into the 0-1 range.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
normalized_arr = scaler.fit_transform(encoded_categorical_df)

# Wrap the scaled array back into a DataFrame that keeps the column names.
normalized_df = pd.DataFrame(normalized_arr, columns=col_names_list)

In [None]:
# Print the column summary of the scaled training frame to verify the data types.
normalized_df.info()

# **Preprocessing for the test dataset**

In [None]:
# Print the column summary of the raw test features to check their data types.
test_features_data.info()

In [None]:
# Eliminate null values in the test features.

# Numeric columns: replace NaN with the column mean.
# numeric_only=True restricts the mean to numeric columns; without it, newer
# pandas versions raise a TypeError when the frame also holds string columns.
# NOTE(review): the means are computed on the test set itself rather than taken
# from the training set — confirm this is intended.
test_features_data = test_features_data.fillna(
    test_features_data.mean(numeric_only=True)
)

# Remaining (non-numeric) columns: fill the gaps with an explicit placeholder category.
test_features_data = test_features_data.fillna('out-of-category')

In [None]:
# Count the remaining missing values per column (all counts should be zero after imputation).
test_features_data.isna().sum(axis=0)

In [None]:
# Encode the test features as ordinal codes (str --> float).
# NOTE(review): a second encoder is fitted on the test frame here, so a value only
# receives the same code as in training when both frames hold the same set of
# distinct values in that column — TODO confirm, or reuse the training encoder.

from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()
test_features_data_arr = enc.fit_transform(test_features_data)

# The encoder returns a bare array; rebuild a DataFrame with the original column names.
col_names_list = test_features_data.columns
test_encoded_categorical_df = pd.DataFrame(test_features_data_arr, columns=col_names_list)

In [None]:
# Print the column summary of the encoded test frame to verify the data types.
test_encoded_categorical_df.info()

In [None]:
# Scale the encoded test features with `scaler`, the StandardScaler that was
# fitted on the training features; only transform() is called, so the training
# statistics are reused. This standardises the columns rather than mapping
# them into the 0-1 range.

test_normalized_arr=scaler.transform(test_encoded_categorical_df)
test_normalized_df=pd.DataFrame(test_normalized_arr, columns=col_names_list)

# **Regression models**

In [None]:
#import sklearn methods 
from sklearn.metrics import roc_curve, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Features: the scaled training frame. Target: the 'h1n1_vaccine' label column as an array.
X = normalized_df
y = training_set_labels['h1n1_vaccine'].values


In [None]:
# Hold out 20% of the rows as a test set, stratified on the label.
# random_state pins the shuffle so the held-out scores are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Cross-validation splitter shared by all grid searches: 5 stratified shuffle
# splits, seeded; the per-split test size is left at the library default.
cv = StratifiedShuffleSplit(n_splits=5, random_state=42)

# Regressor-1: Decision Tree regressor

In [None]:
# Decision tree regressor tuned with a grid search.
regressor = DecisionTreeRegressor(random_state=0)

# Hyper-parameter grid.
# "squared_error" / "absolute_error" are the current spellings of the criteria
# formerly called "mse" / "mae"; the old names are rejected by recent
# scikit-learn releases.
parameters = {
    "criterion": ["squared_error", "friedman_mse", "absolute_error"],
    "splitter": ["best", "random"],
}

# Grid search over the parameters, using the shared CV splitter.
grid = GridSearchCV(estimator=regressor, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# Print the best parameter combination and its cross-validated score.
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

# Detailed per-combination results of the grid search, displayed as the cell output.
detailed_grid_results = pd.DataFrame(grid.cv_results_)
detailed_grid_results


In [None]:
# Display test scores: return the result string and the indexes of misclassified samples.
def display_test_scores(test, pred, threshold=0.5):
    """Build a short text report for a set of predictions.

    Parameters
    ----------
    test : array-like
        Ground-truth labels (assumed to be binary 0/1 — confirm against the caller).
    pred : array-like
        Predicted scores, e.g. the continuous outputs of a regressor, or hard
        0/1 labels.
    threshold : float, default 0.5
        Scores greater than or equal to this value count as a positive
        prediction when locating the misclassified samples.

    Returns
    -------
    str_out : str
        Report text containing the AUC formatted to four decimals.
    false_indexes : tuple of numpy.ndarray
        Output of ``np.where`` marking the positions where the thresholded
        prediction differs from the true label.
    """
    # Work on arrays so list inputs compare element-wise instead of as whole objects.
    test = np.asarray(test)
    pred = np.asarray(pred)

    str_out = ""
    str_out += ("TEST SCORES\n")
    str_out += ("\n")

    # AUC is computed on the raw scores, not on the thresholded labels.
    auc = roc_auc_score(test, pred)
    str_out += ("AUC: {:.4f}\n".format(auc))
    str_out += ("\n")

    # Comparing labels with continuous scores directly would flag almost every
    # sample, so turn the scores into hard predictions first.
    predicted_positive = pred >= threshold
    false_indexes = np.where(test != predicted_positive)
    return str_out, false_indexes


In [None]:
# Predict on the held-out split with the tuned model from the grid search.
y_pred = grid.predict(X_test)

# Build the score report (plus the indexes of the wrong samples) and print it.
results, false = display_test_scores(test=y_test, pred=y_pred)
print(results)

# Regressor-2: Bayesian-Ridge

In [None]:
# Bayesian ridge regression tuned over its initial alpha / lambda values.
clf_ridge = linear_model.BayesianRidge()

# Candidate hyper-parameters for the grid search.
parameters = dict(
    alpha_init=[None, 1],
    lambda_init=[1, 1e-3],
)

# Search the grid with the shared CV splitter and fit in one step.
grid = GridSearchCV(clf_ridge, parameters, cv=cv, n_jobs=-1).fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
best_summary = (grid.best_params_, grid.best_score_)
print("The best parameters are %s with a score of %0.4f\n" % best_summary)

# Score the tuned model on the held-out split and print the report.
y_pred = grid.predict(X_test)
results, false = display_test_scores(y_test, y_pred)
print(results)

# Regressor-3: SVR

In [None]:
# Support vector regressor; kernel, C and max_iter are tuned by the grid below.
regr = SVR(epsilon=0.2, C=1.0)

# Candidate hyper-parameters for the grid search.
parameters = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[0.01, 0.1, 1, 10, 100],
    max_iter=[100, 1000],
)

# Search the grid with the shared CV splitter and fit in one step.
grid = GridSearchCV(regr, parameters, cv=cv, n_jobs=-1).fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
best_summary = (grid.best_params_, grid.best_score_)
print("The best parameters are %s with a score of %0.4f\n" % best_summary)

# Score the tuned model on the held-out split and print the report.
y_pred = grid.predict(X_test)
results, false = display_test_scores(y_test, y_pred)
print(results)

# Regressor-4: SGDRegressor

In [None]:
# Linear model trained with stochastic gradient descent.
# random_state is fixed so the search is reproducible, matching the seeded
# decision tree and random forest models in this notebook.
reg = SGDRegressor(tol=1e-3, random_state=0)

# Hyper-parameter grid.
parameters = {
    'alpha': [0.0001, 0.001, 0.01, 1],
    'max_iter': [10, 100, 1000],
    'learning_rate': ['invscaling', 'optimal', 'adaptive'],
}

# Grid search over the parameters, using the shared CV splitter.
grid = GridSearchCV(estimator=reg, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# Print the best parameter combination and its cross-validated score.
print("The best parameters are %s with a score of %0.4f\n"
      % (grid.best_params_, grid.best_score_))

# Predict on the held-out split with the tuned model.
y_pred = grid.predict(X_test)

# Build and print the score report.
results, false = display_test_scores(y_test, y_pred)
print(results)

# Regressor-5: RandomForestRegressor

In [None]:
# Random forest regressor (seeded); only the number of trees is tuned.
rfr = RandomForestRegressor(random_state=0)

# Candidate hyper-parameters for the grid search.
parameters = dict(n_estimators=[20, 50, 100])

# Search the grid with the shared CV splitter and fit in one step.
grid = GridSearchCV(rfr, parameters, cv=cv, n_jobs=-1).fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
best_summary = (grid.best_params_, grid.best_score_)
print("The best parameters are %s with a score of %0.4f\n" % best_summary)

# Score the tuned model on the held-out split and print the report.
y_pred = grid.predict(X_test)
results, false = display_test_scores(y_test, y_pred)
print(results)


# t-test

>  if p-value<=0.05 --> the difference between the two models is significant (i.e. the two models clearly differ)

>  if p-value>0.05 --> the difference between the two models is NOT significant (i.e. the two models are not very different)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_absolute_error
from scipy import stats

# Number of resampled train/test splits used for the paired comparison.
n_splits = 5

# Every split holds out 20% of the rows, stratified on the label.
sss = StratifiedShuffleSplit(n_splits=n_splits, random_state=42, test_size=0.2)

# The three candidate models with the hyper-parameters chosen for the comparison.
model_1 = RandomForestRegressor(random_state=0, n_estimators=100)
# Seeded so the per-split errors (and hence the t-tests) are reproducible.
model_2 = SGDRegressor(alpha=0.001, learning_rate='adaptive', max_iter=100, random_state=0)
model_3 = linear_model.BayesianRidge(alpha_init=None, lambda_init=0.001)

# Mean absolute error of each model on each split, in split order.
cv_mae_1 = []
cv_mae_2 = []
cv_mae_3 = []

for train_idx, test_idx in sss.split(X, y):
    # split() yields positional row indices, so the frame is sliced with .iloc
    # (label-based .loc only works while the index is 0..n-1).
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    y_fold_train, y_fold_test = y[train_idx], y[test_idx]

    # Fit and score every model on the same split so the error lists stay paired.
    for model, cv_mae in ((model_1, cv_mae_1), (model_2, cv_mae_2), (model_3, cv_mae_3)):
        model.fit(X_fold_train, y_fold_train)
        fold_pred = model.predict(X_fold_test)
        cv_mae.append(mean_absolute_error(y_fold_test, fold_pred))

    

In [None]:
from scipy import stats

# Paired t-tests on the per-split MAE lists, in the order: 1 vs 2, 3 vs 2, 3 vs 1.
for first_errors, second_errors in (
    (cv_mae_1, cv_mae_2),
    (cv_mae_3, cv_mae_2),
    (cv_mae_3, cv_mae_1),
):
    print(stats.ttest_rel(first_errors, second_errors))

# Author's note (translated): we compared the three models; all differences came
# out significant, so we choose the largest one.


In [None]:
#Bayesian Ridge for regression 

#clf_ridge = linear_model.BayesianRidge(alpha_init=None, lambda_init=0.001)
#clf_ridge.fit(X,y)

# prediction results
#y_pred = clf_ridge.predict(test_normalized_df)

#y_pred = 1/(1+np.exp(-y_pred))


In [None]:
# Final model: a seeded 100-tree random forest regressor fitted on all training rows.
rfr = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

# Predictions for the preprocessed submission test set.
y_pred = rfr.predict(test_normalized_df)

In [None]:
import numpy as np

# Count the predictions that fall outside the [0, 1] interval.
predictions = np.array(y_pred)
outside_unit_interval = np.logical_or(predictions > 1, predictions < 0)
outside_unit_interval.sum(axis=0)

In [None]:
# Show the first ten predictions for the submission test set.
y_pred[:10]