# Revised baseline based on https://www.medrxiv.org/content/10.1101/2021.10.07.21264416v1.full.pdf

In [2]:
!pip install python-javabridge
!pip install python-weka-wrapper3
!pip install arff




In [3]:
import pandas as pd
from pandas.core.frame import DataFrame
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import column_or_1d
import arff
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.attribute_selection import ASSearch
from weka.attribute_selection import ASEvaluation
from weka.attribute_selection import AttributeSelection
import weka
from typing import Tuple, Any, Union
import random
import pickle
import uuid

In [4]:
##TODO: change directory to cwd if run on colab
%cd /content/drive/MyDrive/Colab Notebooks/vogel
%ls

/content/drive/MyDrive/Colab Notebooks/vogel
 Baseline.ipynb
'Copy of Glycomics_UGA4_Day0Day28.xlsx'
'Copy of Metabolomics_UGA4_D0toDay28.xlsx'
'Copy of metabolomics_UGA4_global_median_normalized_sample_only.quantified.txt'
'Copy of metabolomics_UGA4_untargeted_median_normalized_sample_only.quantified.txt'
'Copy of Proteomics_UGA4_Day0.xlsx'
'Copy of Transcriptomics_UGA4_day0_logCPMcounts.xlsx'
 data.arff
 data_inspect.ipynb
'fluvacc metadata - UGA1-5 - 1368 entries with clear vacc status - for figures.txt'
 meta.txt


In [5]:
df = pd.read_csv("fluvacc metadata - UGA1-5 - 1368 entries with clear vacc status - for figures.txt", sep="\t")  ##TODO: put the name of the file here
df.head()

Unnamed: 0,Cohort,ID,Age,BMI,BMI_category,Gender,Race,Comorbidities,PreVacc_status,PreVacc_status_year,...,SC_category_IBV_Yam,SC_category_IBV_Vic,Composite_SC_category,Serostatus_H1N1,Serostatus_H3N2,Serostatus_IBV_Yam,Serostatus_IBV_Vic,Num_SeroPos_strains,Baseline_category_Num_SeroPos_strains,Composite_baseline
0,UGA1,A10,25,33.71,Obese,Male,White,No,Prevaccinated,Year_0,...,,,,1,1,1,1,4,High,35.0
1,UGA1,A100,47,33.37,Obese,Female,White,No,Prevaccinated,Year_0,...,,High,Low,0,0,1,0,1,Low,18.0
2,UGA1,A101,30,24.99,Normal,Male,Non-white,Yes,Naive,Year_3,...,High,High,High,0,0,1,1,2,Low,21.0
3,UGA1,A102,47,25.0,Overweight,Male,White,No,Naive,Year_3,...,Low,High,High,1,1,1,0,3,High,25.0
4,UGA1,A103,20,22.1,Normal,Female,Non-white,No,Naive,Year_3,...,High,High,High,1,1,1,1,4,High,21.0


In [6]:
for column in df:
  print(column, sorted(df[column].unique()))

Cohort ['UGA1', 'UGA2', 'UGA3', 'UGA4', 'UGA5']
ID ['A1', 'A10', 'A100', 'A101', 'A102', 'A103', 'A104', 'A105', 'A106', 'A107', 'A108', 'A109', 'A11', 'A110', 'A111', 'A112', 'A113', 'A114', 'A115', 'A116', 'A118', 'A119', 'A120', 'A122', 'A123', 'A124', 'A125', 'A126', 'A127', 'A128', 'A129', 'A13', 'A130', 'A131', 'A132', 'A133', 'A134', 'A135', 'A136', 'A137', 'A138', 'A139', 'A14', 'A140', 'A141', 'A142', 'A143', 'A144', 'A145', 'A146', 'A147', 'A148', 'A149', 'A15', 'A150', 'A151', 'A152', 'A154', 'A155', 'A156', 'A157', 'A158', 'A16', 'A160', 'A161', 'A162', 'A163', 'A165', 'A166', 'A168', 'A169', 'A17', 'A170', 'A171', 'A172', 'A174', 'A175', 'A176', 'A177', 'A178', 'A179', 'A18', 'A180', 'A181', 'A182', 'A184', 'A185', 'A186', 'A188', 'A19', 'A190', 'A191', 'A192', 'A193', 'A194', 'A195', 'A196', 'A197', 'A199', 'A2', 'A20', 'A200', 'A201', 'A203', 'A204', 'A205', 'A206', 'A207', 'A208', 'A209', 'A21', 'A210', 'A211', 'A212', 'A213', 'A214', 'A215', 'A216', 'A217', 'A218', 'A2

# PreProcessing
Some codes are reorganized for new implementations.

In [7]:
class CustomLabelEncoder(LabelEncoder):
  def __init__(self, month_one_hot=True):
      self.month_one_hot = month_one_hot
  def fit(self, y, ordering: list):  # take an ordering list as an argument
      y = column_or_1d(y, warn=True)
      assert set(ordering) == set(pd.Series(y).unique())
      self.classes_ = ordering
      return self


In [17]:
def basic_preprocessing(df, month_one_hot=True, normalize=True, exclude=True, binary_classification=True):
  df.loc[df["BMI_category"] == "Normal ", "BMI_category"] = "Normal"
  df.loc[df["BMI_category"] == "Obese ", "BMI_category"] = "Obese"
  
  df["BMI_log"] = np.log(df["BMI"])
  df["Age_log"] = np.log(df["Age"])
  def _normalize(data):
      scaler = StandardScaler()
      scaler.fit(data)
      data = scaler.transform(data)
      return data

  if normalize:
    numerical_cols = ["BMI_log", "Age_log", "Age", "BMI","D0_Titer_H1N1","D0_Titer_H3N2", "D0_Titer_IBV_Yam", "D0_Titer_IBV_Vic"]
    df[numerical_cols] = _normalize(df[numerical_cols])

  bmi_cat = df["BMI_category"]
  df2 = df.drop("BMI_category", axis=1)
  comorbidities = df["Comorbidities"]
  df2 = df2.drop("Comorbidities", axis=1)
  dose = df["Vaccine_dose"]
  df2 = df2.drop("Vaccine_dose", axis=1)
  month = None
  if not month_one_hot:
    month = df["Month_vaccinated"]
    df2 = df2.drop("Month_vaccinated", axis=1)

  pre_vac = df["PreVacc_status"]
  df2 = df2.drop("PreVacc_status", axis=1)
  pre_vac_year = df["PreVacc_status_year"]
  df2 = df2.drop("PreVacc_status_year", axis=1)
  h1n1 = df["SC_category_H1N1"]
  df2 = df2.drop("SC_category_H1N1", axis=1)
  h3n2 = df["SC_category_H3N2"]
  df2 = df2.drop("SC_category_H3N2", axis=1)
  yam = df["SC_category_IBV_Yam"]
  df2 = df2.drop("SC_category_IBV_Yam", axis=1)
  vic = df["SC_category_IBV_Vic"]
  df2 = df2.drop("SC_category_IBV_Vic", axis=1)
  comp = df["Composite_SC_category"]
  df2 = df2.drop("Composite_SC_category", axis=1)
  base = df["Baseline_category_Num_SeroPos_strains"]
  df2 = df2.drop("Baseline_category_Num_SeroPos_strains", axis=1)

  for i, column in enumerate(df2.select_dtypes(exclude=["number"]).iloc[:,2:].columns):
    temp = pd.get_dummies(df2[column], drop_first=True, prefix=column)
    df = df.drop(column, axis=1)
    if i != 0 :
      x = x.join(temp)
    else:
      x = temp
  
  # mapping from columns to correct ordering
  ordering_dict = {"BMI_category": (bmi_cat, ["Lean", "Normal", "Overweight", "Obese"]),
                 "Comorbidities": (comorbidities, ["No", "Yes"]),
                 "Vaccine_dose": (dose, ["Standard", "High"]),
                 "PreVacc_status": (pre_vac, ["Naive", "Prevaccinated"]),
                 "PreVacc_status_year": (pre_vac_year, ["Year_3", "Year_0"]),
                 "SC_category_H1N1": (h1n1, ["None", "Low", "High"]),
                 "SC_category_H3N2": (h3n2, ["None", "Low", "High"]),
                 "SC_category_IBV_Yam": (yam, ["None", "Low", "High"]),
                 "SC_category_IBV_Vic": (vic, ["None", "Low", "High"]),
                 "Composite_SC_category": (comp, ["None", "Low", "High"]),
                 "Baseline_category_Num_SeroPos_strains": (base, ["Low", "High"])}  
  if not month_one_hot:
    ordering_dict["Month_vaccinated"] = (month, ["Sep.", "Oct.", "Nov.", "Dec.", "Jan.", "Feb."])
  if binary_classification:
    ordering_dict["SC_category_H1N1"][1].pop(0)
    ordering_dict["SC_category_H3N2"][1].pop(0)
    ordering_dict["SC_category_IBV_Yam"][1].pop(0)
    ordering_dict["SC_category_IBV_Vic"][1].pop(0)
    ordering_dict["Composite_SC_category"][1].pop(0)
    h1n1.loc[h1n1 == "None"] = "Low"
    h3n2.loc[h3n2 == "None"] = "Low"
    yam.loc[yam == "None"] = "Low"
    vic.loc[vic == "None"] = "Low"
    comp.loc[comp == "None"] = "Low"

  lencoder = CustomLabelEncoder(month_one_hot)
  for key, value in ordering_dict.items():
    lencoder.fit(value[0], value[1])
    df.loc[:, key] = lencoder.transform(value[0])

  new = df.join(x)
  new_UGA4 = new[new.iloc[:,0] == "UGA4"]  # get two versions of the data: UGA4 and full version; drop the ID columns
  new_UGA4_strip = new_UGA4.iloc[:,2:]
  if exclude:
    new = new[new.iloc[:,0] != "UGA4"]
  new_strip = new.iloc[:,2:]
  return new_strip, new_UGA4_strip

In [45]:
def io_selector(data: DataFrame, 
                input_columns: Union[list, str] = None, # 'all' for all input columns. Used for feature selection
                output_column: str = None, 
                month_one_hot: bool = True,
                split_feature_label=True) -> Union[Tuple[pd.DataFrame, ...], pd.DataFrame]:
  
  
  complete_input_columns = ["Age_log", "BMI_log", "BMI_category", "Gender_Male", "Race_White", 
                              "Comorbidities", "PreVacc_status", "PreVacc_status_year", 
                             "Vaccine_dose", "D0_Titer_H1N1", 
                           "D0_Titer_H3N2", "D0_Titer_IBV_Yam", "D0_Titer_IBV_Vic"]

  default_input_columns = ["Age_log", "BMI_log", "Gender_Male", "Race_White", "Comorbidities", 
                           "PreVacc_status", "Vaccine_dose",
                           "D0_Titer_H1N1", "D0_Titer_H3N2", "D0_Titer_IBV_Yam", 
                           "D0_Titer_IBV_Vic"]
  name_mapping = {"Gender": ["Gender_Male"], "Race": ["Race_White"]}
  if month_one_hot:
    default_input_columns.extend(["Month_vaccinated_Jan.", "Month_vaccinated_Feb.",
                           "Month_vaccinated_Sep.", "Month_vaccinated_Oct.",
                           "Month_vaccinated_Nov."])
    complete_input_columns.extend(["Month_vaccinated_Jan.", "Month_vaccinated_Feb.",
                           "Month_vaccinated_Sep.", "Month_vaccinated_Oct.",
                           "Month_vaccinated_Nov."])
    name_mapping["Month_vaccinated"] = ["Month_vaccinated_Jan.",
                                       "Month_vaccinated_Feb.",
                                       "Month_vaccinated_Sep.", 
                                       "Month_vaccinated_Oct.", 
                                       "Month_vaccinated_Nov."]
  else:
    default_input_columns.append("Month_vaccinated")
    complete_input_columns.append("Month_vaccinated")
    
                           
  default_output = ["Composite_seroconversion"]

  
  if input_columns is None: 
    input_columns = default_input_columns
  elif input_columns == "all":
    input_columns = complete_input_columns
  else:
    temp = []
    for item in input_columns:
      if item in name_mapping:
        temp.extend(name_mapping[item])
      else:
        temp.append(item)
    input_columns = temp
  if output_column is None:
    output_column = default_output
  x = data[input_columns]
  y = data[output_column]
  if not split_feature_label:
    return x.join(y)
  return (x, y)


In [28]:
# handles data converting and saving. Use feature_selection_dataset_wrapper if feature selection is needed
def dataset_wrapper(train: DataFrame,
                    test: DataFrame, 
                    input_columns: Union[list,str] = None, 
                    output_column: str = None,
                    save: bool = False,  # whether to save data in cwd
                    name_extension: str = "",  # name extension for file save
                    to_numpy: bool = True  # whether to convert to numpy, set to false if feature selection is needed
                    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:  # returns X_train, y_train, X_dev, y_dev, X_test, y_test
  X_train, y_train = io_selector(train, input_columns, output_column)
  X_test, y_test = io_selector(test, input_columns, output_column)
  if save:
    X_train.to_csv("X_train_" + name_extension + ".csv", index=False)
    y_train.to_csv("y_train_" + name_extension + ".csv", index=False)
    X_test.to_csv("X_test_" + name_extension + ".csv", index=False)
    y_test.to_csv("y_test_" + name_extension + ".csv", index=False)
  if to_numpy:
    X_train = X_train.to_numpy()
    y_train = y_train.to_numpy()
    X_test = X_test.to_numpy()
    y_test = y_test.to_numpy()
  return X_train, y_train, X_test, y_test

# Feature Selection using Weka

In [11]:
def convert_io_into_arff(df: pd.DataFrame):
  arff.dump("data.arff",
            df.values,
            relation=df.columns[-1],
            names=df.columns
            )

In [19]:
# https://github.com/fracpete/python-weka-wrapper3-examples/blob/master/src/wekaexamples/attribute_selection/attribute_selection.py
def feature_selection(filename: str = "data.arff", result_type: str = "rank"): 
  loader = Loader("weka.core.converters.ArffLoader")
  data = loader.load_file(filename)
  data.class_is_last()
  evaluation = ASEvaluation(classname="weka.attributeSelection.WrapperSubsetEval", options=["-B", "weka.classifiers.functions.LinearRegression", "-F", "10"])
  attsel = AttributeSelection()
  attsel.folds(10)
  attsel.crossvalidation(True)
  attsel.evaluator(evaluation)
  if result_type == "filtered":
    search = ASSearch(classname="weka.attributeSelection.BestFirst", options=["-D", "1", "-N", "7"])
    attsel.search(search)
    attsel.select_attributes(data)
    print("# attributes: " + str(attsel.number_attributes_selected))
    print("attributes (as numpy array): " + str(attsel.selected_attributes))
    print("attributes (as list): " + str(list(attsel.selected_attributes)))
    print("result string:\n" + attsel.results_string)
    print("transformed header:\n" + str(evaluation.transformed_header()))
    print("\ntransformed data:\n" + str(evaluation.transformed_data(data)))
    print("\nconvert instance:\n" + str(evaluation.convert_instance(data.get_instance(0))))


  elif result_type == "rank":
    search = ASSearch(classname="weka.attributeSelection.Ranker")
    attsel = AttributeSelection()
    attsel.ranking(True)
    attsel.search(search)
    attsel.select_attributes(data)
    print("ranked attributes:\n" + str(attsel.ranked_attributes))
    print("result string:\n" + attsel.results_string)


In [31]:
def feature_selection_dataset_wrapper(train: DataFrame,
                    test: DataFrame, 
                    input_columns: Union[list,str] = None, 
                    output_column: str = None,
                    result_type: str = "rank", # result_type: rank: ranking of all attribute; filtered: resulting attributes
                    save: bool = False,  # whether to save data in cwd
                    name_extension: str = "",  # name extension for file save
                    to_numpy: bool = True  # whether to convert to numpy, set to false if feature selection is needed
                    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    
  train_df = io_selector(train, input_columns, output_column, split_feature_label=False)
  test_df = io_selector(test, input_columns, output_column, split_feature_label=False)
  convert_io_into_arff(train_df)
  feature_selection(result_type=result_type)
  
  
  
  '''if save:
    X_train.to_csv("X_train_" + name_extension + ".csv", index=False)
    y_train.to_csv("y_train_" + name_extension + ".csv", index=False)
    X_test.to_csv("X_test_" + name_extension + ".csv", index=False)
    y_test.to_csv("y_test_" + name_extension + ".csv", index=False)
  if to_numpy:
    X_train = X_train.to_numpy()
    y_train = y_train.to_numpy()
    X_test = X_test.to_numpy()
    y_test = y_test.to_numpy()
  return X_train, y_train, X_test, y_test'''

In [14]:
jvm.start()

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['/usr/local/lib/python3.7/dist-packages/javabridge/jars/rhino-1.7R4.jar', '/usr/local/lib/python3.7/dist-packages/javabridge/jars/runnablequeue.jar', '/usr/local/lib/python3.7/dist-packages/javabridge/jars/cpython.jar', '/usr/local/lib/python3.7/dist-packages/weka/lib/python-weka-wrapper.jar', '/usr/local/lib/python3.7/dist-packages/weka/lib/weka.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support disabled


In [22]:
df = pd.read_csv("fluvacc metadata - UGA1-5 - 1368 entries with clear vacc status - for figures.txt", sep="\t")  ##TODO: put the name of the file here
train_df, test_df = basic_preprocessing(df, normalize=False)
feature_selection_dataset_wrapper(train_df, test_df, input_columns="all", output_column="Composite_SC_category", result_type="filtered")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


# attributes: 6
attributes (as numpy array): [ 6  9 11 12 15 16 18]
attributes (as list): [6, 9, 11, 12, 15, 16, 18]
result string:


=== Attribute Selection on all input data ===

Search Method:
	Best first.
	Start set: no attributes
	Search direction: forward
	Stale search after 7 node expansions
	Total number of subsets evaluated: 166
	Merit of best subset found:    0.312

Attribute Subset Evaluator (supervised, Class (numeric): 19 Composite_SC_category):
	Wrapper Subset Evaluator
	Learning scheme: weka.classifiers.functions.LinearRegression
	Scheme options: -S 0 -R 1.0E-8 -num-decimal-places 4 
	Subset evaluation: RMSE
	Number of folds for accuracy estimation: 10

Selected attributes: 7,10,12,13,16,17 : 6
                     PreVacc_status
                     D0_Titer_H1N1
                     D0_Titer_IBV_Yam
                     D0_Titer_IBV_Vic
                     Month_vaccinated_Sep.
                     Month_vaccinated_Oct.


=== Attribute selection 10 fold cross-validati

In [25]:
train_df["Composite_SC_category"].unique()

array([0, 1])

In [33]:
from sklearn.linear_model import LinearRegression, SGDRegressor, SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

def train_wrapper(X_train: np.ndarray,  # dataset
                  y_train: np.ndarray,
                # Universal parameters
                method: str = None,  # model name
                random_state: int = 10,  # random seed
                model_path: str = None,  # path to save the model, no saving if None
                n_jobs: int = -1,  # number of CPUs to use for some models. -1 means all processors
                max_iter: int = 10000,  # max number of iterations for some models, set to -1 for SV to impose no limits 
                class_weight: Union[dict, str] = None,  # weight of each class or balanced
                penalty: str = "l2",  # logistic has the option of no penalty "None"
                # Model specific parameters:
                # SGD Regressor/Classifier
                loss: str = "squared_error",
                alpha: float = 0.01,
                l1_ratio: float = 0.15,
                SGD_epsilon: float = 0.1,
                learning_rate: str = "invscaling",
                eta0: float = 1e-2,
                power_t: float = 0.25,
                average: Union[bool, int] = False,
                # KNN Regressor/Classifier
                n_neighbors: int = 5,
                weights: str = "uniform",
                algorithm: str = "brute",
                leaf_size: int = 30,
                p: int = 2,
                metric: Union[str, callable] = "minkowski",
                metric_params: dict = None,
                # Logistic Regression
                dual: bool = False,
                C: float = 1.0,
                intercept_scaling: float = 1,
                solver: str = "lbfgs",
                multi_class: str = "auto",
                # Random Forest Classifier/Regressor
                n_estimates: int = 100,
                criterion: str = "squared_error",
                max_depth: int = None,
                min_sample_split: Union[int, float] = 2,
                min_samples_leaf: Union[int, float] = 1,
                min_weight_fraction_leaf: float = 0.0,
                max_features: Union[str, int, float] = "auto",
                max_leaf_nodes: int = None,
                min_impurity_decrease: float = 0.0,
                bootstrap: bool = True,
                oob_score: bool = True,
                ccp_alpha: float = 0.0,
                max_samples: Union[int,float] = None,
                # SVC/SVR
                kernel: str = "rbf",
                degree: int = 3,
                gamma: str = "scale",
                coef0: float = 0.0,
                shrinking: bool = True,
                probability: bool = False,
                decision_fuction_shape: str = "ovr",
                break_ties: bool = False,
                SVC_epsilon: float = 0.1,
                ):  # returns a trained model
  random.seed(random_state)
  model = LinearRegression()
  if method == "LR":
    model = LinearRegression()
  elif method == "SGDR":
    model = SGDRegressor(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, max_iter=max_iter, verbose=0, epsilon=SGD_epsilon, random_state=random_state, learning_rate=learning_rate, eta0=eta0, power_t=power_t, average=average)
  elif method == "KNNR":
    model = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, leaf_size=leaf_size, p=p, metric=metric)
  elif method == "SGDC":
    model = SGDClassifier(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, max_iter=max_iter, verbose=0, epsilon=SGD_epsilon, n_jobs=n_jobs, random_state=random_state, learning_rate=learning_rate, eta0=eta0, power_t=power_t, class_weight=class_weight, average=average)
  elif method == "KNNC":
    model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, leaf_size=leaf_size, p=p, metric=metric)
  elif method == "LOGR":
    model = LogisticRegression(penalty=penalty, dual=dual, C=C, intercept_scaling=intercept_scaling, class_weight=class_weight, random_state=random_state, solver=solver, max_iter=max_iter, multi_class=multi_class, verbose=1, n_jobs=n_jobs, l1_ratio=l1_ratio)
  elif model == "RFC":
    model = RandomForestClassifier(n_estimators=n_estimates, criterion=criterion, max_depth=max_depth, min_samples_split=min_sample_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=1, class_weight=class_weight, ccp_alpha=ccp_alpha, max_samples=max_samples)
  elif model == "RFR":
    model = RandomForestRegressor(n_estimators=n_estimates, criterion=criterion, max_depth=max_depth, min_samples_split=min_sample_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=1, ccp_alpha=ccp_alpha, max_samples=max_samples)
  elif method == "SVC":
    model = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, probability=probability, class_weight=class_weight, verbose=1, max_iter=max_iter, decision_function_shape=decision_fuction_shape, break_ties=break_ties, random_state=random_state)
  elif method == "SVR":
    model = SVR(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, C=C, shrinking=shrinking, epsilon=SVC_epsilon, verbose=1, max_iter=max_iter)
  model.fit(X_train,y_train)
  if model_path:
    with open(method + uuid.uuid1()[:6] + ".pkl", "wb") as f:
      pickle.dump(model, f)
  return model

Feature selection normalized:

In [46]:
df = pd.read_csv("fluvacc metadata - UGA1-5 - 1368 entries with clear vacc status - for figures.txt", sep="\t")  ##TODO: put the name of the file here
train_df, test_df = basic_preprocessing(df, normalize=True)
X_train, y_train, X_test, y_test = dataset_wrapper(train_df, test_df, input_columns=["PreVacc_status", 
                                                                                     "D0_Titer_H1N1",  
                                                                                     "D0_Titer_IBV_Yam", 
                                                                                     "D0_Titer_IBV_Vic", 
                                                                                     "Month_vaccinated_Jan.",
                                                                                     "Month_vaccinated_Feb.",
                                                                                     "Month_vaccinated_Sep.",
                                                                                     "Month_vaccinated_Oct.",
                                                                                     "Month_vaccinated_Nov."], 
                                                   output_column= "Composite_SC_category")
model_name_list = ["LOGR", "KNNC", "SGDC", "RFC", "SVC"]
for model_name in model_name_list:
  model = train_wrapper(X_train, y_train, method=model_name)
  print(model_name,"has accuracy ", model.score(X_test, y_test))

LOGR has accuracy  0.7863636363636364
KNNC has accuracy  0.7818181818181819
SGDC has accuracy  0.7863636363636364
RFC has accuracy  0.2112751986670537
[LibSVM]SVC has accuracy  0.7863636363636364


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  "(penalty={})".format(self.penalty)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


feature selection unnormalized:

In [48]:
df = pd.read_csv("fluvacc metadata - UGA1-5 - 1368 entries with clear vacc status - for figures.txt", sep="\t")  ##TODO: put the name of the file here
train_df, test_df = basic_preprocessing(df, normalize=False)
X_train, y_train, X_test, y_test = dataset_wrapper(train_df, test_df, input_columns=["PreVacc_status", 
                                                                                     "D0_Titer_H1N1",  
                                                                                     "D0_Titer_IBV_Yam", 
                                                                                     "D0_Titer_IBV_Vic", 
                                                                                     "Month_vaccinated_Jan.",
                                                                                     "Month_vaccinated_Feb.",
                                                                                     "Month_vaccinated_Sep.",
                                                                                     "Month_vaccinated_Oct.",
                                                                                     "Month_vaccinated_Nov."], 
                                                   output_column= "Composite_SC_category")
model_name_list = ["LOGR", "KNNC", "SGDC", "RFC", "SVC"]
for model_name in model_name_list:
  model = train_wrapper(X_train, y_train, method=model_name)
  print(model_name,"has accuracy ", model.score(X_test, y_test))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  "(penalty={})".format(self.penalty)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished


LOGR has accuracy  0.7863636363636364
KNNC has accuracy  0.7090909090909091
SGDC has accuracy  0.7181818181818181
RFC has accuracy  0.2112751986670538
[LibSVM]SVC has accuracy  0.7181818181818181


default features normalized:

In [47]:
df = pd.read_csv("fluvacc metadata - UGA1-5 - 1368 entries with clear vacc status - for figures.txt", sep="\t")  ##TODO: put the name of the file here
train_df, test_df = basic_preprocessing(df, normalize=True)
X_train, y_train, X_test, y_test = dataset_wrapper(train_df, test_df, 
                                                   output_column= "Composite_SC_category")
model_name_list = ["LOGR", "KNNC", "SGDC", "RFC", "SVC"]
for model_name in model_name_list:
  model = train_wrapper(X_train, y_train, method=model_name)
  print(model_name,"has accuracy ", model.score(X_test, y_test))

LOGR has accuracy  0.8022727272727272
KNNC has accuracy  0.759090909090909
SGDC has accuracy  0.7863636363636364
RFC has accuracy  0.2298502245105043
[LibSVM]SVC has accuracy  0.7818181818181819


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  "(penalty={})".format(self.penalty)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


default features unnormalized:

In [39]:
df = pd.read_csv("fluvacc metadata - UGA1-5 - 1368 entries with clear vacc status - for figures.txt", sep="\t")  ##TODO: put the name of the file here
train_df, test_df = basic_preprocessing(df, normalize=False)
X_train, y_train, X_test, y_test = dataset_wrapper(train_df, test_df, 
                                                   output_column= "Composite_SC_category")
model_name_list = ["LOGR", "KNNC", "SGDC", "RFC", "SVC"]
for model_name in model_name_list:
  model = train_wrapper(X_train, y_train, method=model_name)
  print(model_name,"has accuracy ", model.score(X_test, y_test))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  "(penalty={})".format(self.penalty)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.9s finished


LOGR has accuracy  0.8022727272727272
KNNC has accuracy  0.6863636363636364
SGDC has accuracy  0.2818181818181818
RFC has accuracy  0.22985022451050507
[LibSVM]SVC has accuracy  0.7181818181818181


NO log transform, normalized:

In [44]:
df = pd.read_csv("fluvacc metadata - UGA1-5 - 1368 entries with clear vacc status - for figures.txt", sep="\t")  ##TODO: put the name of the file here
train_df, test_df = basic_preprocessing(df, normalize=True)
X_train, y_train, X_test, y_test = dataset_wrapper(train_df, test_df, input_columns= ["Age", "BMI", "Gender_Male", "Race_White", "Comorbidities", 
                           "PreVacc_status", "Vaccine_dose",
                           "D0_Titer_H1N1", "D0_Titer_H3N2", "D0_Titer_IBV_Yam", 
                           "D0_Titer_IBV_Vic"],
                                                   output_column= "Composite_SC_category")
model_name_list = ["LOGR", "KNNC", "SGDC", "RFC", "SVC"]
for model_name in model_name_list:
  model = train_wrapper(X_train, y_train, method=model_name)
  print(model_name,"has accuracy ", model.score(X_test, y_test))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  "(penalty={})".format(self.penalty)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


LOGR has accuracy  0.7909090909090909
KNNC has accuracy  0.775
SGDC has accuracy  0.7840909090909091
RFC has accuracy  0.19808546712389707
[LibSVM]SVC has accuracy  0.7795454545454545


No log transform unnormalized:

In [41]:
df = pd.read_csv("fluvacc metadata - UGA1-5 - 1368 entries with clear vacc status - for figures.txt", sep="\t")  ##TODO: put the name of the file here
train_df, test_df = basic_preprocessing(df, normalize=False)
X_train, y_train, X_test, y_test = dataset_wrapper(train_df, test_df, input_columns= ["Age", "BMI", "Gender_Male", "Race_White", "Comorbidities", 
                           "PreVacc_status", "Vaccine_dose",
                           "D0_Titer_H1N1", "D0_Titer_H3N2", "D0_Titer_IBV_Yam", 
                           "D0_Titer_IBV_Vic"],
                                                   output_column= "Composite_SC_category")
model_name_list = ["LOGR", "KNNC", "SGDC", "RFC", "SVC"]
for model_name in model_name_list:
  model = train_wrapper(X_train, y_train, method=model_name)
  print(model_name,"has accuracy ", model.score(X_test, y_test))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  "(penalty={})".format(self.penalty)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


LOGR has accuracy  0.7931818181818182
KNNC has accuracy  0.7272727272727273
SGDC has accuracy  0.41363636363636364
RFC has accuracy  0.19808546712389874
[LibSVM]SVC has accuracy  0.7181818181818181
