In [1]:
import pandas as pd
import numpy as np
import xgboost
from google.colab import drive
import pickle
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, average_precision_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import time
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

In [2]:
%%javascript
function ClickConnect(){
console.log("Working");
document.querySelector("colab-toolbar-button#connect").click()
}setInterval(ClickConnect,60000)

<IPython.core.display.Javascript object>

In [3]:
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [4]:
reader_files_list=open("/content/gdrive/MyDrive/Studying/Computational_Learning/HW4/Data/files.txt")
files_list=reader_files_list.read().splitlines()
filepath=files_list[0]

In [5]:
def Get_Results(pred_prob, pred, y_true, num_classes):
    #np.seterr(divide='ignore', invalid='ignore') #ignore divide by zero warnings

    unique_labels=np.arange(num_classes)
    cnf_matrix = confusion_matrix(y_true, pred)

    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)  
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)

    FP = FP.astype(float)
    FN = FN.astype(float)
    TP = TP.astype(float)
    TN = TN.astype(float)

    TPR = np.nanmean(TP/(TP+FN)) #TPR
    #PPV = TP/(TP+FP) #Precision
    FPR = np.nanmean(FP/(FP+TN)) #FPR
    accuracy=accuracy_score(y_true, pred)
    roc_auc_count=0
    roc_list=[]
    pr_list=[]
    for i, label in enumerate(unique_labels):
      try:
        pr_list.append(average_precision_score((y_true==label)*1,pred_prob[:,i]))
        roc_list.append(roc_auc_score((y_true==label)*1,pred_prob[:,i]))
      except:
        pr_list.append(0.5)
        roc_list.append(0.5)
    pr_auc= np.nanmean(pr_list)
    roc_auc= np.nanmean(roc_list)
    prec=precision_score(y_true,pred,average="macro",zero_division=1)#precision
    results={"Accuracy":accuracy, "ROC_AUC":roc_auc,"PR_AUC":pr_auc,"Precision":prec,"TPR":TPR,"FPR":FPR}
    return results

In [6]:
def Train_Model(X,y,num_classes, label_ratio):
  cv=StratifiedKFold(n_splits=10, shuffle=True)
  res_list=pd.DataFrame()
  for train, test in cv.split(X,y):
    x_1, x_test, y_1,y_test=X.loc[train], X.loc[test], y.loc[train], y.loc[test]
    x_train,__,y_train,__=train_test_split(x_1,y_1,train_size=label_ratio)

    train_start=time.time()
    model=xgboost.XGBClassifier(**{"num_class":num_classes, "objective":'multi:softprob'})#.set_params()
    model.fit(x_train, y_train)
    pred_start=time.time()
    pred=model.predict(x_test)
    pred_end=time.time()
    pred_prob=model.predict_proba(x_test)
    res= Get_Results(pred_prob, pred, y_test, num_classes)
    res["train time [seconds]"]=pred_start-train_start
    res["inference time 1000 samples [seconds]"]=(pred_end-pred_start)*1000/len(y_test)
    res_list=res_list.append(res, ignore_index=True)
  results=pd.DataFrame(res_list.mean()).T
  return results


In [7]:
def Evaluate_Models(label_ratio=1):
  results_df=pd.DataFrame()
  for file_path in files_list:
    print(file_path)
    #load
    data = pd.read_csv(
      "/content/gdrive/MyDrive/Studying/Computational_Learning/HW4/Data/"+file_path)
    class_col=data.columns[-1] 
    X=data.drop(columns=[class_col])
    y=pd.Series(LabelEncoder().fit_transform(data[class_col]))
    num_features=len(X.columns.values)
    num_classes=y.nunique()
    #train
    results=Train_Model(X,y,num_classes,label_ratio)
    results["dataset name"]=file_path
    print(results)
    #save
    results=results[["dataset name", "Accuracy", "ROC_AUC", "PR_AUC", "Precision", "TPR", "FPR","inference time 1000 samples [seconds]","train time [seconds]"]]
    results_df=results_df.append(results, ignore_index=True)
  results_df.to_csv("/content/gdrive/MyDrive/Studying/Computational_Learning/HW4/Results/XGBoost"+str(label_ratio)+".csv")

In [9]:
Evaluate_Models(label_ratio=0.9)

arrhythmia.csv




   Accuracy       FPR  ...  train time [seconds]    dataset name
0  0.734396  0.042808  ...              3.269872  arrhythmia.csv

[1 rows x 9 columns]
mfeat-karhunen.csv
   Accuracy       FPR  ...  train time [seconds]        dataset name
0     0.947  0.005889  ...              6.438433  mfeat-karhunen.csv

[1 rows x 9 columns]
plant-margin.csv
   Accuracy       FPR  ...  train time [seconds]      dataset name
0     0.725  0.002778  ...             19.051762  plant-margin.csv

[1 rows x 9 columns]
steel-plates.csv
   Accuracy     FPR  ...  train time [seconds]      dataset name
0  0.782572  0.0422  ...              1.484479  steel-plates.csv

[1 rows x 9 columns]
bank.csv
   Accuracy       FPR  ...  train time [seconds]  dataset name
0  0.896043  0.355832  ...              0.444109      bank.csv

[1 rows x 9 columns]
molec-biol-splice.csv
   Accuracy      FPR  ...  train time [seconds]           dataset name
0  0.958621  0.02023  ...               1.10307  molec-biol-splice.csv

[1 ro



   Accuracy       FPR  ...  train time [seconds]       dataset name
0  0.887072  0.023006  ...              1.498521  low-res-spect.csv

[1 rows x 9 columns]
ozone.csv
   Accuracy       FPR  ...  train time [seconds]  dataset name
0  0.968848  0.450163  ...              1.130084     ozone.csv

[1 rows x 9 columns]
spambase.csv
   Accuracy       FPR  ...  train time [seconds]  dataset name
0  0.945451  0.059572  ...              1.270764  spambase.csv

[1 rows x 9 columns]
wine-quality-white.csv




   Accuracy      FPR  ...  train time [seconds]            dataset name
0  0.579018  0.10711  ...              1.237157  wine-quality-white.csv

[1 rows x 9 columns]


In [None]:
#file_path=files_list[3]
file_path="bank.csv"
data = pd.read_csv(
      "/content/gdrive/MyDrive/Studying/Computational_Learning/HW4/Data/"+file_path)
class_col=data.columns[-1] 
X=data.drop(columns=[class_col])
y=pd.Series(LabelEncoder().fit_transform(data[class_col]))
num_features=len(X.columns.values)
num_classes=y.nunique()
x_train, x_test,y_train,y_test=train_test_split(X,y)


model=xgboost.XGBClassifier(**{"num_class":num_classes, "objective":'multi:softprob'})#.set_params()
model.fit(x_train, y_train)
pred_start=time.time()
pred=model.predict(x_test)
pred_end=time.time()
pred_prob=model.predict_proba(x_test)

In [None]:
pred=model.predict(x_test,strict_shape=True)


TypeError: ignored