In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, \
precision_score

# 1. Some subroutines

In [None]:
def get_banking(path='/Users/nengkuantu/Downloads/new_train.csv'):
    """Load the banking dataset and return standardized features and 0/1 labels.

    Data source:
    https://www.kaggle.com/code/rashmiranu/banking-dataset-eda-and-binary-classification/notebook

    Parameters
    ----------
    path : str or path-like, optional
        Location of the training CSV. The default is the location this
        notebook was originally written against; pass your own path to run
        it on another machine.

    Returns
    -------
    X : numpy.ndarray
        The columns 'age', 'duration', 'campaign', 'pdays' and 'previous'
        (in that order) after StandardScaler.fit_transform.
    y : numpy.ndarray
        0 where the 'y' column equals "no", 1 for every other value.
    """
    from sklearn.preprocessing import StandardScaler

    feature_columns = ['age', 'duration', 'campaign', 'pdays', 'previous']
    raw = pd.read_csv(path)

    # Derive features and labels straight from the raw frame. Assigning into
    # a column subset of it (as an earlier version did) triggers pandas'
    # SettingWithCopyWarning and relies on in-place mutation.
    # NOTE(review): the scaler is fitted on the full dataset, i.e. before any
    # train/test split, so test rows influence the scaling statistics.
    X = StandardScaler().fit_transform(raw[feature_columns].values)
    y = raw['y'].map(lambda value: 0 if value == "no" else 1).values
    return X, y

In [None]:
def get_bcw(path='~/Downloads/wdbc.data') :
    """Load the breast-cancer (wdbc) data and return standardized features and 0/1 labels.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the header-less `wdbc.data` CSV. Defaults to the
        location this notebook was originally written against.

    Returns
    -------
    X : numpy.ndarray
        Every column except 'id' and 'malignant', after
        StandardScaler.fit_transform.
    y : numpy.ndarray
        0 where the diagnosis column equals "B", 1 for every other value.
    """
    from sklearn.preprocessing import StandardScaler

    # NOTE(review): these labels interleave mean/se/worst per measurement.
    # Presumably the raw file stores all means first, then all standard
    # errors, then all worst values -- verify against the dataset's
    # documentation before selecting columns by name. The returned arrays do
    # not depend on the labels, because every feature column is used.
    column_names = ['id','malignant',
                'nucleus_mean','nucleus_se','nucleus_worst',
                'texture_mean','texture_se','texture_worst',
                'perimeter_mean','perimeter_se','perimeter_worst',
                'area_mean','area_se','area_worst',
                'smoothness_mean','smoothness_se','smoothness_worst',
                'compactness_mean','compactness_se','compactness_worst',
                'concavity_mean','concavity_se','concavity_worst',
                'concave_pts_mean','concave_pts_se','concave_pts_worst',
                'symmetry_mean','symmetry_se','symmetry_worst',
                'fractal_dim_mean','fractal_dim_se','fractal_dim_worst']

    bcw = pd.read_csv(path, header=None)
    bcw.columns = column_names

    # Labels: "B" -> 0, anything else -> 1.
    y = bcw['malignant'].map(lambda x: 0 if x == "B" else 1).values

    # Features: everything except the identifier and the label, standardized.
    # NOTE(review): the scaler is fitted before any train/test split.
    X = bcw.drop(columns=['id', 'malignant']).values
    X = StandardScaler().fit_transform(X)

    return X, y

In [None]:
def evaluate(pred, expect) :
    """Score 0/1 predictions against the expected 0/1 labels.

    Parameters
    ----------
    pred : array-like
        Predicted labels. Lists, numpy arrays, pandas Series and (n, 1)
        column vectors are all accepted.
    expect : array-like
        Expected labels, same number of elements as `pred`.

    Returns
    -------
    accuracy : float
        1 - n_errors / n, rounded to 3 decimals.
    n_errors : int
        Sum of |pred - expect|; for 0/1 labels this is the number of
        misclassified samples.
    error_sum : int
        Sum of (pred - expect): false positives minus false negatives for
        0/1 labels, so a negative value means the model under-predicts 1.

    Raises
    ------
    ValueError
        If `pred` and `expect` do not contain the same number of elements.
    """
    # Compare purely by position: plain lists cannot be subtracted, two
    # pandas Series would be aligned on their index, and an (n, 1) array
    # against an (n,) array would silently broadcast to an (n, n) matrix.
    pred = np.asarray(pred).ravel()
    expect = np.asarray(expect).ravel()
    if pred.shape != expect.shape:
        raise ValueError(
            "pred and expect must have the same number of elements, got "
            "%d and %d" % (pred.shape[0], expect.shape[0])
        )

    diff = pred - expect
    error_sum = diff.sum()
    n_errors = np.abs(diff).sum()
    accuracy = 1 - n_errors / expect.shape[0]

    return round(accuracy, 3) , n_errors, error_sum

In [None]:
def evaluateConfusion(expect, pred) :
    """Return (f1, accuracy, recall, precision, confusion matrix) for the predictions.

    Every entry is produced by the sklearn.metrics function of the matching
    name, called with the expected labels first and the predictions second.
    """
    metric_functions = (
        f1_score,
        accuracy_score,
        recall_score,
        precision_score,
        confusion_matrix,
    )
    return tuple(metric(expect, pred) for metric in metric_functions)

# 2. get dataset

In [None]:
# Load the banking dataset: X is the standardized feature matrix, y the 0/1 label vector.
X, y = get_banking()   # X: feature vector, y label

# 3. Split Data into Training and Testing
--------
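Run either 3.1 or 3.2 (not both) to produce the train/test split you want to train on; whichever cell runs last overwrites `X_train`, `X_test`, `y_train` and `y_test`.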

# 3.1 Original Data Split

In [None]:
from sklearn.model_selection import train_test_split

# Sanity check: shapes and container types of the features and labels.
print(X.shape, y.shape, type(X), type(y), sep='\n')

# NumPy arrays carry no column names, so rows and columns are picked with
# slices of the form [row_start:row_end:row_step, col_start:col_end:col_step].
# The end of each slice is exclusive.
# (How the printed output is formatted is covered in another lecture.)
print(X[:6:2])     # every second row among the first six
print(y[:40:4])    # every fourth label among the first forty

# Hold out 25% of the samples for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2018, test_size=0.25
)


(32950, 5)
(32950,)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[[ 0.86373877 -0.12019627  0.52298128  0.19658384 -0.35012691]
 [ 3.65126795  3.43617293 -0.56702251  0.19658384 -0.35012691]
 [ 1.82495573  0.42426417 -0.20368791  0.19658384 -0.35012691]]
[0 0 0 0 0 0 1 0 0 0]


# 3.2 Data Split to Get Imbalanced Labels in Training Data

In [None]:
def GetImbalancedLabelData(DF_orig, label_ratio, label_col='malignant',
                           test_size=0.25, random_state=2018) :
    """Split a labelled DataFrame so the training set has a chosen 1-to-0 label ratio.

    The rows labelled 1 and the rows labelled 0 are split separately and the
    pieces are concatenated (label-0 rows first) into the train and test sets.

    Parameters
    ----------
    DF_orig : pandas.DataFrame
        Data containing the feature columns plus `label_col`. It is not modified.
    label_ratio : float
        Fraction of the label-1 rows that goes into the training set; must be
        strictly between 0 and 1. When the label-1 class is no larger than
        (1 - test_size) times the label-0 class, the training set keeps as
        many label-0 rows as there are label-1 rows in total, so the
        training 1-to-0 ratio is approximately `label_ratio`.
    label_col : str, optional
        Name of the 0/1 label column. Defaults to 'malignant'.
    test_size : float, optional
        Intended minimum test fraction for the label-0 rows. The actual
        fraction is larger when label-0 rows must be moved to the test set to
        cap the number of label-0 training rows.
    random_state : int or None, optional
        Seed forwarded to both train_test_split calls.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        Feature matrices (all columns except `label_col`) and label vectors.

    Raises
    ------
    ValueError
        If `label_ratio` is not strictly between 0 and 1, or if either label
        class has no rows.
    """
    from sklearn.model_selection import train_test_split

    if not 0 < label_ratio < 1:
        raise ValueError("label_ratio must be strictly between 0 and 1, got %r" % (label_ratio,))

    # Boolean-mask selection already yields new frames, so the caller's
    # DataFrame does not need to be copied first.
    positives = DF_orig[DF_orig[label_col] == 1]
    negatives = DF_orig[DF_orig[label_col] == 0]
    n_pos, n_neg = positives.shape[0], negatives.shape[0]
    if n_pos == 0 or n_neg == 0:
        raise ValueError("both label values 0 and 1 must be present in %r" % (label_col,))

    # Simply splitting the label-0 rows with `test_size` cannot give the
    # requested label ratio in the training set. Instead cap the number of
    # label-0 training rows at the size of the label-1 class (and at the
    # usual (1 - test_size) share) and send the remainder to the test set.
    max_train_neg = min(n_neg, n_pos, n_neg * (1 - test_size))
    neg_test_fraction = 1 - max_train_neg / n_neg

    neg_train, neg_test = train_test_split(
        negatives, test_size=neg_test_fraction, random_state=random_state)
    pos_train, pos_test = train_test_split(
        positives, test_size=(1 - label_ratio), random_state=random_state)

    DF_train = pd.concat((neg_train, pos_train))
    DF_test = pd.concat((neg_test, pos_test))

    y_train = DF_train[label_col].values
    X_train = DF_train.drop(columns=[label_col]).values
    y_test = DF_test[label_col].values
    X_test = DF_test.drop(columns=[label_col]).values

    return X_train, X_test, y_train, y_test

In [None]:
# GetImbalancedLabelData needs a DataFrame with a 'malignant' column, but
# get_bcw() keeps its frame local and returns only arrays. Rebuild the frame
# here so this cell runs on a fresh kernel instead of failing on an undefined
# `bcw`. Building it from get_bcw()'s output also keeps the 'id' column out of
# the features and reuses the standardized values.
X_bcw, y_bcw = get_bcw()
bcw = pd.DataFrame(X_bcw).assign(malignant=y_bcw)

X_train, X_test, y_train, y_test = GetImbalancedLabelData(bcw, 0.8)

# Number of label-1 rows and set sizes, then the 1-to-0 ratio in the training set.
print(y_train.sum(), y_train.shape,  y_test.sum(), y_test.shape,)
print(y_train.sum()/( y_train.shape[0] - y_train.sum() )  )

# 4. Collect results with various label ratios

In [None]:
def RunAll(X_train, X_test, y_train, y_test, default = 1, random_state = 2018) :
    """Fit five classifiers and score each one with `evaluate`.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test 0/1 labels.
    default : int or bool, optional
        Truthy: every model uses its default class_weight (uniform weights
        for KNN). Falsy: class_weight='balanced' (weights='distance' for KNN).
    random_state : int or None, optional
        Seed for the decision tree and the random forest, the two models
        built here that take a seed, so that repeated runs give the same
        numbers. Pass None for unseeded behaviour.

    Returns
    -------
    list of list
        One row per model, in the order DT, RF, KNN, SVM, LogR:
        [model name, default, accuracy, n_errors, error_sum].
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # None and 'uniform' are the estimators' own defaults, so the truthy
    # branch is the same as constructing them without these arguments.
    class_weight = None if default else 'balanced'
    knn_weights = 'uniform' if default else 'distance'

    models = [
        ('DT', DecisionTreeClassifier(class_weight=class_weight,
                                      random_state=random_state)),
        ('RF', RandomForestClassifier(n_estimators=50, class_weight=class_weight,
                                      random_state=random_state)),
        ('KNN', KNeighborsClassifier(weights=knn_weights)),
        ('SVM', SVC(gamma='scale', class_weight=class_weight)),
        ('LogR', LogisticRegression(solver='lbfgs', class_weight=class_weight)),
    ]

    results = []
    for name, model in models:
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        results.append([name, default] + list(evaluate(pred, y_test)))

    return results


In [None]:
# Baseline run: every model with its default class_weight (uniform weights for KNN).
Default = RunAll(X_train, X_test, y_train, y_test, default = 1)

In [None]:
# Raw result rows: [model, default flag, accuracy, n_errors, error_sum].
Default

[['DT', 1, 0.869, 1076, -14],
 ['RF', 1, 0.893, 878, -240],
 ['KNN', 1, 0.901, 819, -329],
 ['SVM', 1, 0.907, 764, -478],
 ['LogR', 1, 0.908, 754, -472]]

In [None]:
# Tabulate the baseline rows. 'weight' holds the `default` flag and 'bias' the
# signed sum of (pred - expect): negative means label 1 is under-predicted.
DefaultDF = pd.DataFrame(Default, columns = ['model', 'weight', 'accuracy', 'N_erros', 'bias'])
DefaultDF 

Unnamed: 0,model,weight,accuracy,N_erros,bias
0,DT,1,0.869,1076,-14
1,RF,1,0.893,878,-240
2,KNN,1,0.901,819,-329
3,SVM,1,0.907,764,-478
4,LogR,1,0.908,754,-472


In [None]:
# Same experiment with class_weight='balanced' (weights='distance' for KNN),
# tabulated with the same columns as the baseline for side-by-side comparison.
Weight = RunAll(X_train, X_test, y_train, y_test, default = 0)
WeightDF =  pd.DataFrame(Weight, columns = ['model', 'weight', 'accuracy', 'N_erros', 'bias'])
WeightDF

Unnamed: 0,model,weight,accuracy,N_erros,bias
0,DT,0,0.865,1112,8
1,RF,0,0.89,910,-234
2,KNN,0,0.889,915,-205
3,SVM,0,0.829,1412,994
4,LogR,0,0.842,1305,775


## 5. Confusion Matrix
-----
![image.png](attachment:image.png)


In [None]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, \
precision_score

In [None]:
def RunAllConfusion(X_train, X_test, y_train, y_test, default = 1, random_state = 2018) :
    """Fit five classifiers and score each one with `evaluateConfusion`.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test 0/1 labels.
    default : int or bool, optional
        Truthy: every model uses its default class_weight (uniform weights
        for KNN). Falsy: class_weight='balanced' (weights='distance' for KNN).
    random_state : int or None, optional
        Seed for the decision tree and the random forest, the two models
        built here that take a seed, so that repeated runs give the same
        numbers. Pass None for unseeded behaviour.

    Returns
    -------
    list of list
        One row per model, in the order DT, RF, KNN, SVM, LogR:
        [model name, (f1, accuracy, recall, precision, confusion matrix)].
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # None and 'uniform' are the estimators' own defaults, so the truthy
    # branch is the same as constructing them without these arguments.
    class_weight = None if default else 'balanced'
    knn_weights = 'uniform' if default else 'distance'

    models = [
        ('DT', DecisionTreeClassifier(class_weight=class_weight,
                                      random_state=random_state)),
        ('RF', RandomForestClassifier(n_estimators=50, class_weight=class_weight,
                                      random_state=random_state)),
        ('KNN', KNeighborsClassifier(weights=knn_weights)),
        ('SVM', SVC(gamma='scale', class_weight=class_weight)),
        ('LogR', LogisticRegression(solver='lbfgs', class_weight=class_weight)),
    ]

    results = []
    for name, model in models:
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        results.append([name, evaluateConfusion(y_test, pred)])

    return results


In [None]:
# Baseline run with the richer metrics. Note this rebinds `Default`, which
# previously held the RunAll rows; each row is now
# [model, (f1, accuracy, recall, precision, confusion matrix)].
Default = RunAllConfusion(X_train, X_test, y_train, y_test, default = 1)
Default


[['DT',
  (0.3982251802551304,
   0.8682932750667638,
   0.3949394939493949,
   0.4015659955257271,
   array([[6794,  535],
          [ 550,  359]]))],
 ['RF',
  (0.44529262086513993,
   0.8941490653071134,
   0.385038503850385,
   0.5279034690799397,
   array([[7016,  313],
          [ 559,  350]]))],
 ['KNN',
  (0.44996642041638685,
   0.9005826656955572,
   0.36853685368536854,
   0.5775862068965517,
   array([[7084,  245],
          [ 574,  335]]))],
 ['SVM',
  (0.42985074626865666,
   0.9072590434571498,
   0.31683168316831684,
   0.6682134570765661,
   array([[7186,  143],
          [ 621,  288]]))],
 ['LogR',
  (0.4398216939078752,
   0.9084729303228939,
   0.3256325632563256,
   0.6773455377574371,
   array([[7188,  141],
          [ 613,  296]]))]]

In [None]:
# Hand-check the baseline LogR accuracy from its confusion matrix
# [[7188, 141], [613, 296]]: (diagonal entries) / (all entries).
print((7188+296)/(7188+296+141+613))  # accuracy

0.9084729303228939


In [None]:
# Precision for the same matrix: correct label-1 predictions divided by all
# label-1 predictions (second column: 141 + 296).
print(296/(296+141))  # precision

0.6773455377574371


In [None]:
# Recall for the same matrix: correct label-1 predictions divided by all
# true label-1 samples (second row: 613 + 296).
print(296/ (296 + 613) )  # recall or sensitivity

0.3256325632563256


In [None]:
# F1 is the harmonic mean of recall and precision: 2 * r * p / (r + p),
# using the two values computed by hand above.
2*(0.3256325632563256*0.6773455377574371)/(0.3256325632563256+0.6773455377574371)  # f1

0.4398216939078752

In [None]:
# Balanced-weight run with the richer metrics (rebinds `Weight`).
Weight = RunAllConfusion(X_train, X_test, y_train, y_test, default = 0)
Weight
# The five-column table below is left disabled -- presumably because
# RunAllConfusion rows hold two items (name, metrics tuple), so these column
# names would not fit.
# WeightDF =  pd.DataFrame(Weight, columns = ['model', 'weight', 'accuracy', 'N_erros', 'bias'])
# WeightDF

[['DT',
  (0.39113573407202223,
   0.8665938334547221,
   0.38833883388338836,
   0.3939732142857143,
   array([[6786,  543],
          [ 556,  353]]))],
 ['RF',
  (0.4249363867684478,
   0.8902646273367322,
   0.36743674367436746,
   0.5037707390648567,
   array([[7000,  329],
          [ 575,  334]]))],
 ['KNN',
  (0.43273403595784254,
   0.8889293517844137,
   0.38393839383938394,
   0.49573863636363635,
   array([[6974,  355],
          [ 560,  349]]))],
 ['SVM',
  (0.4978662873399715,
   0.8285991745569313,
   0.77007700770077,
   0.3678402522333158,
   array([[6126, 1203],
          [ 209,  700]]))],
 ['LogR',
  (0.49672194369456224,
   0.8415877640203933,
   0.7084708470847084,
   0.3824228028503563,
   array([[6289, 1040],
          [ 265,  644]]))]]

In [None]:
# Hand-check the balanced-weight LogR F1 from its recall (0.708...) and
# precision (0.382...): 2 * r * p / (r + p).
2*(0.7084708470847084*0.3824228028503563)/(0.7084708470847084+0.3824228028503563)

0.49672194369456224

In [None]:
# Weight[1:3] is the two-row slice (RF, KNN); index 1 of that slice is the
# KNN row, i.e. the same element as Weight[2].
Weight[1:3][1]

['KNN', array([[6974,  560],
        [ 355,  349]])]

In [None]:
# Load the rows into a DataFrame without column names (columns become 0 and 1)
# and print column 0 (model names) followed by column 1 (the per-model results).
WeightDF  = pd.DataFrame(Weight)
print( WeightDF.iloc[:, 0], '\n', WeightDF.iloc[:, 1])

0      DT
1      RF
2     KNN
3     SVM
4    LogR
Name: 0, dtype: object 
 0     [[6783, 558], [546, 351]]
1     [[6989, 559], [340, 350]]
2     [[6974, 560], [355, 349]]
3    [[6126, 209], [1203, 700]]
4    [[6289, 265], [1040, 644]]
Name: 1, dtype: object
