In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, precision_score

# 1. Some subroutines

In [12]:
def get_banking(non_numerical=0):
    # https://www.kaggle.com/code/rashmiranu/banking-dataset-eda-and-binary-classification/notebook
    mydata = pd.read_csv('~/Downloads/new_train.csv')
    
    mydata['y'] = mydata['y'].map(lambda x: 0 if x == "no" else 1)
    mydata.rename({'y': 'label'}, axis = 1, inplace=True)
    
    if(non_numerical == 1):
        mydata = pd.get_dummies(mydata)
    else : 
        mydata = mydata[['age',  'duration', 'campaign', 'pdays', 'previous', 'label']]   
     
    X = mydata.drop(columns=['label']).values

    from sklearn.preprocessing import StandardScaler
    ss = StandardScaler()
    X = ss.fit_transform(X)
    
    
    y = mydata['label'].values
    return X, y

In [13]:
get_banking(1)

(array([[ 0.86373877, -0.12019627,  0.52298128, ..., -0.34081462,
          0.39944711, -0.18627755],
        [-0.28972159, -0.2167318 , -0.20368791, ...,  2.93414647,
         -2.50346033, -0.18627755],
        [ 3.65126795,  3.43617293, -0.56702251, ..., -0.34081462,
          0.39944711, -0.18627755],
        ...,
        [ 1.34434725, -0.49089273,  0.52298128, ..., -0.34081462,
          0.39944711, -0.18627755],
        [-1.05869515, -0.3596044 , -0.56702251, ..., -0.34081462,
          0.39944711, -0.18627755],
        [-0.48196498,  1.10387435,  0.15964669, ..., -0.34081462,
          0.39944711, -0.18627755]]),
 array([0, 0, 1, ..., 0, 0, 0], dtype=int64))

In [14]:
def evaluate(pred, expect) :
    ans = pred - expect
    error_sum = ans.sum()
    n_errors = abs(ans).sum()
    accuracy = 1 - n_errors / expect.shape[0]
    
    return round(accuracy, 3) , n_errors, error_sum

In [15]:
def evaluateConfusion(expect, pred) :
    
    ConfusionMatrix = confusion_matrix(expect, pred)
    f1 = f1_score(expect, pred)
    accuracy = accuracy_score(expect, pred)
    recall = recall_score(expect, pred)
    precision = precision_score(expect, pred)
    
    return f1, accuracy, recall, precision
    
#     return f1, accuracy, recall, precision, ConfusionMatrix

In [16]:
X, y = get_banking(1) # 0: before tranfer 1: after transfer
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [17]:
# def runSub(default=1):
#     get_datatype(1)   
#     from sklearn.svm import SVC
#     results = []
#     if default :
#         SVM = SVC(gamma='scale')
#     else:
#         SVM = SVC(gamma='scale', class_weight='balanced') # default:None, other: balanced
#     model = SVM
#     model.fit(X_train, y_train)
#     pred = model.predict(X_test)
#     results = results + [['SVM']  + list(evaluateConfusion(y_test, pred))]
#     print(results)
#     resultsDF = pd.DataFrame(results, columns = ['method', 'f1', 'accuracy', 'recall',  'precision'])
#     return resultsDF

In [18]:
# runSub(default=1)

In [19]:
# runSub(default=0)

In [20]:
#Accuracy = (TN+TP)/ALL
#Recall = TP/(TP+FN)
#Precision = TP/(TP+FP)
#F1 Score = 2*(Precision*Recall)/(Precision+Recall)

# 4. Collect results with Accuracy and Bias

In [21]:
def RunAll(X_train, X_test, y_train, y_test, default = 1) :
    
    # default: 1 for default class_weight (or default weights if KNN)
    # default: 0 forclass_weight = balanced (or weights = 'distance' if KNN)
        
    results = []
    
    from sklearn.tree import DecisionTreeClassifier
    if default :
        DT = DecisionTreeClassifier()
    else:
        DT = DecisionTreeClassifier(class_weight='balanced') # default:None, other: balanced
    model = DT
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['DT', default ] + list( evaluate(pred, y_test))]
    

    from sklearn.ensemble import RandomForestClassifier
    if default :
        RF = RandomForestClassifier(n_estimators =50)
    else:
        RF = RandomForestClassifier(n_estimators =50, class_weight='balanced') # default:None, other: balanced    
    model = RF
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['RF', default] + list( evaluate(pred, y_test))]
    
        
    from sklearn.neighbors import KNeighborsClassifier
    if default :
        KNN = KNeighborsClassifier()
    else:
        KNN = KNeighborsClassifier(weights = 'distance') # default:uniform, other: distance    
    model = KNN
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['KNN', default] + list( evaluate(pred, y_test))]
    
        
    from sklearn.svm import SVC
    if default :
        SVM = SVC(gamma='scale')
    else:
        SVM = SVC(gamma='scale', class_weight='balanced') # default:None, other: balanced
    model = SVM
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['SVM', default ] + list( evaluate(pred, y_test))]
    
    from sklearn.linear_model import LogisticRegression
    if default :
        LogR = LogisticRegression(solver='lbfgs')
    else:
        LogR = LogisticRegression(solver='lbfgs', class_weight='balanced') # default:None, other: balanced
    model = LogR
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['LogR', default] + list( evaluate(pred, y_test))]


    return results


In [22]:
Default = RunAll(X_train, X_test, y_train, y_test, default = 1)

In [24]:
DefaultDF = pd.DataFrame(Default, columns = ['model', 'weight', 'accuracy', 'N_erros', 'bias'])
DefaultDF 

Unnamed: 0,model,weight,accuracy,N_erros,bias
0,DT,1,0.877,1016,2
1,RF,1,0.905,782,-414
2,KNN,1,0.892,892,-540
3,SVM,1,0.903,802,-526
4,LogR,1,0.907,763,-453


In [25]:
Weight = RunAll(X_train, X_test, y_train, y_test, default = 0)
WeightDF =  pd.DataFrame(Weight, columns = ['model', 'weight', 'accuracy', 'N_erros', 'bias'])
WeightDF

Unnamed: 0,model,weight,accuracy,N_erros,bias
0,DT,0,0.876,1023,-55
1,RF,0,0.902,808,-508
2,KNN,0,0.893,884,-502
3,SVM,0,0.857,1180,870
4,LogR,0,0.856,1190,860


## 5. Confusion Matrix
-----


In [26]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, \
precision_score

In [27]:
mydict = {1: 'No_Weight', 0: 'Weight'}
mydict[0]

'Weight'

In [28]:
def RunAllConfusion(X_train, X_test, y_train, y_test, Feature, default = 1) :
    
    # default: 1 for default class_weight (or default weights if KNN)
    # default: 0 forclass_weight = balanced (or weights = 'distance' if KNN)
    
    mydict = {1: 'No_Weight', 0: 'Weight'}
        
    results = []
    
    from sklearn.tree import DecisionTreeClassifier
    if default :
        DT = DecisionTreeClassifier()
    else:
        DT = DecisionTreeClassifier(class_weight='balanced') # default:None, other: balanced
    model = DT
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['DT' ] + [mydict[default], Feature ] + list(evaluateConfusion(y_test, pred))]
    

    from sklearn.ensemble import RandomForestClassifier
    if default :
        RF = RandomForestClassifier(n_estimators =50)
    else:
        RF = RandomForestClassifier(n_estimators =50, class_weight='balanced') # default:None, other: balanced    
    model = RF
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['RF']  + [mydict[default], Feature ] + list(evaluateConfusion(y_test, pred))]
    
        
    from sklearn.neighbors import KNeighborsClassifier
    if default :
        KNN = KNeighborsClassifier()
    else:
        KNN = KNeighborsClassifier(weights = 'distance') # default:uniform, other: distance    
    model = KNN
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['KNN']  + [mydict[default], Feature ] + list(evaluateConfusion(y_test, pred))]
    
        
    from sklearn.svm import SVC
    if default :
        SVM = SVC(gamma='scale')
    else:
        SVM = SVC(gamma='scale', class_weight='balanced') # default:None, other: balanced
    model = SVM
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['SVM']  + [mydict[default], Feature ] + list(evaluateConfusion(y_test, pred))]
    
    from sklearn.linear_model import LogisticRegression
    if default :
        LogR = LogisticRegression(solver='lbfgs')
    else:
        LogR = LogisticRegression(solver='lbfgs', class_weight='balanced') # default:None, other: balanced
    model = LogR
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['LogR']  + [mydict[default], Feature ] + list(evaluateConfusion(y_test, pred))]


    return results


In [29]:
X, y = get_banking(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

DefaultPF = RunAllConfusion(X_train, X_test, y_train, y_test, 'PartialFeature' , default = 1)
WeightPF = RunAllConfusion(X_train, X_test, y_train, y_test , 'PartialFeature', default = 0)


X, y = get_banking(1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25) 

DefaultFF = RunAllConfusion(X_train, X_test, y_train, y_test , 'FullFeature', default = 1)
WeightFF = RunAllConfusion(X_train, X_test, y_train, y_test, 'FullFeature', default = 0)


In [30]:
ResultsDF = pd.concat([pd.DataFrame(WeightFF), pd.DataFrame(WeightPF),  \
           pd.DataFrame(DefaultFF), pd.DataFrame(DefaultPF)])
ResultsDF.columns = ['model', 'ClassWeight', 'FullOrPartFeatures', 'F1', 'Accuracy', 'Recall', 'Precision']
ResultsDF

Unnamed: 0,model,ClassWeight,FullOrPartFeatures,F1,Accuracy,Recall,Precision
0,DT,Weight,FullFeature,0.432827,0.877519,0.441008,0.424945
1,RF,Weight,FullFeature,0.417417,0.905802,0.318442,0.605664
2,KNN,Weight,FullFeature,0.37936,0.896334,0.298969,0.518887
3,SVM,Weight,FullFeature,0.537514,0.849599,0.824742,0.398671
4,LogR,Weight,FullFeature,0.52823,0.848871,0.798396,0.394677
0,DT,Weight,PartialFeature,0.376623,0.86016,0.366316,0.387528
1,RF,Weight,PartialFeature,0.45079,0.890265,0.390526,0.533046
2,KNN,Weight,PartialFeature,0.448499,0.886259,0.401053,0.508678
3,SVM,Weight,PartialFeature,0.511547,0.830541,0.769474,0.383124
4,LogR,Weight,PartialFeature,0.503759,0.839767,0.705263,0.391813


In [31]:
ResultsDF.sort_values('Recall', ascending = False)

Unnamed: 0,model,ClassWeight,FullOrPartFeatures,F1,Accuracy,Recall,Precision
3,SVM,Weight,FullFeature,0.537514,0.849599,0.824742,0.398671
4,LogR,Weight,FullFeature,0.52823,0.848871,0.798396,0.394677
3,SVM,Weight,PartialFeature,0.511547,0.830541,0.769474,0.383124
4,LogR,Weight,PartialFeature,0.503759,0.839767,0.705263,0.391813
0,DT,No_Weight,FullFeature,0.457766,0.879218,0.4811,0.43659
0,DT,Weight,FullFeature,0.432827,0.877519,0.441008,0.424945
2,KNN,Weight,PartialFeature,0.448499,0.886259,0.401053,0.508678
1,RF,No_Weight,PartialFeature,0.455927,0.891357,0.394737,0.539568
1,RF,No_Weight,FullFeature,0.481119,0.90993,0.394044,0.617594
1,RF,Weight,PartialFeature,0.45079,0.890265,0.390526,0.533046


In [None]:
DefaultPF[4]

In [None]:
['LogR', 'Weight', 'FF',
 0.42351168048229093,
  0.9071376547705754,
  0.3118756936736959,
  0.6596244131455399]

In [None]:
DefaultFF[4]

In [None]:
WeightPF[4]

In [None]:
WeightFF[4]

In [None]:
print((7188+296)/(7188+296+141+613))  # accuracy

In [None]:
print(296/(296+141))  # precision

In [None]:
print(296/ (296 + 613) )  # recall or sensitivity

In [None]:
2*(0.3256325632563256*0.6773455377574371)/(0.3256325632563256+0.6773455377574371)  # f1

In [None]:
Weight = RunAllConfusion(X_train, X_test, y_train, y_test, FullFeature, default = 0)
Weight
# WeightDF =  pd.DataFrame(Weight, columns = ['model', 'weight', 'accuracy', 'N_erros', 'bias'])
# WeightDF

In [None]:
2*(0.7084708470847084*0.3824228028503563)/(0.7084708470847084+0.3824228028503563)

In [None]:
Weight[1:3][1]

In [None]:
WeightDF  = pd.DataFrame(Weight)
print( WeightDF.iloc[:, 0], '\n', WeightDF.iloc[:, 1])

In [None]:
MarkerSizeScale = 20
Markersizearray =np.array([1, 2, 4, 8, 16, 32])
Y_Markersizearray = list(range(40, 24, -3))
for i in ['DT', 'RF', 'KNN', 'SVM', 'LogR' ]:
    plt.figure(figsize=(16,9))
    
    R1 = DefaultDF[DefaultDF.Model == i]
    plt.scatter(R1.LabelRatio, R1.Bias.abs(), s=R1.N_errors*MarkerSizeScale, alpha=0.5)
    
    R0 = WeightDF[WeightDF.Model == i]    
    plt.scatter(R0.LabelRatio, R0.Bias.abs(), s=R0.N_errors*MarkerSizeScale, alpha=0.5)
    
    plt.legend(['class weight = None', 'class weight =  balanced'], fontsize=16)
    
    
    plt.title('model ' + i + ' :The circle size indicates the error count.',  fontsize=30)
    plt.xlabel('Label Ratio.  Well balanced ratio is equal to 1.', fontsize=24)
    plt.ylabel('bias.  0 means no bias', fontsize=24)
    
    plt.show()
