In [1]:
import numpy as np
import pandas as pd

In [2]:
def get_bcw():
    bcw = pd.read_csv('~/Downloads/Dataset/diabetes.csv' , header=None)

    column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
    bcw.columns = column_names

    X = bcw.drop(columns=['Outcome']).values

    from sklearn.preprocessing import StandardScaler
    ss = StandardScaler().fit(X)
    X = ss.fit_transform(X)

    y = bcw['Outcome'].values

    print(bcw)
    return X,y

In [14]:
def evaluate(pred, expect) :
    ans = pred - expect
    error_sum = ans.sum()
    n_errors = abs(ans).sum()
    accuracy = 1 - n_errors / expect.shape[0]
    
    return round(accuracy, 3) , n_errors, error_sum

In [3]:
X, y = get_bcw()

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  

In [4]:
y.sum()

268

In [5]:
y.shape

(768,)

In [6]:
X_orig = X
y_orig = y

bcw = pd.DataFrame(X)
column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                    'BMI', 'DiabetesPedigreeFunction', 'Age']
bcw.columns = column_names
bcw['Outcome'] = y

In [7]:
bcw.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [8]:
cols = bcw.columns
['Outcome'] + list(cols[:-1])

['Outcome',
 'Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [9]:
from sklearn.model_selection import train_test_split

# numpy array does not have column names.  
# We can only use the slice [row_start:row_end:step_row, col_start:col_end:step_col]
# to select rows and cols we want
# be aware that row_end and col_end are not inclusive
# the format of the output will be discussed in another lecture.
print(X[0:6:2])
print(y[0:40:4])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=2018) 

[[ 0.63994726  0.84832379  0.14964075  0.90726993 -0.69289057  0.20401277
   0.46849198  1.4259954 ]
 [ 1.23388019  1.94372388 -0.26394125 -1.28821221 -0.69289057 -1.10325546
   0.60439732 -0.10558415]
 [-1.14185152  0.5040552  -1.50468724  0.90726993  0.76583594  1.4097456
   5.4849091  -0.0204964 ]]
[1 1 1 0 1 0 1 0 0 0]


In [36]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

model.fit(X_train, y_train)
pred = model.predict(X_test)

print(evaluate(pred, y_test))

(0.636, 104, 66)


In [12]:
def GetImbalancedLabelData(DF_orig, label_ratio) :
    import copy as copy
    from sklearn.model_selection import train_test_split
    DF = copy.copy(DF_orig)
    DF1 = DF[DF.Outcome == 1]
    DF0 = DF[DF.Outcome == 0]
    
    Test_Size= 0.25  # intended test size.  The actual size depend on how the train data is arranged.
    # Simple implementation : 
    #      split both DF0 with Test_Size and DF1 with label_ratio 
    #      This method can not produce the right label_ratio for training set. 
#     TestSizeFrom0 =  Test_Size
    # instead , the following 2 statement, can split the training data in the precise label_ratio.
    max_train0_size = min(DF0.shape[0], DF1.shape[0])  
    TestSizeFrom0 = 1 -  min(max_train0_size, DF0.shape[0] * (1- Test_Size) ) / DF0.shape[0]
    
    
    DF0_train, DF0_test = train_test_split(DF0, test_size = TestSizeFrom0, random_state=2018)
    DF1_train, DF1_test = train_test_split(DF1, test_size =  (1 - label_ratio), random_state=2018)
    
    DF_train = pd.concat((DF0_train, DF1_train) )
    DF_test = pd.concat((DF0_test, DF1_test) )
    
    y_train = DF_train.Outcome.values
    X_train = DF_train.drop(columns=[ 'Outcome']).values
    y_test = DF_test.Outcome.values
    X_test = DF_test.drop(columns=[ 'Outcome']).values

    return X_train, X_test, y_train, y_test

In [13]:
X_train, X_test, y_train, y_test = GetImbalancedLabelData(bcw, 0.8)
print(y_train.sum(), y_train.shape,  y_test.sum(), y_test.shape,)
print(y_train.sum()/( y_train.shape[0] - y_train.sum() )  )

214 (482,) 54 (286,)
0.7985074626865671


In [15]:
def RunAll(bcw, label_ratio, default = 1) :
    
    # default: 1 for default class_weight (or default weights if KNN)
    # default: 0 forclass_weight = balanced (or weights = 'distance' if KNN)
    
    X_train, X_test, y_train, y_test = GetImbalancedLabelData(bcw, label_ratio)    
    LabelRatio = (y_train.sum() / ( y_train.shape[0] - y_train.sum()))
    
    results = []
    
    from sklearn.tree import DecisionTreeClassifier
    if default :
        DT = DecisionTreeClassifier()
    else:
        DT = DecisionTreeClassifier(class_weight='balanced') # default:None, other: balanced
    model = DT
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['DT', default, round(LabelRatio, 2) ] + list( evaluate(pred, y_test))]
    

    from sklearn.ensemble import RandomForestClassifier
    if default :
        RF = RandomForestClassifier(n_estimators =50)
    else:
        RF = RandomForestClassifier(n_estimators =50, class_weight='balanced') # default:None, other: balanced    
    model = RF
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['RF', default, round(LabelRatio, 2) ] + list( evaluate(pred, y_test))]
    
        
    from sklearn.neighbors import KNeighborsClassifier
    if default :
        KNN = KNeighborsClassifier()
    else:
        KNN = KNeighborsClassifier(weights = 'distance') # default:uniform, other: distance    
#     KNN = KNeighborsClassifier()
    model = KNN
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['KNN', default, round(LabelRatio, 2) ] + list( evaluate(pred, y_test))]
    
        
    from sklearn.svm import SVC
    if default :
        SVM = SVC(gamma='scale')
    else:
        SVM = SVC(gamma='scale', class_weight='balanced') # default:None, other: balanced
    model = SVM
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results = results + [['SVM', default, round(LabelRatio, 2) ] + list( evaluate(pred, y_test))]
    
    
    return results


In [16]:
RunAll(bcw, 0.8, default = 1)

[['DT', 1, 0.8, 0.629, 106, 62],
 ['RF', 1, 0.8, 0.724, 79, 51],
 ['KNN', 1, 0.8, 0.748, 72, 34],
 ['SVM', 1, 0.8, 0.741, 74, 38]]

In [17]:
Default = []
for i in range(10, 100) :
    labelratio = 0.01 * i  # i: 10--99, ---> 0.1 --0.99
    Default = Default + RunAll(bcw, labelratio, 1)
    
Default 

[['DT', 1, 0.1, 0.582, 198, -170],
 ['RF', 1, 0.1, 0.511, 232, -220],
 ['KNN', 1, 0.1, 0.517, 229, -225],
 ['SVM', 1, 0.1, 0.506, 234, -234],
 ['DT', 1, 0.11, 0.563, 206, -160],
 ['RF', 1, 0.11, 0.531, 221, -215],
 ['KNN', 1, 0.11, 0.524, 224, -220],
 ['SVM', 1, 0.11, 0.518, 227, -227],
 ['DT', 1, 0.12, 0.598, 188, -134],
 ['RF', 1, 0.12, 0.543, 214, -204],
 ['KNN', 1, 0.12, 0.526, 222, -212],
 ['SVM', 1, 0.12, 0.519, 225, -223],
 ['DT', 1, 0.13, 0.599, 187, -137],
 ['RF', 1, 0.13, 0.545, 212, -204],
 ['KNN', 1, 0.13, 0.528, 220, -206],
 ['SVM', 1, 0.13, 0.526, 221, -217],
 ['DT', 1, 0.14, 0.661, 157, -109],
 ['RF', 1, 0.14, 0.559, 204, -186],
 ['KNN', 1, 0.14, 0.536, 215, -201],
 ['SVM', 1, 0.14, 0.533, 216, -210],
 ['DT', 1, 0.15, 0.628, 171, -109],
 ['RF', 1, 0.15, 0.576, 195, -179],
 ['KNN', 1, 0.15, 0.557, 204, -188],
 ['SVM', 1, 0.15, 0.546, 209, -203],
 ['DT', 1, 0.16, 0.611, 178, -114],
 ['RF', 1, 0.16, 0.59, 188, -174],
 ['KNN', 1, 0.16, 0.561, 201, -181],
 ['SVM', 1, 0.16, 0.

In [18]:
Weight = []
for i in range(10, 100) :
    labelratio = 0.01 * i  # i: 10--99, ---> 0.1 --0.99
    Weight = Weight + RunAll(bcw, labelratio, 1)
    
Weight

[['DT', 1, 0.1, 0.582, 198, -172],
 ['RF', 1, 0.1, 0.515, 230, -218],
 ['KNN', 1, 0.1, 0.517, 229, -225],
 ['SVM', 1, 0.1, 0.506, 234, -234],
 ['DT', 1, 0.11, 0.575, 200, -158],
 ['RF', 1, 0.11, 0.529, 222, -208],
 ['KNN', 1, 0.11, 0.524, 224, -220],
 ['SVM', 1, 0.11, 0.518, 227, -227],
 ['DT', 1, 0.12, 0.59, 192, -144],
 ['RF', 1, 0.12, 0.53, 220, -210],
 ['KNN', 1, 0.12, 0.526, 222, -212],
 ['SVM', 1, 0.12, 0.519, 225, -223],
 ['DT', 1, 0.13, 0.597, 188, -130],
 ['RF', 1, 0.13, 0.543, 213, -203],
 ['KNN', 1, 0.13, 0.528, 220, -206],
 ['SVM', 1, 0.13, 0.526, 221, -217],
 ['DT', 1, 0.14, 0.641, 166, -108],
 ['RF', 1, 0.14, 0.568, 200, -188],
 ['KNN', 1, 0.14, 0.536, 215, -201],
 ['SVM', 1, 0.14, 0.533, 216, -210],
 ['DT', 1, 0.15, 0.62, 175, -115],
 ['RF', 1, 0.15, 0.57, 198, -178],
 ['KNN', 1, 0.15, 0.557, 204, -188],
 ['SVM', 1, 0.15, 0.546, 209, -203],
 ['DT', 1, 0.16, 0.629, 170, -106],
 ['RF', 1, 0.16, 0.583, 191, -175],
 ['KNN', 1, 0.16, 0.561, 201, -181],
 ['SVM', 1, 0.16, 0.55,

In [19]:
DefaultDF = pd.DataFrame(Default, columns = ['Model', 'Weight', 'LabelRatio', 'Accuracy', 'N_errors', 'Bias'])
WeightDF = pd.DataFrame(Weight, columns = ['Model', 'Weight', 'LabelRatio', 'Accuracy', 'N_errors', 'Bias'])

In [21]:
WeightDF.head(4)

Unnamed: 0,Model,Weight,LabelRatio,Accuracy,N_errors,Bias
0,DT,1,0.1,0.582,198,-172
1,RF,1,0.1,0.515,230,-218
2,KNN,1,0.1,0.517,229,-225
3,SVM,1,0.1,0.506,234,-234


In [22]:
DefaultDF.Bias.sum()

-10288

In [23]:
DefaultDF.describe()

Unnamed: 0,Weight,LabelRatio,Accuracy,N_errors,Bias
count,360.0,360.0,360.0,360.0,360.0
mean,1.0,0.545,0.685239,114.044444,-28.577778
std,0.0,0.260153,0.060331,41.57938,80.365596
min,1.0,0.1,0.506,62.0,-234.0
25%,1.0,0.32,0.65375,83.0,-82.0
50%,1.0,0.545,0.6995,101.0,-11.5
75%,1.0,0.77,0.734,134.0,39.5
max,1.0,0.99,0.763,234.0,101.0


In [24]:
WeightDF.describe()

Unnamed: 0,Weight,LabelRatio,Accuracy,N_errors,Bias
count,360.0,360.0,360.0,360.0,360.0
mean,1.0,0.545,0.685081,114.097222,-28.502778
std,0.0,0.260153,0.060478,41.59205,80.460963
min,1.0,0.1,0.506,62.0,-234.0
25%,1.0,0.32,0.653,81.75,-81.25
50%,1.0,0.545,0.7,102.0,-13.0
75%,1.0,0.77,0.734,134.0,39.5
max,1.0,0.99,0.77,234.0,93.0


In [25]:
DefaultDF[DefaultDF.LabelRatio > 0.7].describe()

Unnamed: 0,Weight,LabelRatio,Accuracy,N_errors,Bias
count,116.0,116.0,116.0,116.0,116.0
mean,1.0,0.85,0.707948,79.396552,51.603448
std,0.0,0.084029,0.042801,11.766481,18.114073
min,1.0,0.71,0.57,62.0,9.0
25%,1.0,0.78,0.6745,71.0,38.0
50%,1.0,0.85,0.724,77.0,52.0
75%,1.0,0.92,0.742,86.25,63.25
max,1.0,0.99,0.759,111.0,101.0


In [26]:
WeightDF[WeightDF.LabelRatio > 0.7].describe()

Unnamed: 0,Weight,LabelRatio,Accuracy,N_errors,Bias
count,116.0,116.0,116.0,116.0,116.0
mean,1.0,0.85,0.707371,79.586207,51.62069
std,0.0,0.084029,0.043035,12.151986,18.046633
min,1.0,0.71,0.592,62.0,9.0
25%,1.0,0.78,0.67075,71.0,38.75
50%,1.0,0.85,0.7245,77.5,51.0
75%,1.0,0.92,0.742,85.5,63.0
max,1.0,0.99,0.755,116.0,93.0


In [27]:
DefaultDF[DefaultDF.LabelRatio < 0.5].describe()

Unnamed: 0,Weight,LabelRatio,Accuracy,N_errors,Bias
count,160.0,160.0,160.0,160.0,160.0
mean,1.0,0.295,0.649206,149.5625,-102.7375
std,0.0,0.115796,0.062768,37.170301,57.821835
min,1.0,0.1,0.506,94.0,-234.0
25%,1.0,0.1975,0.59975,120.75,-141.0
50%,1.0,0.295,0.6635,140.0,-94.0
75%,1.0,0.3925,0.695,178.25,-53.0
max,1.0,0.49,0.745,234.0,-1.0


In [28]:
WeightDF[WeightDF.LabelRatio < 0.5].describe()

Unnamed: 0,Weight,LabelRatio,Accuracy,N_errors,Bias
count,160.0,160.0,160.0,160.0,160.0
mean,1.0,0.295,0.649325,149.51875,-102.95625
std,0.0,0.115796,0.062905,37.225806,57.580717
min,1.0,0.1,0.506,94.0,-234.0
25%,1.0,0.1975,0.60075,119.0,-144.0
50%,1.0,0.295,0.661,141.5,-91.0
75%,1.0,0.3925,0.69525,178.25,-54.75
max,1.0,0.49,0.747,234.0,-8.0


In [29]:
DefaultDF[(DefaultDF.LabelRatio < 0.5) & (DefaultDF.LabelRatio > 0.2)].describe()

Unnamed: 0,Weight,LabelRatio,Accuracy,N_errors,Bias
count,116.0,116.0,116.0,116.0,116.0
mean,1.0,0.35,0.679621,130.956897,-75.577586
std,0.0,0.084029,0.038542,21.98513,36.915686
min,1.0,0.21,0.586,94.0,-159.0
25%,1.0,0.28,0.65475,115.5,-99.25
50%,1.0,0.35,0.6825,128.5,-70.5
75%,1.0,0.42,0.70725,144.25,-50.0
max,1.0,0.49,0.745,184.0,-1.0


In [30]:
WeightDF[(WeightDF.LabelRatio < 0.5) & (WeightDF.LabelRatio > 0.2)].describe()

Unnamed: 0,Weight,LabelRatio,Accuracy,N_errors,Bias
count,116.0,116.0,116.0,116.0,116.0
mean,1.0,0.35,0.679793,130.887931,-75.922414
std,0.0,0.084029,0.038532,21.969454,36.549466
min,1.0,0.21,0.586,94.0,-159.0
25%,1.0,0.28,0.65575,113.75,-100.5
50%,1.0,0.35,0.682,128.0,-70.5
75%,1.0,0.42,0.7075,144.25,-51.0
max,1.0,0.49,0.747,184.0,-8.0


In [31]:
DefaultDF.groupby('Model').sum()

Unnamed: 0_level_0,Weight,LabelRatio,Accuracy,N_errors,Bias
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DT,90,49.05,58.859,11042,-122
KNN,90,49.05,62.019,10297,-3715
RF,90,49.05,62.747,9865,-2645
SVM,90,49.05,63.061,9852,-3806


In [32]:
WeightDF.groupby('Model').sum()

Unnamed: 0_level_0,Weight,LabelRatio,Accuracy,N_errors,Bias
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DT,90,49.05,58.754,11078,-78
KNN,90,49.05,62.019,10297,-3715
RF,90,49.05,62.795,9848,-2662
SVM,90,49.05,63.061,9852,-3806


In [33]:
DefaultDF[DefaultDF.LabelRatio < 0.5].groupby('Model').sum()

Unnamed: 0_level_0,Weight,LabelRatio,Accuracy,N_errors,Bias
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DT,40,11.8,26.074,5899,-2717
KNN,40,11.8,25.165,6328,-4714
RF,40,11.8,26.561,5744,-4264
SVM,40,11.8,26.073,5959,-4743


In [34]:
WeightDF[WeightDF.LabelRatio < 0.5].groupby('Model').sum()

Unnamed: 0_level_0,Weight,LabelRatio,Accuracy,N_errors,Bias
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DT,40,11.8,26.097,5891,-2737
KNN,40,11.8,25.165,6328,-4714
RF,40,11.8,26.557,5745,-4279
SVM,40,11.8,26.073,5959,-4743
