In [1]:

from __future__ import print_function, division
from builtins import range, input
import random
# Note: you may need to update your version of future
# sudo pip install -U future
from sklearn.model_selection import train_test_split
import statistics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn import datasets
weight_list=[]

class AdaBoost:
  def __init__(self, M):
    self.M = M

  def fit(self, X, Y):
    self.models = []
    self.alphas = []

    N, _ = X.shape
    W = np.ones(N) / N

    for m in range(self.M):
      tree = DecisionTreeClassifier(max_depth=1)
      tree.fit(X, Y, sample_weight=W)
      P = tree.predict(X)

      err = W.dot(P != Y)
      alpha = 0.5*(np.log(1 - err) - np.log(err))

      W = W*np.exp(-alpha*Y*P) # vectorized form
      W = W / W.sum() # normalize so it sums to 1
      weight_list.append(W)

      self.models.append(tree)
      self.alphas.append(alpha)

  def predict(self, X):
    # NOT like SKLearn API
    # we want accuracy and exponential loss for plotting purposes
    N, _ = X.shape
    FX = np.zeros(N)
    for alpha, tree in zip(self.alphas, self.models):
      FX += alpha*tree.predict(X)
    return np.sign(FX), FX

  def score(self, X, Y):
    # NOT like SKLearn API
    # we want accuracy and exponential loss for plotting purposes
    P, FX = self.predict(X)
    L = np.exp(-Y*FX).mean()
    return np.mean(P == Y), L


if __name__ == '__main__':
    
    
    cancer = datasets.load_breast_cancer()
    Y=cancer.target
    Y[Y == 0] = -1
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(cancer.data, Y, test_size=0.2,random_state=109) # 70% training and 30% test
   # make the targets -1,+1
    per=(Ytrain.shape[0])*(40/100) #20% data will be mislabeled

    k=random.sample(range(Ytrain.shape[0]), int(per)) #list of index no. of mislabeled data
    for i in k:
        if Ytrain[i-1]==1:
            Ytrain[i-1]=-1;
        else:
            Ytrain[i-1]=1
#mislabeled data created here till now
    model = AdaBoost(10)
    model.fit(Xtrain, Ytrain)

In [2]:
weight_list=np.array(weight_list)


In [3]:
#observing the wieght of instances
#print(wieght_list)
a=len(weight_list)

#sorting data and finding mean
nm=[] 
nm=np.mean(weight_list, axis=0)
#print(nm)
a=len(nm)
Q=[]
x=[]
#print(a)
for i in range(a):
    x=[i,nm[i]]
    Q.append(x)
    x=[]
#print(Q)
Q.sort(key = lambda Q: Q[1]) 

In [4]:
from scipy import stats
yp=np.array(nm)
mode=stats.mode(nm)
print(mode)

ModeResult(mode=array([0.00174051]), count=array([114]))


In [6]:
from collections import Counter
count=Counter(nm)
count1=Counter(nm)
# print(type(count))
nm2=nm



In [7]:
print(count1)
modeg=statistics.mode(nm)
jk=float(count1[0])
print(jk)
ne=[]
for j in range(Xtrain.shape[0]):
    if nm2[j]==modeg:
        ne.append(j)



Counter({0.0017405076631426177: 114, 0.001909131424760252: 96, 0.0029985228027307907: 63, 0.002733678702562982: 59, 0.0024079687657795707: 43, 0.00217412633693691: 35, 0.0016927036880518022: 14, 0.0016547513187170897: 7, 0.0016921871945269073: 3, 0.0020645430515224905: 3, 0.002658596752985317: 2, 0.0021130402834045113: 2, 0.002193402072433937: 2, 0.0021413187726365976: 2, 0.0018049413281136274: 1, 0.002479384844207926: 1, 0.00308978145604698: 1, 0.0020108203532915358: 1, 0.0025989879469115874: 1, 0.0017924887299728218: 1, 0.0019672349558284732: 1, 0.002616338573795083: 1, 0.0028153212821042063: 1, 0.0025351340451023156: 1})
0.0


In [8]:
print(ne)

[2, 11, 15, 19, 20, 22, 24, 36, 45, 50, 54, 56, 60, 63, 68, 69, 71, 74, 78, 87, 88, 89, 90, 94, 96, 104, 106, 116, 118, 120, 125, 126, 129, 132, 134, 136, 137, 142, 143, 146, 147, 151, 158, 173, 174, 177, 186, 188, 190, 192, 199, 206, 211, 214, 221, 227, 229, 230, 234, 239, 240, 242, 244, 254, 257, 264, 267, 268, 270, 273, 281, 284, 289, 291, 295, 296, 299, 303, 306, 313, 315, 318, 320, 323, 329, 334, 339, 347, 348, 354, 357, 359, 361, 374, 380, 383, 385, 386, 398, 402, 403, 406, 407, 412, 414, 417, 418, 419, 423, 429, 430, 436, 439, 453]


In [9]:
def common_member(a, b): 
    a_set = set(a) 
    b_set = set(b) 
    
    if (a_set & b_set): 
        wq=a_set & b_set 
    else: 
        print("No common elements")  
    return wq

In [10]:
a=set(ne)
b=set(k)
re=a&b
print(len(ne))
print(len(k))
print(len(re))

114
182
39


In [11]:
for i in ne:
    if(Ytrain[i]==1):
        Ytrain[i]=-1
    else:
        Ytrain[i]=-1

In [12]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [13]:

clf = SVC(kernel='linear') 

clf1 = KNeighborsClassifier() 

clf2 = DecisionTreeClassifier() 

clf3 = GaussianNB() 


# cancer=datasets.load_breast_cancer()

In [14]:
clf.fit(Xtrain, Ytrain)
y_pred_mis = clf.predict(Xtest)
clf1.fit(Xtrain, Ytrain)
y_pred_mis1 = clf1.predict(Xtest)
clf2.fit(Xtrain, Ytrain)
y_pred_mis2 = clf2.predict(Xtest)
clf3.fit(Xtrain, Ytrain)
y_pred_mis3 = clf3.predict(Xtest)


print(accuracy_score(Ytest, y_pred_mis))
print(accuracy_score(Ytest, y_pred_mis1))
print(accuracy_score(Ytest, y_pred_mis2))
print(accuracy_score(Ytest, y_pred_mis3))

0.3508771929824561
0.3508771929824561
0.34210526315789475
0.15789473684210525


In [18]:
for i in ne:
    if(Ytrain[i]==1):
        Ytrain[i]=-1
    else:
        Ytrain[i]=-1

In [21]:
clf.fit(Xtrain, Ytrain)
y_pred_mis = clf.predict(Xtest)
clf1.fit(Xtrain, Ytrain)
y_pred_mis1 = clf1.predict(Xtest)
clf2.fit(Xtrain, Ytrain)
y_pred_mis2 = clf2.predict(Xtest)
clf3.fit(Xtrain, Ytrain)
y_pred_mis3 = clf3.predict(Xtest)

In [22]:
print(accuracy_score(Ytest, y_pred_mis))
print(accuracy_score(Ytest, y_pred_mis1))
print(accuracy_score(Ytest, y_pred_mis2))
print(accuracy_score(Ytest, y_pred_mis3))

0.9736842105263158
0.9473684210526315
0.8421052631578947
0.9473684210526315
