## AdaBoost code starts here

In [None]:

from __future__ import print_function, division
from builtins import range, input
import random
# Note: you may need to update your version of future
# sudo pip install -U future
from sklearn.model_selection import train_test_split
import statistics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn import datasets
weight_list=[]

class AdaBoost:
    """AdaBoost ensemble built from depth-1 decision trees (stumps).

    The weight update in ``fit`` multiplies the labels by the stump
    predictions, so labels are expected to be encoded as -1/+1.

    Note: ``predict`` and ``score`` deliberately do NOT follow the
    scikit-learn API; they return extra values used for plotting.
    """

    def __init__(self, M):
        """Store the number of boosting rounds ``M``."""
        self.M = M

    @staticmethod
    def _stump_alpha(err, eps=1e-10):
        """Return the stump weight 0.5 * ln((1 - err) / err).

        ``err`` is clipped to [eps, 1 - eps] first: a stump with weighted
        error exactly 0 or 1 would otherwise give an infinite alpha, which
        turns the sample weights into NaN after normalisation.
        """
        err = np.clip(err, eps, 1 - eps)
        return 0.5 * (np.log(1 - err) - np.log(err))

    def fit(self, X, Y):
        """Fit ``self.M`` weighted stumps on ``X`` (2-D) and labels ``Y``.

        Fills ``self.models`` and ``self.alphas``. As a side effect, the
        normalised sample-weight vector of every round is appended to the
        module-level ``weight_list`` so it can be inspected afterwards.
        """
        self.models = []
        self.alphas = []

        N, _ = X.shape
        W = np.ones(N) / N  # start from uniform sample weights

        for _ in range(self.M):
            tree = DecisionTreeClassifier(max_depth=1)
            tree.fit(X, Y, sample_weight=W)
            P = tree.predict(X)

            # Weighted error: total weight of the misclassified samples.
            err = W.dot(P != Y)
            alpha = self._stump_alpha(err)

            # Y*P is +1 for a correct prediction and -1 for a wrong one, so
            # this shrinks the weight of correct samples and grows the rest.
            W = W * np.exp(-alpha * Y * P)  # vectorized form
            W = W / W.sum()  # normalize so it sums to 1
            weight_list.append(W)

            self.models.append(tree)
            self.alphas.append(alpha)

    def predict(self, X):
        """Return ``(sign(FX), FX)`` for the samples in ``X``.

        ``FX`` is the alpha-weighted sum of the stump predictions. NOT like
        the scikit-learn API: the raw score is returned as well, for
        accuracy and exponential-loss plotting. ``np.sign`` yields 0 where
        ``FX`` is exactly 0.
        """
        N, _ = X.shape
        FX = np.zeros(N)
        for alpha, tree in zip(self.alphas, self.models):
            FX += alpha * tree.predict(X)
        return np.sign(FX), FX

    def score(self, X, Y):
        """Return ``(accuracy, mean exponential loss)`` on ``X`` and ``Y``.

        NOT like the scikit-learn API: both values are returned for
        plotting purposes.
        """
        P, FX = self.predict(X)
        L = np.exp(-Y * FX).mean()
        return np.mean(P == Y), L


if __name__ == '__main__':
    cancer = datasets.load_breast_cancer()

    # Make the targets -1/+1 (the 0 class becomes -1), as the AdaBoost
    # weight update multiplies labels by predictions.
    Y = cancer.target
    Y[Y == 0] = -1

    # test_size=0.2 -> 80% training and 20% test.
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        cancer.data, Y, test_size=0.2, random_state=109)

    # 10% of the training labels will be mislabeled.
    per = (Ytrain.shape[0]) * (10 / 100)

    # k: index numbers of the training samples whose label gets flipped.
    k = random.sample(range(Ytrain.shape[0]), int(per))
    for i in k:
        # Flip exactly the sampled index. Using i-1 here would corrupt a
        # different sample than the one recorded in k (and wrap to the last
        # sample for i == 0), so later comparisons against k would be wrong.
        Ytrain[i] = -1 if Ytrain[i] == 1 else 1
    # Mislabeled data has been created at this point.

    model = AdaBoost(10)
    model.fit(Xtrain, Ytrain)

In [2]:
# Convert the list of per-round weight vectors appended during fit() into a
# single array (one row per boosting round) so it can be reduced column-wise.
weight_list=np.array(weight_list)


In [3]:
# Observe the weights of the instances across the boosting rounds.

# Mean weight of every training instance, averaged over all rounds
# (axis 0 runs over the rounds recorded in weight_list).
nm = np.mean(weight_list, axis=0)
a = len(nm)  # number of instances; name kept for any cell that reads it

# Pair each instance index with its mean weight and order the pairs by
# ascending mean weight.
Q = sorted(
    ([idx, weight] for idx, weight in enumerate(nm)),
    key=lambda pair: pair[1],
)

In [4]:
from scipy import stats
# Most frequent mean-weight value (and its count) according to SciPy.
mode = stats.mode(nm)
# Separate array built from the mean weights.
yp = np.array(nm)
print(mode)

ModeResult(mode=array([0.00103373]), count=array([71]))


In [5]:
from collections import Counter
# Frequency table of the mean weights: how many instances share each value.
count = Counter(nm)
# Second, independent Counter holding the same tallies.
count1 = count.copy()
# nm2 is another name for the same array as nm (no copy is made).
nm2 = nm



In [6]:
# Mode calculation: find the most common mean weight and the instances
# that carry exactly that value.
print(count1)
modeg = statistics.mode(nm)

# NOTE(review): this looks up the key 0 in the Counter, not the mode; a
# Counter returns 0 for a missing key. Confirm whether count1[modeg] was
# intended here.
jk = float(count1[0])
print(jk)

# Indices of the training instances whose mean weight equals the mode.
ne = [j for j in range(Xtrain.shape[0]) if nm2[j] == modeg]



Counter({0.0010337337762558207: 71, 0.0012314062691593125: 68, 0.0010650151490398635: 65, 0.0009253712034402558: 29, 0.004521323871206521: 21, 0.0025264250310499795: 18, 0.0090718500669782: 17, 0.0007188453218357898: 16, 0.0008093885322242072: 16, 0.0010399917109167698: 15, 0.002164050724876334: 14, 0.010950191706367629: 14, 0.0011959004792315648: 10, 0.0009066506971284842: 9, 0.0007052285950477598: 6, 0.004168860896836484: 6, 0.000790067740314596: 6, 0.0016334558290290089: 4, 0.002471447835750366: 4, 0.005189411618212523: 4, 0.002095924698976401: 4, 0.0018191572693676598: 3, 0.003624349208874487: 3, 0.00219581680655393: 3, 0.010526842578661618: 3, 0.0014101460041119457: 2, 0.0008359721059619333: 2, 0.002109553519608255: 2, 0.00538380049833042: 2, 0.0010564427977506705: 2, 0.007118377004926751: 2, 0.0024618799633223075: 2, 0.001607335211202343: 2, 0.0046893014309660214: 1, 0.0029490153370896853: 1, 0.002145381460263727: 1, 0.005155067121065213: 1, 0.001380490864350724: 1, 0.00107960115

In [7]:
# Show the indices of the instances whose mean weight equals the mode.
print(ne)

[4, 17, 35, 41, 42, 53, 64, 73, 77, 83, 102, 110, 113, 120, 121, 130, 135, 144, 156, 159, 166, 167, 168, 181, 183, 185, 193, 196, 202, 208, 215, 217, 219, 220, 238, 241, 243, 247, 250, 251, 257, 258, 260, 265, 277, 287, 294, 308, 309, 317, 321, 330, 340, 342, 343, 345, 349, 355, 360, 371, 377, 388, 390, 395, 409, 411, 415, 426, 450, 452, 454]


In [8]:
def common_member(a, b):
    """Return the set of elements that appear in both *a* and *b*.

    Both arguments may be any iterables of hashable items. When nothing is
    shared, "No common elements" is printed and an empty set is returned
    (instead of failing on an unassigned local variable).
    """
    shared = set(a) & set(b)
    if not shared:
        print("No common elements")
    return shared

In [9]:
## culprit points
# Compare the suspected instances (ne) with the indices that were
# deliberately mislabeled (k).
a = set(ne)
b = set(k)
re = a.intersection(b)

print(len(ne))  # number of culprit (suspected) points
print(len(k))   # total number of mislabeled points we introduced
print(len(re))  # suspected points that really are mislabeled




71
45
7


## accuracy prediction

In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [11]:

# Four reference classifiers used to measure the effect of label noise:
# linear SVM, k-nearest neighbours, decision tree and Gaussian naive Bayes.
clf, clf1, clf2, clf3 = (
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    GaussianNB(),
)


# cancer=datasets.load_breast_cancer()

In [12]:
# Train every classifier on the current training labels and predict the
# test set (fit() returns the estimator, so the calls can be chained).
y_pred_mis = clf.fit(Xtrain, Ytrain).predict(Xtest)
y_pred_mis1 = clf1.fit(Xtrain, Ytrain).predict(Xtest)
y_pred_mis2 = clf2.fit(Xtrain, Ytrain).predict(Xtest)
y_pred_mis3 = clf3.fit(Xtrain, Ytrain).predict(Xtest)

# Test accuracy of each classifier, in the order they were created.
for predicted in (y_pred_mis, y_pred_mis1, y_pred_mis2, y_pred_mis3):
    print(accuracy_score(Ytest, predicted))

0.9736842105263158
0.9473684210526315
0.8596491228070176
0.9473684210526315


In [18]:
# Flip the label of every culprit point: +1 becomes -1 and -1 becomes +1.
# (Assigning -1 in both branches would leave the -1 labels unflipped.)
for i in ne:
    Ytrain[i] = -1 if Ytrain[i] == 1 else 1

In [21]:
## accuracy after flipping the culprit points
# Refit every classifier on the modified training labels and predict the
# test set again (fit() returns the estimator, so the calls can be chained).
y_pred_mis = clf.fit(Xtrain, Ytrain).predict(Xtest)
y_pred_mis1 = clf1.fit(Xtrain, Ytrain).predict(Xtest)
y_pred_mis2 = clf2.fit(Xtrain, Ytrain).predict(Xtest)
y_pred_mis3 = clf3.fit(Xtrain, Ytrain).predict(Xtest)

In [22]:
# Test accuracy of each classifier, in the order they were created.
for predicted in (y_pred_mis, y_pred_mis1, y_pred_mis2, y_pred_mis3):
    print(accuracy_score(Ytest, predicted))

0.9736842105263158
0.9473684210526315
0.8421052631578947
0.9473684210526315
