In [1]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from scipy.spatial import distance
import cvxopt 
import copy

## Dataset Analysis


In [2]:
data = pd.read_csv('./drug_classification/drug200.csv')
data.head()


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [3]:
data['Sex'].replace(['F','M'],[0,1],inplace=True)
data['BP'].replace(['HIGH','NORMAL','LOW'],[0,1,2],inplace=True)
data['Cholesterol'].replace(['NORMAL','HIGH'],[0,1],inplace=True)
data['Drug'].replace(['drugA','drugB','drugC','drugX','DrugY'],[0,1,2,3,4],inplace=True)


In [4]:
X = np.array(data.iloc[:,:-1])
y = np.array(data.iloc[:,-1])

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

### SVC

In [5]:
class SVM:
    linear = lambda x1,x2,c=0: x1@x2.T
    poly = lambda x1,x2,Q=5: (1+x1@x2.T)**Q
    rbf = lambda x1,x2, y=10 : np.exp(-y*distance.cdist(x1,x2,'sqeuclidean'))
    kernel_funs = {'linear': linear,'poly':poly,'rbf':rbf}
    
    def __init__(self,kernel='linear',C=1,k=2):
        self.kernel_str = kernel
        self.kernel = SVM.kernel_funs[kernel]
        self.C = C
        self.k = k
        
        self.X,self.y = None,None
        self.alphas = None
        
        self.multiclass = False
        self.clfs = []
        
        
        
        

In [6]:
SVMClass = lambda func: setattr(SVM, func.__name__,func) or func

In [7]:
@SVMClass
def fit(self,X,y,eval_train=False):
    if len(np.unique(y)) > 2:
        self.multiclass = True
        return self.multi_fit(X,y,eval_train)
    if set(np.unique(y)) == {0,1}: y[y==0] = -1
    
    self.y = y.reshape(-1,1).astype(np.double)
    self.X = X
    N = X.shape[0]
    
    self.K = self.kernel(X,X,self.k)
    
    P = cvxopt.matrix(self.y@self.y.T*self.K)
    q = cvxopt.matrix(-(np.ones((N,1))))
    
    A = cvxopt.matrix(self.y.T)
    b = cvxopt.matrix(np.zeros(1))
    
    G = cvxopt.matrix(np.vstack((-np.identity(N),np.identity(N))))
    h = cvxopt.matrix(np.vstack((np.zeros((N,1)),np.ones((N,1))*self.C)))
    
    cvxopt.solvers.options['show_progress'] = False
    sol = cvxopt.solvers.qp(P,q,G,h,A,b)
    self.alphas = np.array(sol['x'])
    
    self.is_sv = ((self.alphas -1e-3 >0)&(self.alphas<=self.C)).squeeze()
    self.margin_sv = np.argmax((0<self.alphas-1e-3)&(self.alphas<self.C-1e-3))
    
    if eval_train:
        print(f"Finished training with accuracy{self.evaluate(X,y)}")
        
    
@SVMClass
def predict(self,X_t):
    if self.multiclass: return self.multi_predict(X_t)

    xs,ys = self.X[self.margin_sv,np.newaxis], self.y[self.margin_sv]
    
    alphas,y,X = self.alphas[self.is_sv],self.y[self.is_sv], self.X[self.is_sv]
    b = ys - np.sum(alphas*y*self.kernel(X,xs,self.k),axis=0)
    
    score = np.sum(alphas*y*self.kernel(X,X_t,self.k),axis=0) + b
    
    return np.sign(score).astype(int),score

In [8]:
@SVMClass
def evaluate(self, X,y):  
    outputs, _ = self.predict(X)
    accuracy = np.sum(outputs == y) / len(y)
    return round(accuracy, 2)

In [9]:
@SVMClass
def multi_fit(self, X, y, eval_train=False):
    self.k = len(np.unique(y))      # number of classes
    # for each pair of classes
    for i in range(self.k):
        # get the data for the pair
        Xs, Ys = X, copy.copy(y)
        # change the labels to -1 and 1
        Ys[Ys!=i], Ys[Ys==i] = -1, +1
        # fit the classifier
        clf = SVM(kernel=self.kernel_str, C=self.C, k=self.k)
        clf.fit(Xs, Ys)
        # save the classifier
        self.clfs.append(clf)
    if eval_train:  
        print(f"Finished training with accuracy {self.evaluate(X, y)}")

@SVMClass
def multi_predict(self, X):
    # get the predictions from all classifiers
    N = X.shape[0]
    preds = np.zeros((N, self.k))
    for i, clf in enumerate(self.clfs):
        _, preds[:, i] = clf.predict(X)
    
    # get the argmax and the corresponding score
    return np.argmax(preds, axis=1), np.max(preds, axis=1)

In [10]:
from sklearn.datasets import make_classification
import numpy as np

# Load the dataset
np.random.seed(1)
# X, y = make_classification(n_samples=2500, n_features=5, 
#                            n_redundant=0, n_informative=5, 
#                            n_classes=2,  class_sep=0.3)

# Test Implemented SVM
svm = SVM(kernel='poly', k=1)
svm.fit(X_train, y_train, eval_train=True)

y_pred, _ = svm.predict(X_test)
print(f"Accuracy: {np.sum(y_test==y_pred)/y.shape[0]}")

# Test with Scikit
from sklearn.svm import SVC
clf = SVC(kernel='linear', C=20, gamma=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f"Accuracy: {sum(y_test==y_pred)/y.shape[0]}")  

ValueError: Rank(A) < p or Rank([P; A; G]) < n

In [37]:
from sklearn.datasets import make_classification
import numpy as np

# Load the dataset
np.random.seed(1)
X, y = make_classification(n_samples=500, n_features=2, 
                           n_redundant=0, n_informative=2, 
                           n_classes=4, n_clusters_per_class=1,  
                           class_sep=0.3)

# Test SVM
svm = SVM(kernel='rbf', k=4)
svm.fit(X, y, eval_train=True)

y_pred = svm.predict(X)
print(f"Accuracy: {np.sum(y==y_pred)/y.shape[0]}") # 0.65

# Test with Scikit
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

clf = OneVsRestClassifier(SVC(kernel='rbf', C=1, gamma=4)).fit(X, y)
y_pred = clf.predict(X)
print(f"Accuracy: {sum(y==y_pred)/y.shape[0]}")    # 0.65

Finished training with accuracy 0.65
Accuracy: 0.65
Accuracy: 0.65


## KNN

In [28]:
class KNN:
    
    def __init__(self,k=3,distance='sqeuclidean'):
        self.k = k
        self.distance = distance
    
    def fit(self,X,y):
        self.X = X
        self.y = y
    
    def predict(self,X_1):
        y_pred = np.zeros((X_1.shape[0]))
        distances=distance.cdist(X_1,self.X,self.distance)   
        min_distances = np.argpartition(distances,self.k,axis=-1)[:,0:self.k]

        for i in range(min_distances.shape[0]):
            classes = self.y[min_distances[i,:]]
            clas = {}
            for cls in classes:
                if cls in clas:
                    clas[cls] += 1
                else:
                    clas[cls] = 1
            y_pred = max(clas,key= lambda k: clas.get(k))
        return y_pred

In [29]:
knn = KNN(k=4,distance='euclidean')
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)

In [30]:
print(f"Accuracy: {np.sum(y_test==y_pred)/y_test.shape[0]}") 

Accuracy: 0.43333333333333335


## Logistic Regression

In [1]:
class LR:
    def __init__(self):
        self.sol = None
    
    def GD(self,X,y):
        value = 
        
    def fit(self,X,y,init='random',lr=0.01):
        if init == 'random':
            self.sol = np.random.rand((X.shape[1],1))
        else:
            self.sol = np.zeros((X.shape[1],1))
        
        