# Task-A: Binary Classification

In [1]:
import pandas as pd
import numpy as np

## Data Munging

### Reading Data

In [2]:
trainData = pd.DataFrame(pd.read_csv("train_wbcd.csv"))
testData  = pd.DataFrame(pd.read_csv("test_wbcd.csv"))
trainClass = trainData['Diagnosis']
testClass = testData['Diagnosis']
print('No of features in train and test data are')
print(len(list(trainData))-2)
print(len(list(testData))-2)

No of features in train and test data are
30
30


In [3]:
DataClassB = trainData[trainData['Diagnosis']=='B']
DataClassM = trainData[trainData['Diagnosis']=='M']

print("LengthOfClassB",len(DataClassB))
print("LengthOfClassM",len(DataClassM))

LengthOfClassB 58
LengthOfClassM 42


In [4]:
trainData = trainData.drop(['Diagnosis','Patient_ID'],axis=1).copy()
testData = testData.drop(['Diagnosis','Patient_ID'],axis=1).copy()

As we can see the ratio of both classes of data is 42/58 is 0.7 so data is fairly balanced. If this ratio would have been below 0.5 then we would have classified this data as unbalanced

### Features with missing entries

In [5]:
null_data = trainData[trainData.isnull().any(axis=1)]

In [6]:
(null_data)

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30
70,13.62,23.23,87.19,573.2,0.09246,0.06747,0.02974,0.02443,0.1664,0.05801,...,,29.09,97.58,729.8,0.1216,0.1517,0.1049,0.07174,0.2642,0.06953
74,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


As we can see f21 has got NaN values so it is a feature with missing values

### Filling the missing feature values with mean of the respective feature vector

In [7]:
trainData = trainData.fillna(trainData.mean())
testData = testData.fillna(testData.mean())

### Normalizing the Training and Testing Data(Z-Score Normalization)

In [8]:
for i in range(0,len(trainData.T)):
    trainData.T.iloc[i] = ((trainData.T.iloc[i] - np.mean(trainData.T.iloc[i]))/np.std(trainData.T.iloc[i]))
    
for i in range(0,len(testData.T)):
    testData.T.iloc[i] = ((testData.T.iloc[i] - np.mean(testData.T.iloc[i]))/np.std(testData.T.iloc[i]))

print('Train and Test Data Normalized!!')

Train and Test Data Normalized!!


In [9]:
len(trainData)

100

In [10]:
testData

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30
0,-1.087488,-0.587788,-1.079973,-0.78511,0.724397,-0.923772,-0.936765,-0.956143,0.915178,0.833759,...,-1.102548,-0.900914,-1.065709,-0.726083,-0.065492,-1.212458,-1.23084,-1.426554,-0.010344,-0.364117
1,-0.350978,-0.709063,-0.381589,-0.359932,0.1907,-0.909882,-0.633032,-0.153026,-0.885389,-0.341121,...,-0.36869,-0.74516,-0.406849,-0.362366,-0.810933,-1.19865,-0.942635,-0.407719,-1.333492,-0.906747
2,-0.33235,-0.49815,-0.364556,-0.366212,-0.567675,-0.705465,-0.634842,-0.637866,-1.170038,-0.593194,...,-0.378679,-0.482838,-0.40407,-0.370835,-0.303323,-0.476537,-0.538112,-0.442866,-0.700336,-0.419691
3,-0.899344,0.2295,-0.895942,-0.683829,0.517002,-0.859088,-0.705204,-0.66894,0.458416,0.003014,...,-0.786563,0.54186,-0.785954,-0.587673,0.569907,-0.934778,-0.941646,-0.8901,-0.31726,-0.279176
4,-1.175506,-0.034142,-1.10736,-0.820153,-0.198512,0.569547,0.557579,-0.460792,1.11046,1.822324,...,-1.07924,0.292654,-1.032592,-0.714829,0.939077,1.144315,1.303197,-0.305409,0.194266,2.682921
5,0.126369,0.769964,0.171506,-0.022262,2.445778,1.529434,0.825679,1.140414,1.206446,0.901709,...,0.603571,2.281387,0.598231,0.333048,3.054708,1.743964,0.957069,1.437852,1.456031,0.875662
6,-0.861622,-1.296982,-0.875568,-0.666813,-0.849042,-1.288653,-0.749887,-0.575262,-1.11046,-0.213988,...,-0.843167,-1.438676,-0.855198,-0.614938,0.491813,-1.095863,-0.893612,-0.398897,-0.447983,-0.336105
7,3.295483,1.43434,3.355815,3.832896,0.793529,1.985535,3.175228,2.902929,0.577572,-1.068845,...,3.407147,0.563174,3.439783,3.930776,-0.029995,1.108296,1.986977,2.308699,-0.579844,-0.675418
8,0.314979,0.841147,0.272039,0.109201,-0.49647,-0.351779,-0.58586,-0.28052,-0.666938,-0.802524,...,0.089139,0.192643,0.005373,-0.072204,-0.889026,-0.552577,-0.798108,-0.43817,-0.715114,-0.757196
9,1.646892,0.179408,1.65244,1.327206,1.388062,1.751265,1.96482,2.480237,2.258983,0.078636,...,1.619118,0.64351,1.584784,1.243792,0.804188,1.265712,1.285773,2.394077,2.735984,0.595537


### Data PreProcessing

In [11]:
trainClass = np.array(trainClass)
trainClass = np.reshape(trainClass,(1,100))
trainData = np.array(trainData)
trainClassConverted = []
print(np.shape(trainData))
for i in range(0,len(trainClass[0])):
    if(trainClass[0][i]=='B'):
        trainClassConverted.append(0)
    else:
        trainClassConverted.append(1)
        
testClass = np.array(testClass)
testClass = np.reshape(testClass,(1,20))
testData = np.array(testData)
testClassConverted = []
for i in range(0,len(testClass[0])):
    if(testClass[0][i]=='B'):
        testClassConverted.append(0)
    else:
        testClassConverted.append(1)

(100, 30)


## Implementing Logistic Regression with L1(Lasso) Regularization¶

In [12]:
import sklearn
from sklearn.linear_model import Lasso
from sklearn import metrics

regressor = Lasso(random_state=0,alpha=0.1)
regressor.fit(trainData,np.array(trainClassConverted))
predLasso = regressor.predict(testData)

score = np.sqrt(metrics.mean_squared_error(predLasso,testClassConverted))

print("Final Root Mean Square Error: ",(score))

Final Root Mean Square Error:  0.3236197989057903


### Accuracy Score

In [13]:
for i in range(0,len(predLasso)):
    if(predLasso[i]<=0.5):
        predLasso[i] = 0
    else:
        predLasso[i] = 1
print("Accuracy: ",(sklearn.metrics.accuracy_score(testClassConverted,predLasso))*100," %")

Accuracy:  95.0  %


### Precision Score | F1 Scores

In [14]:

precision = (sklearn.metrics.precision_recall_fscore_support(testClassConverted,predLasso1))
print("===================")
print("Precision | F1 Scores: ")
print("===================")
print(precision)

NameError: name 'predLasso1' is not defined

### Confusion Matrix

In [None]:
confusionMatrix = sklearn.metrics.confusion_matrix(testClassConverted,predLasso1)

print("================")
print("Confusion Matrix")
print("================")
print(confusionMatrix)

## Implementing Logistic Regression with L2(Ridge) Regularization

In [None]:
import sklearn
from sklearn.linear_model import Ridge

regressor = Ridge(alpha=0.1)

regressor.fit(trainData,trainClassConverted)
predRidge = regressor.predict(testData)

score = np.sqrt(metrics.mean_squared_error(predRidge,testClassConverted))

print("Final Root Mean Square Error: ",(score))

In [None]:
predRidge = np.around(predRidge)
print("Accuracy: ",(sklearn.metrics.accuracy_score(testClassConverted,predRidge))*100," %")

In [None]:
precision = (sklearn.metrics.precision_recall_fscore_support(testClassConverted,predRidge))
print("===================")
print("Precision | F1 Scores: ")
print("===================")
print(precision)

In [None]:
confusionMatrix = sklearn.metrics.confusion_matrix(testClassConverted,predRidge)

print("================")
print("Confusion Matrix")
print("================")
print(confusionMatrix)

### Choosing the best hyperparameter

In [None]:
import random 
alphaValues = [0.1,1,3,10,33,100,333,1000, 3333, 10000, 33333]
meanAccuracyScores = []
accuracyScores = []
for i in range(0,len(alphaValues)):
    currentAlphaValue = alphaValues[i]

    for j in range(0,100):
        rand = random.randint(0,50)
        currentSet = trainData[rand:rand+40]
        currentClass = trainClassConverted[rand:rand+40]
        

        regressor = Lasso(random_state=0,alpha=currentAlphaValue)
        regressor.fit(currentSet,np.array(currentClass))
        predLassoCV = regressor.predict(currentSet)
        predLassoCV = np.around(predLassoCV)
        score = np.sqrt(metrics.mean_squared_error(predLassoCV,currentClass))
        accuracyScores.append(np.sqrt(metrics.mean_squared_error(predLassoCV,currentClass)))
#         accuracyScores.append((sklearn.metrics.accuracy_score(currentClass,predLassoCV))*100)
    meanAccuracyScores.append(np.mean(accuracyScores[i]))

print(meanAccuracyScores)

In [None]:
maxAccuracyHyperParameter = meanAccuracyScores.index(min(meanAccuracyScores))
bestHyperParameter = alphaValues[maxAccuracyHyperParameter]
print("The Best HyperParameter Alpha is: ",bestHyperParameter)

### L1 Regression with Best Hyperparameter determined using Cross Validation Approach

In [None]:
import sklearn
from sklearn.linear_model import Lasso
from sklearn import metrics

regressor = Lasso(random_state=0,alpha=bestHyperParameter)
regressor.fit(trainData,trainClassConverted)
predLasso = regressor.predict(testData)

score = np.sqrt(metrics.mean_squared_error(predLasso,testClassConverted))

print("Final Root Mean Square Error: ",(score))

### Accuracy Score

In [None]:
predLasso = np.around(predLasso)
print("Accuracy: ",(sklearn.metrics.accuracy_score(testClassConverted,predLasso))*100," %")

### Precision Score | F1 Scores

In [None]:
precision = (sklearn.metrics.precision_recall_fscore_support(testClassConverted,predLasso))
print("===================")
print("Precision | F1 Scores: ")
print("===================")
print(precision)

### Confusion Matrix

In [None]:
confusionMatrix = sklearn.metrics.confusion_matrix(testClassConverted,predLasso)

print("================")
print("Confusion Matrix")
print("================")
print(confusionMatrix)

## Implementing Logistic Regression with L2(Ridge) Regularization

In [None]:
import sklearn
from sklearn.linear_model import Ridge

import random 
lambdaValues = [0.001, 0.003, 0.01, 0.03, 0.1,0.3,1,3,10,33]
meanAccuracyScores = []
accuracyScores = []
currentLambdaValue = []
for i in range(0,len(lambdaValues)):
    currentLambdaValue = lambdaValues[i]

    for j in range(0,100):
        rand = random.randint(0,50)
        currentSet = trainData[rand:rand+90]
        currentClass = trainClassConverted[rand:rand+90]
        regressor = Ridge(alpha=currentLambdaValue)

        regressor.fit(currentSet,currentClass)
        predRidgeCV = regressor.predict(currentSet)
        predRidgeCV = np.around(predRidgeCV)
        score = np.sqrt(metrics.mean_squared_error(predRidgeCV,currentClass))
#         accuracyScores.append(np.sqrt(metrics.mean_squared_error(currentClass,predRidgeCV)))
        accuracyScores.append((sklearn.metrics.accuracy_score(currentClass,predRidgeCV))*100)
    meanAccuracyScores.append(np.mean(accuracyScores[i]))

print(meanAccuracyScores)

maxAccuracyHyperParameter = meanAccuracyScores.index(max(meanAccuracyScores))
bestHyperParameterL = lambdaValues[maxAccuracyHyperParameter]
print("The Best HyperParameter Lambda is: ",bestHyperParameterL)

### Accuracy Score

In [None]:
import sklearn
from sklearn.linear_model import Ridge

regressor = Ridge(alpha=bestHyperParameterL)

regressor.fit(trainData,trainClassConverted)
predRidge = regressor.predict(testData)

score = np.sqrt(metrics.mean_squared_error(predRidge,testClassConverted))

print("Final Root Mean Square Error: ",(score))

In [None]:
predRidge = np.around(predRidge)
print("Accuracy: ",(sklearn.metrics.accuracy_score(testClassConverted,predRidge))*100," %")

### Precision Score

In [None]:
precision = (sklearn.metrics.precision_recall_fscore_support(testClassConverted,predRidge))
print("===================")
print("Precision | F1 Scores: ")
print("===================")
print(precision)

### Confusion Matrix

In [None]:
confusionMatrix = sklearn.metrics.confusion_matrix(testClassConverted,predRidge)

print("================")
print("Confusion Matrix")
print("================")
print(confusionMatrix)

### Selecting Top 5 Features using Correlation

# Task-B: Multiclass Classification

### Reading Data

In [None]:
MNISTData = pd.DataFrame(pd.read_csv("reduced_mnist.csv"))

In [None]:
from sklearn.model_selection import train_test_split

Y = MNISTData['label']
X = MNISTData.drop(['label'],axis=1).copy()

trainX, testX, trainY, testY = train_test_split(X,Y,
test_size=1/7.0, random_state=0)

In [None]:
print(len(MNISTData))
print(len(list(MNISTData)))
co_df = pd.concat([MNISTData])
with open("reduced_mnist.csv") as fin:
    fin.next()
    total = sum(int(r[1]) for r in csv.reader(fin))

### 2.1Training using OneVsRest Classifier

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

In [None]:
predictionsMNIST = OneVsRestClassifier(LinearSVC(random_state=0)).fit(trainX, trainY).predict(testX)

In [None]:
score = np.sqrt(metrics.mean_squared_error(predictionsMNIST,testY))

In [None]:
print("Root Mean Squared Error: ",score)

### Accuracy Score

In [None]:
print("Accuracy: ",(sklearn.metrics.accuracy_score(testY,predictionsMNIST))*100," %")

### Precision Score

In [None]:
precision = (sklearn.metrics.precision_recall_fscore_support(testY,predictionsMNIST))
print("===================")
print("Precision | F1 Scores: ")
print("===================")
print(precision)

### Confusion Matrix

In [None]:
confusionMatrix = sklearn.metrics.confusion_matrix(testY,predictionsMNIST)

print("================")
print("Confusion Matrix")
print("================")
print(confusionMatrix)

### 2.2 Choosing the best Hyperparameter

In [None]:
import random 
alphaValuesMNIST = [0.1, 1, 3, 10, 33, 100, 333, 1000, 3333, 10000, 33333]
meanAccuracyScores = []
accuracyScores = []
currentAlphaValueMNIST = []
for i in range(0,len(lambdaValues)):
    currentLambdaValue = lambdaValues[i]

    for j in range(0,10):
        rand = random.randint(0,10)
        currentSetMNIST = trainX[rand:rand+200]
        currentClassMNIST = trainY[rand:rand+200]
        regressor = Ridge(alpha=currentLambdaValue)

        regressor.fit(currentSetMNIST,currentClassMNIST)
        predMNISTCV = regressor.predict(currentSetMNIST)
        predMNISTCV = np.around(predMNISTCV)
        score = np.sqrt(metrics.mean_squared_error(predMNISTCV,currentClassMNIST))
#         accuracyScores.append(np.sqrt(metrics.mean_squared_error(currentClassMNIST,predMNISTCV)))
        accuracyScores.append((sklearn.metrics.accuracy_score(currentClassMNIST,predMNISTCV))*100)
    meanAccuracyScores.append(np.mean(accuracyScores[i]))

print(meanAccuracyScores)

maxAccuracyHyperParameter = meanAccuracyScores.index(max(meanAccuracyScores))
bestHyperParameterMNIST = alphaValuesMNIST[maxAccuracyHyperParameter]
print("The Best HyperParameter Lambda is: ",bestHyperParameterMNIST)

### Training over best hyperparameter

In [None]:
from sklearn import linear_model
regressor = linear_model.Ridge (alpha = bestHyperParameterMNIST)
regressor.fit(trainX,np.array(trainY))
predLassoCV = regressor.predict(testX)
predLassoCV = np.around(predLassoCV)
score = np.sqrt(metrics.mean_squared_error(predLassoCV,testY))

In [None]:
print("Mean Squared Error: ",score)

### Plotting average training accuracy vs average validation accuracy

In [None]:
accuracyScoreMNIST = ((sklearn.metrics.accuracy_score(testY,predLassoCV))*100)

In [None]:
import matplotlib.pyplot as plt
plt.scatter(accuracyScoreMNIST,accuracyScores[0],color='red')


In [None]:
plt.scatter(accuracyScoreMNIST,accuracyScores[1],color='blue')

## The model is overfitting as it is showing 100 % accuracy in cross validation set and around 20 % on test set