### $\Large\textbf{Part(a)}$

In [1]:
import pandas as pd

In [2]:
import numpy as np

# Open the file and read its contents
with open('Data_Q1.txt', 'r') as f:
    lines = f.readlines()

# Initialize empty lists to hold the labels and features
labels = []
features = []

# Loop through each line in the file
for line in lines:
    # Split the line into its label and feature components
    parts = line.strip().split(' ')
    label = int(parts[0])
    feature_strs = parts[1:]

    # Initialize an empty dictionary to hold the feature values
    feature_dict = {}

    # Loop through each feature in the line and add it to the dictionary
    for feature_str in feature_strs:
        feature_parts = feature_str.split(':')
        feature_id = int(feature_parts[0])
        feature_val = float(feature_parts[1])
        feature_dict[feature_id] = feature_val

    # Append the label and feature dictionary to their respective lists
    labels.append(label)
    features.append(feature_dict)

# Determine the maximum feature ID in the dataset
max_feature_id = max(max(feature_dict.keys()) for feature_dict in features)

# Initialize a numpy array of zeros to hold the feature values
X = np.zeros((len(features), max_feature_id))

# Loop through each sample and set its feature values in the X array
for i, feature_dict in enumerate(features):
    for feature_id, feature_val in feature_dict.items():
        X[i, feature_id-1] = feature_val

# Convert the labels list to a numpy array
y = np.array(labels)

In [3]:
print(X)

[[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 1. 1.]
 [0. 0. 0. ... 0. 0. 0.]]


In [4]:
print(y)

[ 1 -1 -1 ...  1 -1  1]


In [5]:
X.shape

(4143, 54877)

Since, the dataset is toolarge, we'll apply dimension reduction technique i.e., PCA to reduce the dimension of the dataset.

In [6]:
from sklearn.decomposition import PCA

my_model = PCA(n_components=520)
X_new=my_model.fit_transform(X)

In [7]:
# print (my_model.explained_variance_)
# print (my_model.explained_variance_ratio_)
print (my_model.explained_variance_ratio_.cumsum())

[0.06151249 0.10554605 0.1291997  0.14915661 0.16583007 0.17891754
 0.19130718 0.20264855 0.2128589  0.22274433 0.23229439 0.24062825
 0.2482886  0.25581608 0.26315092 0.26967828 0.27586312 0.28186658
 0.2876791  0.29331067 0.29892204 0.30427433 0.30956145 0.31472194
 0.31966877 0.32443239 0.3289975  0.33352215 0.33797888 0.34224001
 0.34643139 0.35059192 0.3546258  0.35854086 0.36240983 0.36623747
 0.3699218  0.37357193 0.37711375 0.38056736 0.38399015 0.38733268
 0.39064541 0.39388086 0.39708878 0.40021163 0.40328667 0.40633915
 0.40936083 0.41232849 0.41528644 0.41815654 0.42097173 0.42375116
 0.42643715 0.42908486 0.43169565 0.43423838 0.43675617 0.43927008
 0.44175135 0.44418376 0.44661012 0.44897991 0.45132254 0.45365853
 0.45598168 0.45822013 0.46044395 0.46263343 0.46480956 0.46695911
 0.46907389 0.47116247 0.47323623 0.47526996 0.47728671 0.4792883
 0.48127499 0.48324654 0.48520248 0.48714164 0.4890487  0.49094193
 0.49282388 0.49465412 0.49646814 0.4982701  0.50006522 0.50184

In [8]:
X_new

array([[-3.16355559e+00,  1.35449233e+00,  1.22609270e-01, ...,
        -9.90037988e-04, -5.49364913e-03, -2.67328444e-02],
       [ 2.94461102e+00, -1.85303372e+00, -3.42356929e+00, ...,
        -9.31028792e-02, -1.07548775e-01, -3.16967337e-02],
       [-2.42472556e+00,  1.07460981e+00, -9.15473282e-02, ...,
        -2.45097649e-02, -2.26305099e-01,  1.23648802e-01],
       ...,
       [ 8.89478955e-01, -9.01727203e-01,  2.04859232e+00, ...,
        -2.58756973e-01, -4.81303517e-01,  1.05033025e+00],
       [-1.75517906e+00,  5.72070547e-01,  1.17252505e-01, ...,
         9.38607214e-02,  6.72003502e-02, -8.24270882e-02],
       [ 1.22167361e+01,  1.40410531e+01, -6.38559838e-01, ...,
         6.23316654e-02,  1.35446900e-01, -1.84815462e-01]])

In [9]:
df = pd.DataFrame(X_new)

In [10]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,510,511,512,513,514,515,516,517,518,519
0,-3.163556,1.354492,0.122609,0.614129,-0.325644,-0.765679,-0.197722,0.345623,-0.583531,-0.061195,...,0.003820,0.060101,0.020149,0.023405,-0.038304,0.009178,0.000593,-0.000990,-0.005494,-0.026733
1,2.944611,-1.853034,-3.423569,-3.943277,-3.041524,3.619496,-3.783854,-4.853894,-5.449297,0.604696,...,-0.052582,0.082848,-0.001395,0.011221,-0.067226,0.023677,-0.154949,-0.093103,-0.107549,-0.031697
2,-2.424726,1.074610,-0.091547,0.004458,-0.401874,-0.567940,-0.354899,0.011724,-0.074278,0.295060,...,0.044600,-0.070424,-0.009781,0.070656,0.060470,0.023062,0.052919,-0.024510,-0.226305,0.123649
3,-1.785259,0.864062,0.235106,0.025431,0.331659,0.115276,-0.176223,-0.279108,-0.062008,0.453530,...,0.021739,-0.011732,0.001054,0.008765,-0.032580,-0.012291,-0.008854,-0.028305,-0.024816,-0.004646
4,-3.167082,1.337177,0.047686,0.561414,-0.369034,-0.765479,-0.196355,0.298536,-0.632671,-0.046335,...,-0.099963,-0.049586,-0.084686,-0.016154,-0.056588,0.039681,-0.058456,-0.039702,0.032842,0.087111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4138,-1.446324,0.642952,-0.330377,-0.011503,-0.000712,-0.501163,0.366723,0.138004,0.183987,0.713666,...,0.164399,0.415125,0.233824,0.007281,-0.318274,-0.178287,0.031619,0.074725,0.139038,0.362159
4139,-0.368333,0.016102,0.309881,0.342469,-0.103763,-0.023977,0.059648,-0.108008,0.518320,0.090268,...,0.115440,0.166630,-0.113661,0.204370,0.007903,-0.054292,-0.063204,0.054268,-0.054584,0.080094
4140,0.889479,-0.901727,2.048592,0.134772,-0.032090,0.193295,0.155847,-0.452384,1.554877,-0.544347,...,-0.421160,0.471329,0.179395,-0.424239,1.195420,1.178119,-0.078330,-0.258757,-0.481304,1.050330
4141,-1.755179,0.572071,0.117253,0.273678,-0.116884,-0.404077,-0.023015,0.529717,-0.194253,0.237860,...,0.197197,0.154660,0.112264,-0.135992,0.106883,-0.062000,0.174739,0.093861,0.067200,-0.082427


In [11]:
X_new

array([[-3.16355559e+00,  1.35449233e+00,  1.22609270e-01, ...,
        -9.90037988e-04, -5.49364913e-03, -2.67328444e-02],
       [ 2.94461102e+00, -1.85303372e+00, -3.42356929e+00, ...,
        -9.31028792e-02, -1.07548775e-01, -3.16967337e-02],
       [-2.42472556e+00,  1.07460981e+00, -9.15473282e-02, ...,
        -2.45097649e-02, -2.26305099e-01,  1.23648802e-01],
       ...,
       [ 8.89478955e-01, -9.01727203e-01,  2.04859232e+00, ...,
        -2.58756973e-01, -4.81303517e-01,  1.05033025e+00],
       [-1.75517906e+00,  5.72070547e-01,  1.17252505e-01, ...,
         9.38607214e-02,  6.72003502e-02, -8.24270882e-02],
       [ 1.22167361e+01,  1.40410531e+01, -6.38559838e-01, ...,
         6.23316654e-02,  1.35446900e-01, -1.84815462e-01]])

In [12]:
# df = pd.DataFrame(X)

In [13]:
# df

In [14]:
df.insert(0, 'Labels', y)

In [15]:
df.head(10)

Unnamed: 0,Labels,0,1,2,3,4,5,6,7,8,...,510,511,512,513,514,515,516,517,518,519
0,1,-3.163556,1.354492,0.122609,0.614129,-0.325644,-0.765679,-0.197722,0.345623,-0.583531,...,0.00382,0.060101,0.020149,0.023405,-0.038304,0.009178,0.000593,-0.00099,-0.005494,-0.026733
1,-1,2.944611,-1.853034,-3.423569,-3.943277,-3.041524,3.619496,-3.783854,-4.853894,-5.449297,...,-0.052582,0.082848,-0.001395,0.011221,-0.067226,0.023677,-0.154949,-0.093103,-0.107549,-0.031697
2,-1,-2.424726,1.07461,-0.091547,0.004458,-0.401874,-0.56794,-0.354899,0.011724,-0.074278,...,0.0446,-0.070424,-0.009781,0.070656,0.06047,0.023062,0.052919,-0.02451,-0.226305,0.123649
3,-1,-1.785259,0.864062,0.235106,0.025431,0.331659,0.115276,-0.176223,-0.279108,-0.062008,...,0.021739,-0.011732,0.001054,0.008765,-0.03258,-0.012291,-0.008854,-0.028305,-0.024816,-0.004646
4,-1,-3.167082,1.337177,0.047686,0.561414,-0.369034,-0.765479,-0.196355,0.298536,-0.632671,...,-0.099963,-0.049586,-0.084686,-0.016154,-0.056588,0.039681,-0.058456,-0.039702,0.032842,0.087111
5,1,-3.173292,1.359495,0.126549,0.621061,-0.324827,-0.770202,-0.199446,0.347291,-0.585765,...,-0.022517,0.101262,0.015745,0.017342,-0.075561,0.06635,-0.025293,-0.005848,-0.004052,-0.025131
6,-1,-3.151312,1.341245,0.126068,0.557534,-0.350916,-0.762246,-0.144797,0.259317,-0.565195,...,0.011294,0.103082,-0.054937,0.027524,-0.034878,-0.00138,-0.041615,-0.067991,0.042516,0.017005
7,-1,-0.041639,-1.157616,-2.94879,-1.464512,0.698901,-1.345332,-1.400983,2.207661,1.297725,...,-8.8e-05,0.001768,0.036919,0.036221,0.005923,0.002326,0.014178,-0.013862,0.005604,0.000212
8,-1,-1.377654,0.05779,-0.308212,-0.176832,-0.001134,-0.371665,-0.268557,0.48249,-0.062888,...,-0.42807,0.314901,-0.467783,0.160548,-0.035823,0.53393,0.155909,-0.091103,-0.233084,-0.101171
9,1,-0.668844,0.056571,-0.213935,0.383677,-0.565103,0.149276,-0.061215,0.308903,-0.469805,...,0.292289,-0.173074,-0.201735,0.009236,-0.343089,-0.088519,-0.096788,0.141411,0.100367,-0.2447


### $\Large\textbf{Part(b)}$

In [16]:
df['Labels'].value_counts()

 1    2210
-1    1933
Name: Labels, dtype: int64

**No, there is not class imbalance issue. Since, number of samples for class 1 and -1 don't differ so much.**

### $\Large\textbf{Part(c)}$

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(X_new,y ,random_state=104, test_size=0.20, shuffle=True)

In [18]:
train_data = pd.DataFrame(X_train.copy())

In [19]:
train_data.insert(0, 'Labels', y_train)

In [20]:
train_data

Unnamed: 0,Labels,0,1,2,3,4,5,6,7,8,...,510,511,512,513,514,515,516,517,518,519
0,-1,-1.039071,0.009496,0.067575,0.689276,-0.428018,0.409418,0.212412,-0.043705,-0.132766,...,0.320172,-0.280763,-0.226095,-0.152997,-0.260018,-0.221543,-0.114043,-0.144748,-0.132854,-0.063064
1,1,-1.783699,0.152384,-0.003021,0.310478,0.001870,-0.243077,0.362368,0.216989,-0.227199,...,-0.040482,0.024195,0.101638,0.033153,-0.011314,0.290847,0.169626,-0.114721,0.016304,-0.158889
2,1,-2.298032,0.693171,0.316903,0.525067,-0.124575,-0.316613,-0.262199,-0.074170,0.125734,...,-0.038962,-0.076381,-0.042632,0.190299,-0.069709,-0.112635,-0.027490,-0.017056,0.090063,-0.014293
3,1,-0.118541,-0.885698,0.014662,0.354838,-0.208659,-0.068768,-0.194481,0.678728,0.831914,...,-0.289166,-0.272543,-0.184063,0.039644,0.132462,-0.004139,-0.323750,-0.237291,-0.024558,-0.162689
4,-1,-2.338033,0.315084,-1.086959,-1.280238,9.976684,2.205414,0.310788,-0.763790,-0.557822,...,0.011762,-0.024332,-0.101614,0.018798,-0.114098,0.072583,0.079041,0.073613,-0.066852,-0.063503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3309,1,-3.084614,1.602065,0.073332,0.569104,-0.300533,-0.775108,-0.213606,0.434235,-0.689919,...,0.005640,0.013835,-0.023102,0.004181,0.024730,0.033632,0.009606,-0.000594,0.000232,0.009908
3310,1,7.300280,-6.042842,-4.936389,-4.704151,-0.809864,0.317562,-3.335242,-0.284289,0.937901,...,-0.023726,-0.046528,-0.185711,-0.020810,0.256027,0.018287,0.129555,-0.098553,-0.280828,0.030132
3311,1,1.218726,-0.392534,3.061458,0.649521,-0.842342,1.507682,-0.037563,-2.178591,1.595719,...,-0.022117,0.063028,0.023287,-0.007970,-0.012067,0.082583,0.024956,0.010401,-0.089572,0.059250
3312,-1,-2.329459,0.307269,-1.059447,-1.311362,10.081118,2.251876,0.356256,-0.838361,-0.530938,...,0.022253,0.016871,0.140556,-0.052450,0.148332,-0.080015,-0.091612,-0.061432,0.080013,0.090590


In [21]:
test_data = pd.DataFrame(X_test.copy())
test_data.insert(0, 'Labels', y_test)
test_data

Unnamed: 0,Labels,0,1,2,3,4,5,6,7,8,...,510,511,512,513,514,515,516,517,518,519
0,-1,-0.059278,-0.524923,-0.381460,-0.429855,0.576556,0.233927,0.474237,0.717917,0.345152,...,-0.010222,0.192978,0.188560,0.035730,0.086566,0.092732,-0.168813,0.114429,-0.415690,0.012432
1,-1,-0.952151,-0.295575,-1.541324,-1.111016,-0.090549,-0.289403,-0.113607,0.291050,0.506458,...,-0.116827,-0.046791,-0.495561,-0.058068,-0.162131,-0.132096,0.241108,-0.586222,0.340069,-0.046269
2,-1,-1.894287,0.714721,0.596764,0.755272,-0.535002,0.039263,0.086175,-0.023623,-0.017627,...,0.173324,-0.178970,0.029844,-0.043502,0.114961,-0.431774,-0.044987,-0.015400,-0.070218,0.065966
3,1,11.811218,-8.512016,9.051152,-1.469777,2.481459,-5.332863,-2.997072,0.834283,-1.687244,...,-0.184900,0.114462,-0.010357,-0.239816,-0.097504,-0.049876,-0.624349,1.117170,-0.161854,0.175257
4,-1,-2.335408,0.747366,0.452392,0.342893,-0.359686,-0.071687,0.214835,0.134342,-0.259464,...,0.276105,-0.061621,-0.244448,-0.042089,-0.097203,0.039878,-0.255603,-0.215578,0.381308,0.310693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824,-1,2.471277,-1.646650,-2.921166,-4.128645,-1.167553,2.352743,-2.199456,-4.345573,-1.197475,...,0.049071,0.071923,-0.014776,-0.016007,0.011275,0.128003,0.119203,-0.007158,-0.236152,-0.202326
825,1,-3.138283,1.505443,0.074280,0.558178,-0.306863,-0.764891,-0.216171,0.422470,-0.683813,...,-0.012135,0.034605,-0.039965,-0.015127,0.031285,0.028599,-0.008398,-0.004972,-0.031710,0.015532
826,-1,-3.155592,1.318856,0.016208,0.525506,-0.344629,-0.610560,-0.233485,0.193882,-0.843261,...,0.089734,0.035664,0.061349,-0.006594,0.078205,0.043207,0.043982,0.093089,0.027115,-0.093106
827,-1,-1.509192,-0.105267,-0.947249,-0.152706,-0.367935,-0.327515,-0.253428,0.920542,-0.337864,...,-0.092496,0.024295,-0.009437,0.000859,0.000037,-0.106850,0.065515,0.031912,0.050987,0.062513


In [22]:
train_data['Labels'].value_counts()

 1    1788
-1    1526
Name: Labels, dtype: int64

In [23]:
test_data['Labels'].value_counts()

 1    422
-1    407
Name: Labels, dtype: int64

**There is no class imbalance issue in train and test data also i.e., they are having similar class proportions.**

### $\Large\textbf{Part(d)}$

In [24]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge,RidgeCV
from sklearn.linear_model import Lasso,LassoCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

In [25]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### $\Large\textbf{Part(d(i))}$

In [26]:
param_range = [1e-5,1e-4,1e-3,1e-2,1e-1,1,10,100,1000,10000,1e5]

In [27]:
pipeline = make_pipeline(LogisticRegression(solver='lbfgs', penalty='l2', random_state=1))
train_scores, val_scores = validation_curve(estimator=pipeline, X=X_train, y=y_train, cv=5, param_name='logisticregression__C', param_range=param_range)
print('train scores:',train_scores)
print('val scores:',val_scores)

print('Printing more details of scores for each alpha:')
for i in range(len(param_range)):
  print('alpha:', param_range[i])
  print('train scores:', train_scores[i])
  print('val scores:', val_scores[i])
  print('**************************')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

train scores: [[0.5397963  0.53941909 0.53941909 0.53941909 0.53959276]
 [0.75330064 0.7453791  0.74915126 0.74500189 0.75527903]
 [0.8434553  0.83213882 0.83591098 0.83553376 0.83220211]
 [0.89701999 0.89626556 0.90305545 0.90041494 0.90196078]
 [0.93323274 0.93247831 0.93511882 0.93587326 0.9392911 ]
 [0.95699736 0.95548849 0.96001509 0.95435685 0.9566365 ]
 [0.97925311 0.97623538 0.97849868 0.97698982 0.97322775]
 [0.98377971 0.98227084 0.98755187 0.98679743 0.98491704]
 [0.98755187 0.98377971 0.99094681 0.9905696  0.98717949]
 [0.98906073 0.98377971 0.99170124 0.99094681 0.98717949]
 [0.9883063  0.98302527 0.9905696  0.99094681 0.98717949]]
val scores: [[0.53846154 0.53996983 0.53996983 0.53996983 0.53927492]
 [0.70135747 0.73906486 0.78129713 0.7571644  0.74622356]
 [0.78431373 0.826546   0.84917044 0.8280543  0.82628399]
 [0.84766214 0.88386124 0.88235294 0.87028658 0.86404834]
 [0.87933635 0.88687783 0.88536953 0.89140271 0.8776435 ]
 [0.86726998 0.88084465 0.85972851 0.87330317

In [28]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [29]:
accuracy_train=[]
recall_train=[]
precision_train=[]
f1_score_train=[]
specifity_train=[]
senstivity_train=[]
accuracy_test=[]
recall_test=[]
precision_test=[]
f1_score_test=[]
specifity_test=[]
senstivity_test=[]
best_hyper_parameter=[]
def metrics(X_train,y_train,X_test,y_test,y_pred_train,y_pred_test):
  accuracy_train.append(accuracy_score(y_train,y_pred_train))
  recall_train.append(recall_score(y_train,y_pred_train))
  precision_train.append(precision_score(y_train,y_pred_train))
  f1_score_train.append(f1_score(y_train,y_pred_train))
  tn, fp, fn, tp = confusion_matrix(y_train, y_pred_train).ravel()
  specificity1 = tn / (tn+fp)
  senstivity1= tp / (tp + fn)
  specifity_train.append(specificity1)
  senstivity_train.append(senstivity1)
  accuracy_test.append(accuracy_score(y_test,y_pred_test))
  recall_test.append(recall_score(y_test,y_pred_test))
  precision_test.append(precision_score(y_test,y_pred_test))
  f1_score_test.append(f1_score(y_test,y_pred_test))
  tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test).ravel()
  specificity1 = tn / (tn+fp)
  senstivity1= tp / (tp + fn)
  specifity_test.append(specificity1)
  senstivity_test.append(senstivity1)

In [30]:
#print(train_scores.shape)
avg_train_scores = np.mean(train_scores,axis=1)
avg_val_scores = np.mean(val_scores,axis=1)
print('average train scores :',avg_train_scores)
print('average val scores :',avg_val_scores)

average train scores : [0.53952926 0.74962239 0.83584819 0.89974335 0.93519885 0.95669886
 0.97684095 0.98506338 0.98800549 0.9885336  0.98800549]
average val scores : [0.53952919 0.74502149 0.82287369 0.86964225 0.88412599 0.87085162
 0.86602416 0.8681376  0.86783685 0.86813805 0.86693187]


In [31]:
#best alpha

best_alpha = param_range[np.argmax(avg_val_scores)]
print('best alpha from 5 fold CV:',best_alpha)

best alpha from 5 fold CV: 0.1


In [32]:
clf = LogisticRegression(solver='lbfgs', penalty='l2', C = best_alpha, random_state=1)

clf.fit(X_train, y_train)

In [33]:
print(clf.coef_)

[[ 0.38611527  0.42449703  0.87258799  0.8499544  -0.39803513  0.33051075
  -0.00920709  0.43635468 -0.08709481 -0.73244412 -0.29213645 -0.36988026
   1.05602559  0.03983041 -0.11836814 -0.27957495 -0.07205944 -0.3145592
  -0.1164937   0.58150771 -0.0892014  -0.13622313  0.00617414  0.07363905
   0.25489573  0.17305378 -0.16509012 -0.43202701 -0.08411671  0.16569824
  -0.16209135 -0.05067727 -0.35732681 -0.33478028  0.12104278 -0.04484708
   0.50683375 -0.35617284 -0.12495716  0.14418821 -0.02725553 -0.11571081
  -0.13434008 -0.56923299 -0.29313347 -0.01176833  0.16176277  0.36815003
  -0.12631232  0.08221771  0.06598408  0.42966795  0.26788056 -0.05066988
  -0.15925258 -0.15777657  0.0592614  -0.08691383 -0.17810572  0.25645892
  -0.20153491  0.2178147   0.03697238  0.36266681  0.03873862 -0.50413918
  -0.06344263  0.2997841   0.04468871  0.52228421  0.39040998  0.06778493
  -0.02789476  0.310675    0.31583807 -0.44341371 -0.27619357 -0.11513846
  -0.27498844 -0.00282799  0.56321381  

In [34]:
# class_ = [-1, 1]
# for k in class_:
#   print(k)
#   print('class:',k, 'coefficients:', clf.coef_[k])
y_pred_train =clf.predict(X_train)
y_pred_test=clf.predict(X_test)
metrics(X_train,y_train,X_test,y_test,y_pred_train,y_pred_test)

In [35]:
accuracy_train

[0.9330114665057333]

In [36]:
accuracy_test

[0.887816646562123]

### $\Large\textbf{Part(d(iii))}$

In [37]:
pipeline = make_pipeline(LinearSVC(penalty='l2'))
train_scores, val_scores = validation_curve(estimator=pipeline, X=X_train, y=y_train, cv=5, param_name='linearsvc__C', param_range=param_range)
print('train scores:',train_scores)
print('val scores:',val_scores)

print('Printing more details of scores for each alpha:')
for i in range(len(param_range)):
  print('alpha:', param_range[i])
  print('train scores:', train_scores[i])
  print('val scores:', val_scores[i])
  print('**************************')



train scores: [[0.83968314 0.79932101 0.77744247 0.7849868  0.77941176]
 [0.8457186  0.84420973 0.85665786 0.8457186  0.85784314]
 [0.89626556 0.89437948 0.90079215 0.90003772 0.90120664]
 [0.93549604 0.93134666 0.93662769 0.93587326 0.93853695]
 [0.95662014 0.95473406 0.96152395 0.95473406 0.95852187]
 [0.97736703 0.97623538 0.97548095 0.97548095 0.97435897]
 [0.9649189  0.9626556  0.96831384 0.97925311 0.9754902 ]
 [0.95624293 0.94417201 0.93813655 0.94379479 0.96153846]
 [0.94832139 0.95020747 0.92870615 0.94794417 0.9494721 ]
 [0.94945304 0.95209355 0.92870615 0.94228593 0.94155354]
 [0.94907582 0.92606564 0.93247831 0.93096945 0.89555053]]
val scores: [[0.81900452 0.78431373 0.76319759 0.80542986 0.76586103]
 [0.79487179 0.84615385 0.86425339 0.82956259 0.84743202]
 [0.84464555 0.88235294 0.88386124 0.86425339 0.85649547]
 [0.87631976 0.88386124 0.87782805 0.88386124 0.87462236]
 [0.8627451  0.87631976 0.86576169 0.87782805 0.8776435 ]
 [0.85218703 0.8627451  0.85067873 0.88084465



In [38]:
#print(train_scores.shape)
avg_train_scores = np.mean(train_scores,axis=1)
avg_val_scores = np.mean(val_scores,axis=1)
print('average train scores :',avg_train_scores)
print('average val scores :',avg_val_scores)

average train scores : [0.79616904 0.85002959 0.89853631 0.93557612 0.95722682 0.97578466
 0.97012633 0.94877695 0.94493025 0.94281844 0.92682795]
average val scores : [0.78756135 0.83645473 0.86632172 0.87929853 0.87205962 0.86300711
 0.83735652 0.81623354 0.81713579 0.80929037 0.80445745]


In [39]:
#best alpha

best_alpha = param_range[np.argmax(avg_val_scores)]
print('best alpha from 5 fold CV:',best_alpha)

best alpha from 5 fold CV: 0.01


In [40]:
linersvc_l2 =LinearSVC(penalty='l2',C = best_alpha)

linersvc_l2.fit(X_train, y_train)

y_pred_train =linersvc_l2.predict(X_train)
y_pred_test=linersvc_l2.predict(X_test)
metrics(X_train,y_train,X_test,y_test,y_pred_train,y_pred_test)

### $\Large\textbf{Part(d(vii))}$

In [41]:
pipeline = make_pipeline(DecisionTreeClassifier())
param_range = np.linspace(0,0.5,30)
train_scores, val_scores = validation_curve(estimator=pipeline,
                                             X=X_train, y=y_train,
                                             cv=5,
                                              param_name='decisiontreeclassifier__min_weight_fraction_leaf', param_range=param_range)
print('train scores:',train_scores)
print('val scores:',val_scores)

print('Printing more details of scores for each alpha:')
for i in range(len(param_range)):
  print('alpha:', param_range[i])
  print('train scores:', train_scores[i])
  print('val scores:', val_scores[i])
  print('**************************')

train scores: [[0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.85250849 0.84534138 0.85326292 0.84081479 0.83446456]
 [0.8200679  0.81252358 0.83289325 0.81327801 0.80656109]
 [0.79818936 0.78800453 0.80045266 0.79102226 0.78431373]
 [0.77065258 0.76650321 0.78159185 0.77442475 0.76432881]
 [0.76461713 0.75480951 0.75405507 0.76650321 0.76093514]
 [0.75141456 0.74311581 0.73255375 0.74726518 0.74208145]
 [0.75254621 0.74311581 0.72991324 0.74726518 0.73491704]
 [0.71859676 0.74311581 0.72727273 0.74726518 0.73491704]
 [0.73330819 0.73745756 0.72727273 0.70992078 0.73491704]
 [0.73330819 0.73745756 0.72727273 0.72727273 0.73491704]
 [0.72236892 0.73330819 0.72387778 0.71708789 0.72926094]
 [0.72274613 0.72274613 0.70614862 0.71633346 0.70927602]
 [0.71859676 0.7125613  0.70614862 0.70992078 0.70927602]
 [0.71859676 0.7125613  0.70614862 0.70992078 0.70927602]
 [0.71859676 0.7125613  0.70614862 0.70992078 0.70927602]
 [0.71859676 0.7125613  0.70614862 0.70992078 0.70927602]


In [42]:
best_alpha = param_range[np.argmax(avg_val_scores)]
print('best_hyper_parameter from 5 fold CV:',best_alpha)
best_hyper_parameter.append(best_alpha)

best_hyper_parameter from 5 fold CV: 0.05172413793103448


In [43]:
decisiontreeclassifier =DecisionTreeClassifier(min_weight_fraction_leaf=best_alpha)

decisiontreeclassifier.fit(X_train, y_train)
y_pred_train =decisiontreeclassifier.predict(X_train)
y_pred_test=decisiontreeclassifier.predict(X_test)
metrics(X_train,y_train,X_test,y_test,y_pred_train,y_pred_test)


In [44]:
# model3= SVC(probability=True)

# #creating a dictionary of hyper parameters
# parameters=[{"C":[1,10,100,1000], 'penalty': ['l1']}]

# #finding the best score and best hyper parameters using 5 fold cross validation and gridsearchCV for 3 performace measures
# for scr in ['accuracy', 'precision_score', 'recall_score']:
#   grid_search=GridSearchCV(estimator=model3,param_grid=parameters,scoring=scr,cv=5,n_jobs=-1)
#   grid_search=grid_search.fit(X,y)
#   print(f"\nFor metrics: {scr}")
#   print(f"The best score is = {grid_search.best_score_} and best parameters are= {grid_search.best_params_}");

### $\Large\textbf{Part(d(vi))}$

In [45]:
pipeline = make_pipeline(KNeighborsClassifier())
param_range = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
train_scores, val_scores = validation_curve(estimator=pipeline,
                                             X=X_train, y=y_train,
                                             cv=5,
                                              param_name='kneighborsclassifier__n_neighbors', param_range=param_range)
print('train scores:',train_scores)
print('val scores:',val_scores)

print('Printing more details of scores for each alpha:')
for i in range(len(param_range)):
  print('alpha:', param_range[i])
  print('train scores:', train_scores[i])
  print('val scores:', val_scores[i])
  print('**************************')

train scores: [[0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.92342512 0.92078461 0.92417955 0.93059223 0.93514329]
 [0.93021501 0.92870615 0.93172388 0.93096945 0.9321267 ]
 [0.91059977 0.90267823 0.90871369 0.90984534 0.90686275]
 [0.9041871  0.9064504  0.90795926 0.89928329 0.91025641]
 [0.89287061 0.89437948 0.89890607 0.89437948 0.89253394]
 [0.8879668  0.88947567 0.89400226 0.89098453 0.89894419]
 [0.88268578 0.88155413 0.88419464 0.8879668  0.88951735]
 [0.87740475 0.88193135 0.88909845 0.88381743 0.88159879]
 [0.87136929 0.87966805 0.87815919 0.88457186 0.87933635]
 [0.86797435 0.88532629 0.87891362 0.88230856 0.87594268]
 [0.86948321 0.8785364  0.87250094 0.88570351 0.87518854]
 [0.8645794  0.87400981 0.87815919 0.87589589 0.86802413]
 [0.86721992 0.869106   0.87023765 0.87589589 0.8714178 ]
 [0.86382497 0.86721992 0.87514146 0.87589589 0.8627451 ]
 [0.86759713 0.86759713 0.87023765 0.87212373 0.86651584]
 [0.86646548 0.86571105 0.87061486 0.86721992 0.86463047]


In [46]:
#print(train_scores.shape)
avg_train_scores = np.mean(train_scores,axis=1)
avg_val_scores = np.mean(val_scores,axis=1)
print('average train scores :',avg_train_scores)
print('average val scores :',avg_val_scores)

average train scores : [0.99947193 0.92682496 0.93074824 0.90773996 0.90562729 0.89461392
 0.89227469 0.88518374 0.88277015 0.87862095 0.8780931  0.87628252
 0.87213368 0.87077545 0.86896547 0.8688143  0.86692836 0.86557035
 0.86443904 0.86330725]
average val scores : [0.85033789 0.82137041 0.85576638 0.8400751  0.8470128  0.83524901
 0.84399165 0.83977025 0.83886436 0.83343586 0.83252997 0.82740177
 0.82709874 0.82770525 0.83283254 0.83102396 0.83524445 0.83464341
 0.83735515 0.83192483]


In [47]:
#best alpha
best_alpha = param_range[np.argmax(avg_val_scores)]
print('best alpha from 10 fold CV:',best_alpha)

best alpha from 10 fold CV: 3


In [48]:
knn =KNeighborsClassifier(n_neighbors=best_alpha)

knn.fit(X_train, y_train)
y_pred_train =knn.predict(X_train)
y_pred_test=knn.predict(X_test)
metrics(X_train,y_train,X_test,y_test,y_pred_train,y_pred_test)


### $\Large\textbf{Part(d(ii))}$

In [49]:
pipeline = make_pipeline(LogisticRegression(solver='saga', penalty='l1', random_state=1))
train_scores, val_scores = validation_curve(estimator=pipeline, X=X_train, y=y_train, cv=5, param_name='logisticregression__C', param_range=param_range)
print('train scores:',train_scores)
print('val scores:',val_scores)

print('Printing more details of scores for each alpha:')
for i in range(len(param_range)):
  print('alpha:', param_range[i])
  print('train scores:', train_scores[i])
  print('val scores:', val_scores[i])
  print('**************************')



train scores: [[0.93323274 0.93210109 0.93247831 0.93511882 0.9392911 ]
 [0.93436439 0.93360996 0.93625047 0.93738212 0.94042232]
 [0.93587326 0.93436439 0.93813655 0.93775934 0.94193062]
 [0.93549604 0.93474161 0.9370049  0.93775934 0.94230769]
 [0.93511882 0.93436439 0.9370049  0.93775934 0.94306184]
 [0.93511882 0.93511882 0.9370049  0.93775934 0.94306184]
 [0.93511882 0.93511882 0.9370049  0.93775934 0.94306184]
 [0.93549604 0.93511882 0.9370049  0.93775934 0.94268477]
 [0.93511882 0.93511882 0.9370049  0.93775934 0.94193062]
 [0.93511882 0.93511882 0.9370049  0.93775934 0.94193062]
 [0.93511882 0.93511882 0.9370049  0.93813655 0.94230769]
 [0.93511882 0.93511882 0.9370049  0.93813655 0.94230769]
 [0.93511882 0.93511882 0.9370049  0.93813655 0.94230769]
 [0.93511882 0.93511882 0.9370049  0.93813655 0.94230769]
 [0.93511882 0.93511882 0.9370049  0.93813655 0.94230769]
 [0.93511882 0.93511882 0.9370049  0.93813655 0.94230769]
 [0.93511882 0.93511882 0.9370049  0.93813655 0.94230769]




In [50]:
#print(train_scores.shape)
avg_train_scores = np.mean(train_scores,axis=1)
avg_val_scores = np.mean(val_scores,axis=1)
print('average train scores :',avg_train_scores)
print('average val scores :',avg_val_scores)

average train scores : [0.93444441 0.93640585 0.93761283 0.93746192 0.93746186 0.93761275
 0.93761275 0.93761277 0.9373865  0.9373865  0.93753736 0.93753736
 0.93753736 0.93753736 0.93753736 0.93753736 0.93753736 0.93753736
 0.93753736 0.93753736]
average val scores : [0.88231512 0.88412553 0.88533353 0.88563565 0.88533399 0.88533399
 0.88503187 0.88503187 0.88503187 0.88533353 0.88533353 0.88533353
 0.88533353 0.88533353 0.88503187 0.88503187 0.88503187 0.88533399
 0.88533399 0.88533399]


In [51]:
#best alpha

best_alpha = param_range[np.argmax(avg_val_scores)]
print('best alpha from 5 fold CV:',best_alpha)

best alpha from 5 fold CV: 4


In [52]:
clf = LogisticRegression(solver='lbfgs', penalty='l2', C = best_alpha, random_state=1)

clf.fit(X_train, y_train)
y_pred_train =clf.predict(X_train)
y_pred_test=clf.predict(X_test)
metrics(X_train,y_train,X_test,y_test,y_pred_train,y_pred_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### $\Large\textbf{Part(d(v))}$

In [53]:
pipeline = make_pipeline(SVC(kernel='rbf'))
param_range = [1e-2,1e-1,1,10,100,1000]
train_scores, val_scores = validation_curve(estimator=pipeline,
                                             X=X_train, y=y_train,
                                             cv=5,
                                              param_name='svc__C', param_range=param_range)
print('train scores:',train_scores)
print('val scores:',val_scores)

print('Printing more details of scores for each alpha:')
for i in range(len(param_range)):
  print('alpha:', param_range[i])
  print('train scores:', train_scores[i])
  print('val scores:', val_scores[i])
  print('**************************')

train scores: [[0.5397963  0.53941909 0.53941909 0.53941909 0.53959276]
 [0.82648057 0.81403244 0.80724255 0.81214636 0.81221719]
 [0.92606564 0.9275745  0.92946058 0.92682007 0.92948718]
 [0.97019992 0.97095436 0.97170879 0.97057714 0.97473605]
 [0.98528857 0.98302527 0.98566579 0.98528857 0.98604827]
 [0.99471897 0.99547341 0.99622784 0.99585062 0.99585219]]
val scores: [[0.53846154 0.53996983 0.53996983 0.53996983 0.53927492]
 [0.76772247 0.79638009 0.82503771 0.81447964 0.79154079]
 [0.8627451  0.89140271 0.87330317 0.87330317 0.87009063]
 [0.88989442 0.8974359  0.87631976 0.89140271 0.88217523]
 [0.87330317 0.87631976 0.87028658 0.88989442 0.86858006]
 [0.85822021 0.86726998 0.85822021 0.87933635 0.85800604]]
Printing more details of scores for each alpha:
alpha: 0.01
train scores: [0.5397963  0.53941909 0.53941909 0.53941909 0.53959276]
val scores: [0.53846154 0.53996983 0.53996983 0.53996983 0.53927492]
**************************
alpha: 0.1
train scores: [0.82648057 0.81403244 0

In [54]:
#print(train_scores.shape)
avg_train_scores = np.mean(train_scores,axis=1)
avg_val_scores = np.mean(val_scores,axis=1)
print('average train scores :',avg_train_scores)
print('average val scores :',avg_val_scores)

average train scores : [0.53952926 0.81442382 0.92788159 0.97163525 0.98506329 0.99562461]
average val scores : [0.53952919 0.79903214 0.87416896 0.8874456  0.8756768  0.86421056]


In [55]:
#best alpha

best_alpha = param_range[np.argmax(avg_val_scores)]
print('best alpha from 10 fold CV:',best_alpha)

best alpha from 10 fold CV: 10


In [56]:
svc_kernel =SVC(kernel='rbf', C = best_alpha)

svc_kernel.fit(X_train, y_train)

y_pred_train =svc_kernel.predict(X_train)
y_pred_test=svc_kernel.predict(X_test)
metrics(X_train,y_train,X_test,y_test,y_pred_train,y_pred_test)

### $\Large\textbf{Part(d(viii))}$

In [57]:
pipeline = make_pipeline(RandomForestClassifier())
param_range = [10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,400,450,500,550,600,650,700,750,800,850,900,950,1000,1050,1100,1150,1200,1300,1400,1500,1600,1700,1800,1900,2000,2500,3000,3500,4000,4500,5000]
train_scores, val_scores = validation_curve(estimator=pipeline,
                                             X=X_train, y=y_train,
                                             cv=5,
                                              param_name='randomforestclassifier__n_estimators', param_range=param_range)
print('train scores:',train_scores)
print('val scores:',val_scores)

print('Printing more details of scores for each alpha:')
for i in range(len(param_range)):
  print('alpha:', param_range[i])
  print('train scores:', train_scores[i])
  print('val scores:', val_scores[i])
  print('**************************')

train scores: [[0.99396454 0.99396454 0.99547341 0.99585062 0.99660633]
 [0.99849114 0.99849114 0.99849114 0.99962278 0.99924585]
 [0.99886835 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]
 [0.99924557 0.99924557 0.99962278 0.99962278 0.99962293]


In [58]:
avg_train_scores = np.mean(train_scores,axis=1)
avg_val_scores = np.mean(val_scores,axis=1)
print('average train scores :',avg_train_scores)
print('average val scores :',avg_val_scores)

average train scores : [0.99517189 0.99886841 0.99939648 0.99947193 0.99947193 0.99947193
 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193
 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193
 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193
 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193
 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193
 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193
 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193
 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193
 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193
 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193 0.99947193]
average val scores : [0.82257613 0.83917604 0.85274934 0.85999098 0.86059384 0.86210715
 0.85486687 0.86572432 0.86149699 0.86814079 0.86874046 0.87085709
 0.87085253 0.87447426 0.87327081 0.8714604  0.87296597 0.86753428
 0.87537787 0.870

In [59]:
best_alpha = param_range[np.argmax(avg_val_scores)]
print('best_hyper_parameter from 5 fold CV:',best_alpha)
best_hyper_parameter.append(best_alpha)

best_hyper_parameter from 5 fold CV: 4500


In [60]:
randomforest =RandomForestClassifier(n_estimators=best_alpha)

randomforest.fit(X_train, y_train)
y_pred_train =randomforest.predict(X_train)
y_pred_test=randomforest.predict(X_test)
metrics(X_train,y_train,X_test,y_test,y_pred_train,y_pred_test)

# *Sequence in which I ran the parts of (d): (i), (iii), (vii), (vi), (ii), (v), (viii), (iv) and the metrics are stored in the list in this sequence only.*

### $\Large\textbf{Part(e)}$

In [61]:
from tabulate import tabulate

In [62]:
#creating table for the chi-square test data
all_data = [['Accuracy_train', 'Precision_train', 'Recall_train', 'Specificity_train', 'Sensitivity_train', 'Accuracy_test', 'Precision_test', 'Recall_test', 'Specificity_test', 'Sensitivity_test']]
for i in  range(len(accuracy_train)):
    all_data.append([accuracy_train[i] , precision_train[i] , recall_train[i] , specifity_train[i], senstivity_train[i],  accuracy_test[i] , precision_test[i] ,  recall_test[i] , specifity_test[i], senstivity_test[i]])
#all_data.append(['Total', '100', '100', '13.2'])    
print(tabulate(all_data, headers = "firstrow",tablefmt="grid"))

+------------------+-------------------+----------------+---------------------+---------------------+-----------------+------------------+---------------+--------------------+--------------------+
|   Accuracy_train |   Precision_train |   Recall_train |   Specificity_train |   Sensitivity_train |   Accuracy_test |   Precision_test |   Recall_test |   Specificity_test |   Sensitivity_test |
|         0.933011 |          0.920516 |       0.958613 |            0.903014 |            0.958613 |        0.887817 |         0.869663 |      0.917062 |           0.857494 |           0.917062 |
+------------------+-------------------+----------------+---------------------+---------------------+-----------------+------------------+---------------+--------------------+--------------------+
|         0.931201 |          0.917559 |       0.958613 |            0.899083 |            0.958613 |        0.881785 |         0.864865 |      0.909953 |           0.85258  |           0.909953 |
+--------------

**Observations:**

For training data:

* Every metric is highest for random forest model.
* All the metrics for decision tree coming out to be the lowest
* Recall is same for Logistic with L2 regularizer and SVM with L2 regularizer.

For testing data:

* Accuracy is almost same for Logistic and SVM with L2 regularizers, random forest and kernel SVM with rbf kernel.
* Precision is highest for Logistic with L1 regularizer and lowest for decision tree.
* Recall is highest for random forest and almost same for Logistic with L2 regularizer and SVM with L2 regularizer.
* Sensitivity is highest for random forest.
* Specificity is almost same for Logistic with L2 regularizer and SVM with L2 and kernel SVM with rbf kernel and highest for Logistic with L1 regularizer.

### $\Large\textbf{Part(f)}$

In logistic regression and SVM, L1 regularization can lead to sparse models, where many of the features are assigned zero weights, effectively reducing the number of features used in the model. This sparsity can be desirable in some cases, such as when dealing with high-dimensional data with many irrelevant features. L2 regularization, on the other hand, tends to distribute the weight values more evenly across all the features, resulting in dense models that use all the available features.

### $\Large\textbf{Part(d(iv))}$

In [63]:
param = [1,10,100,1000]

In [64]:
pipeline = make_pipeline(LinearSVC(penalty='l1'))
train_scores, val_scores = validation_curve(estimator=pipeline, X=X_train, y=y_train, cv=5, param_name='linearsvc__C', param_range=param)
print('train scores:',train_scores)
print('val scores:',val_scores)

print('Printing more details of scores for each alpha:')
for i in range(len(param)):
  print('alpha:', param[i])
  print('train scores:', train_scores[i])
  print('val scores:', val_scores[i])
  print('**************************')

train scores: [[nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]]
val scores: [[nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]]
Printing more details of scores for each alpha:
alpha: 1
train scores: [nan nan nan nan nan]
val scores: [nan nan nan nan nan]
**************************
alpha: 10
train scores: [nan nan nan nan nan]
val scores: [nan nan nan nan nan]
**************************
alpha: 100
train scores: [nan nan nan nan nan]
val scores: [nan nan nan nan nan]
**************************
alpha: 1000
train scores: [nan nan nan nan nan]
val scores: [nan nan nan nan nan]
**************************


In [65]:
#print(train_scores.shape)
avg_train_scores = np.mean(train_scores,axis=1)
avg_val_scores = np.mean(val_scores,axis=1)
print('average train scores :',avg_train_scores)
print('average val scores :',avg_val_scores)

average train scores : [nan nan nan nan]
average val scores : [nan nan nan nan]


In [66]:
#best alpha

best_alpha = param_range[np.argmax(avg_val_scores)]
print('best alpha from 5 fold CV:',best_alpha)

best alpha from 5 fold CV: 10


In [67]:
linersvc_l1 =LinearSVC(penalty='l1',C = best_alpha)

linersvc_l1.fit(X_train, y_train)

y_pred_train =linersvc_l1.predict(X_train)
y_pred_test=linersvc_l1.predict(X_test)
metrics(X_train,y_train,X_test,y_test,y_pred_train,y_pred_test)

ValueError: ignored

Soft margin SVM wiht L1 regularizers ended up with nan values.