In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
%matplotlib inline

### Data Loading

In [2]:
data=pd.read_csv("csv_result-KDDTrain+.csv")
data

# https://github.com/jmnwong/NSL-KDD-Dataset

Unnamed: 0,id,'duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent',...,'dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate','class'
0,1,0,tcp,ftp_data,SF,491,0,0,0,0,...,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,2,0,udp,other,SF,146,0,0,0,0,...,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,3,0,tcp,private,S0,0,0,0,0,0,...,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
3,4,0,tcp,http,SF,232,8153,0,0,0,...,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,5,0,tcp,http,SF,199,420,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,125969,0,tcp,private,S0,0,0,0,0,0,...,25,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
125969,125970,8,udp,private,SF,105,145,0,0,0,...,244,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,normal
125970,125971,0,tcp,smtp,SF,2231,384,0,0,0,...,30,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,normal
125971,125972,0,tcp,klogin,S0,0,0,0,0,0,...,8,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly


In [3]:
data.columns.to_list()

['id',
 "'duration'",
 "'protocol_type'",
 "'service'",
 "'flag'",
 "'src_bytes'",
 "'dst_bytes'",
 "'land'",
 "'wrong_fragment'",
 "'urgent'",
 "'hot'",
 "'num_failed_logins'",
 "'logged_in'",
 "'num_compromised'",
 "'root_shell'",
 "'su_attempted'",
 "'num_root'",
 "'num_file_creations'",
 "'num_shells'",
 "'num_access_files'",
 "'num_outbound_cmds'",
 "'is_host_login'",
 "'is_guest_login'",
 "'count'",
 "'srv_count'",
 "'serror_rate'",
 "'srv_serror_rate'",
 "'rerror_rate'",
 "'srv_rerror_rate'",
 "'same_srv_rate'",
 "'diff_srv_rate'",
 "'srv_diff_host_rate'",
 "'dst_host_count'",
 "'dst_host_srv_count'",
 "'dst_host_same_srv_rate'",
 "'dst_host_diff_srv_rate'",
 "'dst_host_same_src_port_rate'",
 "'dst_host_srv_diff_host_rate'",
 "'dst_host_serror_rate'",
 "'dst_host_srv_serror_rate'",
 "'dst_host_rerror_rate'",
 "'dst_host_srv_rerror_rate'",
 "'class'"]

In [4]:
data.dtypes

# we need to one-hot encode 'protocol_type', 'service' , 'flag' , 'class'

id                                 int64
'duration'                         int64
'protocol_type'                   object
'service'                         object
'flag'                            object
'src_bytes'                        int64
'dst_bytes'                        int64
'land'                             int64
'wrong_fragment'                   int64
'urgent'                           int64
'hot'                              int64
'num_failed_logins'                int64
'logged_in'                        int64
'num_compromised'                  int64
'root_shell'                       int64
'su_attempted'                     int64
'num_root'                         int64
'num_file_creations'               int64
'num_shells'                       int64
'num_access_files'                 int64
'num_outbound_cmds'                int64
'is_host_login'                    int64
'is_guest_login'                   int64
'count'                            int64
'srv_count'     

In [5]:
data["'protocol_type'"].value_counts() # nominal attribute hence one-hot

tcp     102689
udp      14993
icmp      8291
Name: 'protocol_type', dtype: int64

In [6]:
data["'service'"].value_counts() 

http         40338
private      21853
domain_u      9043
smtp          7313
ftp_data      6860
             ...  
tftp_u           3
aol              2
harvest          2
http_8001        2
http_2784        1
Name: 'service', Length: 70, dtype: int64

In [7]:
data["'flag'"].value_counts() 

SF        74945
S0        34851
REJ       11233
RSTR       2421
RSTO       1562
S1          365
SH          271
S2          127
RSTOS0      103
S3           49
OTH          46
Name: 'flag', dtype: int64

In [8]:
data["'class'"].value_counts() 

normal     67343
anomaly    58630
Name: 'class', dtype: int64

In [9]:
X=data.drop("'class'",axis=1)

In [10]:
X

Unnamed: 0,id,'duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent',...,'dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate'
0,1,0,tcp,ftp_data,SF,491,0,0,0,0,...,150,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00
1,2,0,udp,other,SF,146,0,0,0,0,...,255,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00
2,3,0,tcp,private,S0,0,0,0,0,0,...,255,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00
3,4,0,tcp,http,SF,232,8153,0,0,0,...,30,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01
4,5,0,tcp,http,SF,199,420,0,0,0,...,255,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,125969,0,tcp,private,S0,0,0,0,0,0,...,255,25,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00
125969,125970,8,udp,private,SF,105,145,0,0,0,...,255,244,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00
125970,125971,0,tcp,smtp,SF,2231,384,0,0,0,...,255,30,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00
125971,125972,0,tcp,klogin,S0,0,0,0,0,0,...,255,8,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00


In [11]:
for item in data.columns.to_list():
    print("######    "+item+"    ######\n\n")
    print(data[item].value_counts())
    print("\n\n#####################################\n\n")

######    id    ######


2049     1
6870     1
2772     1
13011    1
15058    1
        ..
1322     1
7465     1
5416     1
27943    1
2047     1
Name: id, Length: 125973, dtype: int64


#####################################


######    'duration'    ######


0        115955
1          1989
2           843
3           557
4           351
          ...  
23132         1
2774          1
727           1
13073         1
6141          1
Name: 'duration', Length: 2981, dtype: int64


#####################################


######    'protocol_type'    ######


tcp     102689
udp      14993
icmp      8291
Name: 'protocol_type', dtype: int64


#####################################


######    'service'    ######


http         40338
private      21853
domain_u      9043
smtp          7313
ftp_data      6860
             ...  
tftp_u           3
aol              2
harvest          2
http_8001        2
http_2784        1
Name: 'service', Length: 70, dtype: int64


###############################

In [12]:
# remove 'land' 'num_shells' 'num_outbound_cmds' 'is_host_login'  columns

# highly imbalanced data distribution and not much relevance to class

X = X.drop("'land'",axis=1)
X = X.drop("'num_shells'",axis=1)
X = X.drop("'num_outbound_cmds'",axis=1)
X = X.drop("'is_host_login'",axis=1)
X = X.drop('id',axis=1)


In [13]:
Y=data["'class'"]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [15]:
X_train

Unnamed: 0,'duration','protocol_type','service','flag','src_bytes','dst_bytes','wrong_fragment','urgent','hot','num_failed_logins',...,'dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate'
18593,41476,tcp,private,RSTR,1,0,0,0,0,0,...,246,2,0.01,0.26,0.51,0.00,0.00,0.00,0.51,1.0
98800,0,icmp,eco_i,SF,8,0,0,0,0,0,...,2,50,1.00,0.00,1.00,0.50,0.00,0.00,0.00,0.0
60094,0,tcp,klogin,S0,0,0,0,0,0,0,...,255,6,0.02,0.06,0.00,0.00,1.00,1.00,0.00,0.0
42119,510,udp,other,SF,145,105,0,0,0,0,...,255,1,0.00,0.54,0.91,0.00,0.00,0.00,0.00,0.0
82618,0,tcp,http,SF,241,11683,0,0,0,0,...,255,255,1.00,0.00,0.00,0.00,0.01,0.01,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,0,tcp,echo,RSTO,0,0,0,0,0,0,...,255,4,0.02,0.09,0.00,0.00,0.00,0.00,1.00,1.0
103694,0,tcp,telnet,S0,0,0,0,0,0,0,...,255,4,0.02,0.07,0.01,0.00,1.00,1.00,0.00,0.0
860,0,tcp,http,REJ,0,0,0,0,0,0,...,255,6,0.02,0.07,0.00,0.00,0.00,0.00,1.00,1.0
15795,0,tcp,http,SF,309,4281,0,0,0,0,...,21,255,1.00,0.00,0.05,0.05,0.00,0.00,0.00,0.0


In [16]:
categorical_cols  = ["'num_failed_logins'", "'logged_in'", "'root_shell'", "'su_attempted'", "'num_root'", "'urgent'", 
                     "'is_guest_login'", "'wrong_fragment'", "'num_file_creations'", "'num_access_files'", "'num_compromised'"]

In [17]:
X_train.columns

Index([''duration'', ''protocol_type'', ''service'', ''flag'', ''src_bytes'',
       ''dst_bytes'', ''wrong_fragment'', ''urgent'', ''hot'',
       ''num_failed_logins'', ''logged_in'', ''num_compromised'',
       ''root_shell'', ''su_attempted'', ''num_root'', ''num_file_creations'',
       ''num_access_files'', ''is_guest_login'', ''count'', ''srv_count'',
       ''serror_rate'', ''srv_serror_rate'', ''rerror_rate'',
       ''srv_rerror_rate'', ''same_srv_rate'', ''diff_srv_rate'',
       ''srv_diff_host_rate'', ''dst_host_count'', ''dst_host_srv_count'',
       ''dst_host_same_srv_rate'', ''dst_host_diff_srv_rate'',
       ''dst_host_same_src_port_rate'', ''dst_host_srv_diff_host_rate'',
       ''dst_host_serror_rate'', ''dst_host_srv_serror_rate'',
       ''dst_host_rerror_rate'', ''dst_host_srv_rerror_rate''],
      dtype='object')

In [18]:
# Columns to standard scale 

X_train_cols = X_train.columns.to_list()

cols_to_scale = list(filter(lambda x: x not in categorical_cols, X_train_cols))

### Preprocessing

In [19]:
# What all are categorical features?
# What all are continuous?


In [20]:
# remove these categorical features from the list of columns to scale, we will encode them separately
cols_to_scale.remove("'flag'")
cols_to_scale.remove("'protocol_type'")
cols_to_scale.remove("'service'")

### Feature Scaling ( Continuous attributes )

In [26]:
from sklearn.preprocessing import StandardScaler
scaled_features=X_train.copy()

scaled_features=scaled_features.drop(cols_to_one_hot_encode, axis=1)

features=scaled_features[cols_to_scale]
features

Unnamed: 0,'duration','src_bytes','dst_bytes','hot','count','srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate',...,'dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate'
18593,41476,1,0,0,2,2,0.0,0.0,1.0,1.0,...,246,2,0.01,0.26,0.51,0.00,0.00,0.00,0.51,1.0
98800,0,8,0,0,1,14,0.0,0.0,0.0,0.0,...,2,50,1.00,0.00,1.00,0.50,0.00,0.00,0.00,0.0
60094,0,0,0,0,145,12,1.0,1.0,0.0,0.0,...,255,6,0.02,0.06,0.00,0.00,1.00,1.00,0.00,0.0
42119,510,145,105,0,1,1,0.0,0.0,0.0,0.0,...,255,1,0.00,0.54,0.91,0.00,0.00,0.00,0.00,0.0
82618,0,241,11683,0,4,4,0.0,0.0,0.0,0.0,...,255,255,1.00,0.00,0.00,0.00,0.01,0.01,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,0,0,0,0,258,4,0.0,0.0,1.0,1.0,...,255,4,0.02,0.09,0.00,0.00,0.00,0.00,1.00,1.0
103694,0,0,0,0,24,4,1.0,1.0,0.0,0.0,...,255,4,0.02,0.07,0.01,0.00,1.00,1.00,0.00,0.0
860,0,0,0,0,258,6,0.0,0.0,1.0,1.0,...,255,6,0.02,0.07,0.00,0.00,0.00,0.00,1.00,1.0
15795,0,309,4281,0,5,5,0.0,0.0,0.0,0.0,...,21,255,1.00,0.00,0.05,0.05,0.00,0.00,0.00,0.0


In [27]:
scaler=StandardScaler().fit(features.values)
features=scaler.transform(features.values)

In [28]:
features

array([[ 1.58642664e+01, -7.23107531e-03, -5.70042220e-03, ...,
        -6.24591651e-01,  1.27591758e+00,  2.75243669e+00],
       [-1.09548529e-01, -7.22998264e-03, -5.70042220e-03, ...,
        -6.24591651e-01, -3.87975904e-01, -3.76775272e-01],
       [-1.09548529e-01, -7.23123141e-03, -5.70042220e-03, ...,
         1.61950482e+00, -3.87975904e-01, -3.76775272e-01],
       ...,
       [-1.09548529e-01, -7.23123141e-03, -5.70042220e-03, ...,
        -6.24591651e-01,  2.87456034e+00,  2.75243669e+00],
       [-1.09548529e-01, -7.18299759e-03, -4.82896076e-03, ...,
        -6.24591651e-01, -3.87975904e-01, -3.76775272e-01],
       [-1.09548529e-01, -7.22451926e-03, -5.68617266e-03, ...,
        -6.24591651e-01, -3.87975904e-01, -3.76775272e-01]])

In [29]:
cols_to_scale

["'duration'",
 "'src_bytes'",
 "'dst_bytes'",
 "'hot'",
 "'count'",
 "'srv_count'",
 "'serror_rate'",
 "'srv_serror_rate'",
 "'rerror_rate'",
 "'srv_rerror_rate'",
 "'same_srv_rate'",
 "'diff_srv_rate'",
 "'srv_diff_host_rate'",
 "'dst_host_count'",
 "'dst_host_srv_count'",
 "'dst_host_same_srv_rate'",
 "'dst_host_diff_srv_rate'",
 "'dst_host_same_src_port_rate'",
 "'dst_host_srv_diff_host_rate'",
 "'dst_host_serror_rate'",
 "'dst_host_srv_serror_rate'",
 "'dst_host_rerror_rate'",
 "'dst_host_srv_rerror_rate'"]

In [30]:
scaled_features[cols_to_scale]=features
scaled_features

Unnamed: 0,'duration','src_bytes','dst_bytes','wrong_fragment','urgent','hot','num_failed_logins','logged_in','num_compromised','root_shell',...,'dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate'
18593,15.864266,-0.007231,-0.005700,0,0,-0.094085,0,0,0,0,...,0.641988,-1.028222,-1.138794,0.943221,1.180780,-0.288058,-0.639300,-0.624592,1.275918,2.752437
98800,-0.109549,-0.007230,-0.005700,0,0,-0.094085,0,0,0,0,...,-1.822315,-0.594724,1.066119,-0.438730,2.775296,4.156908,-0.639300,-0.624592,-0.387976,-0.376775
60094,-0.109549,-0.007231,-0.005700,0,0,-0.094085,0,0,0,0,...,0.732885,-0.992098,-1.116522,-0.119818,-0.478817,-0.288058,1.608890,1.619505,-0.387976,-0.376775
42119,0.086870,-0.007209,-0.005679,0,0,-0.094085,0,0,0,0,...,0.732885,-1.037254,-1.161066,2.431476,2.482426,-0.288058,-0.639300,-0.624592,-0.387976,-0.376775
82618,-0.109549,-0.007194,-0.003322,0,0,-0.094085,0,1,0,0,...,0.732885,1.256673,1.066119,-0.438730,-0.478817,-0.288058,-0.616818,-0.602151,-0.387976,-0.376775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,-0.109549,-0.007231,-0.005700,0,0,-0.094085,0,0,0,0,...,0.732885,-1.010160,-1.116522,0.039637,-0.478817,-0.288058,-0.639300,-0.624592,2.874560,2.752437
103694,-0.109549,-0.007231,-0.005700,0,0,-0.094085,0,0,0,0,...,0.732885,-1.010160,-1.116522,-0.066667,-0.446276,-0.288058,1.608890,1.619505,-0.387976,-0.376775
860,-0.109549,-0.007231,-0.005700,0,0,-0.094085,0,0,0,0,...,0.732885,-0.992098,-1.116522,-0.066667,-0.478817,-0.288058,-0.639300,-0.624592,2.874560,2.752437
15795,-0.109549,-0.007183,-0.004829,0,0,-0.094085,0,1,0,0,...,-1.630423,1.256673,1.066119,-0.438730,-0.316112,0.156439,-0.639300,-0.624592,-0.387976,-0.376775


### One hot encoding the categorical attributes

In [None]:
clean_data = X_train.copy();
columns_to_drop = [col for col in clean_data.columns if col not in cols_to_one_hot_encode]
clean_data = clean_data.drop(columns_to_drop, axis =1)

In [None]:
clean_data

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

cols_to_one_hot_encode = ["'protocol_type'", "'service'" , "'flag'"]

column_trans=make_column_transformer(
(OneHotEncoder(),cols_to_one_hot_encode),remainder='passthrough')

In [None]:
# concatenating back the categorical features and the standard-scaled continuous ones
result = pd.concat([scaled_features, clean_data], axis=1)

In [33]:
# applying one hot encoding on the final concate'd data
clean_data=column_trans.fit_transform(result)

In [36]:
clean_data.shape

(84401, 115)

In [40]:
y_train = pd.DataFrame(y_train)

In [41]:
y_train

Unnamed: 0,'class'
18593,anomaly
98800,anomaly
60094,anomaly
42119,normal
82618,normal
...,...
119879,anomaly
103694,anomaly
860,anomaly
15795,normal


In [42]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

y_train["'class'"] = label_encoder.fit_transform(y_train["'class'"])

print(y_train)

        'class'
18593         0
98800         0
60094         0
42119         1
82618         1
...         ...
119879        0
103694        0
860           0
15795         1
121958        1

[84401 rows x 1 columns]


In [53]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression(max_iter=1000,random_state=42, solver='liblinear')
ovr_clf=OneVsRestClassifier(logreg)
ovr_clf.fit(clean_data,y_train)

OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000, random_state=42,
                                                 solver='liblinear'))

In [54]:
from sklearn.model_selection import cross_val_score
scores=cross_val_score(ovr_clf,clean_data,y_train,scoring="accuracy",cv=10)

In [55]:
print("scores are:",scores)
print("mean:" ,scores.mean())

scores are: [0.97429215 0.97630332 0.97227488 0.97594787 0.97440758 0.97393365
 0.97511848 0.97440758 0.9728673  0.97310427]
mean: 0.9742657074390346


In [392]:
### DO NOT RUN YET!

cols=['Temperature (K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)',
       'Absolute magnitude(Mv)']
temp_xtest=X_test.copy()
test_f=temp_xtest[cols]
feat=scaler.transform(test_f.values)
temp_xtest[cols]=feat
X_test_prep=column_trans.transform(temp_xtest)

In [393]:
# confusion matrix 
# find precision recall FPR etc.
y_pred=ovr_clf.predict(X_test_prep)
from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score
print("average=MACRO")
print("f1 score: ",f1_score(y_test,y_pred,average='macro'))
print("precision: ",precision_score(y_test,y_pred,average="macro"))
print("recall: ",recall_score(y_test,y_pred,average='macro'))
print("-----------------------------------------")
print("average=weighted")
print("f1 score: ",f1_score(y_test,y_pred,average='weighted'))
print("precision: ",precision_score(y_test,y_pred,average="weighted"))
print("recall: ",recall_score(y_test,y_pred,average='weighted'))
print("-----------------------------------------")
print("average=micro")
print("f1 score: ",f1_score(y_test,y_pred,average='micro'))
print("precision: ",precision_score(y_test,y_pred,average="micro"))
print("recall: ",recall_score(y_test,y_pred,average='micro'))


average=MACRO
f1 score:  0.9853968253968254
precision:  0.9871794871794872
recall:  0.9848484848484849
-----------------------------------------
average=weighted
f1 score:  0.987452380952381
precision:  0.9884615384615385
recall:  0.9875
-----------------------------------------
average=micro
f1 score:  0.9875
precision:  0.9875
recall:  0.9875


In [394]:
scaled_features

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star color,Spectral Class
97,-0.294108,-0.600672,-0.432522,-0.197138,Y,F
65,-0.773923,-0.600718,-0.434987,1.230758,R,M
212,0.288066,-0.596055,-0.423032,-0.445974,W,A
208,0.811904,-0.600718,-0.435234,0.699064,B,B
142,0.852024,-0.600718,-0.435237,0.807930,B,B
...,...,...,...,...,...,...
106,1.539486,1.547651,-0.306733,-1.000997,B,O
14,-0.843862,-0.600718,-0.434970,0.710922,R,M
92,-0.689128,-0.600718,-0.433634,0.171063,Y,K
179,1.524306,0.869938,1.879185,-1.235253,W,B


In [395]:
# Model Creation SVC
from sklearn.svm import SVC
svm_clf=SVC()
ovr_clf_svc=OneVsRestClassifier(svm_clf)
ovr_clf_svc.fit(clean_data,y_train)

OneVsRestClassifier(estimator=SVC())

In [396]:
# 10 fold cross validation
from sklearn.model_selection import cross_val_score
scores=cross_val_score(ovr_clf_svc,clean_data,y_train,scoring="accuracy",cv=10)
print("scores are:",scores)
print("mean:" ,scores.mean())

scores are: [1.     0.9375 1.     1.     1.     1.     1.     1.     0.875  1.    ]
mean: 0.98125


In [397]:
# confusion matrix 
# find precision recall FPR etc.

# preparing X_test set (Transform the test set, dont fit it)
cols=['Temperature (K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)',
       'Absolute magnitude(Mv)']
temp_xtest=X_test.copy()
test_f=temp_xtest[cols]
feat=scaler.transform(test_f.values)
temp_xtest[cols]=feat
X_test_prep=column_trans.transform(temp_xtest)


# metrics for svc
y_pred=ovr_clf_svc.predict(X_test_prep)
from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score
print("---------------------------------------")
print("average=macro")
print("f1 score: ",f1_score(y_test,y_pred,average='macro'))
print("precision: ",precision_score(y_test,y_pred,average="macro"))
print("recall: ",recall_score(y_test,y_pred,average='macro'))
print("-----------------------------------------")
print("average=weighted")
print("f1 score: ",f1_score(y_test,y_pred,average='weighted'))
print("precision: ",precision_score(y_test,y_pred,average="weighted"))
print("recall: ",recall_score(y_test,y_pred,average='weighted'))
print("-----------------------------------------")
print("average=micro")
print("f1 score: ",f1_score(y_test,y_pred,average='micro'))
print("precision: ",precision_score(y_test,y_pred,average="micro"))
print("recall: ",recall_score(y_test,y_pred,average='micro'))


---------------------------------------
average=macro
f1 score:  0.9877031181379007
precision:  0.9901960784313726
recall:  0.9861111111111112
-----------------------------------------
average=weighted
f1 score:  0.9874176548089592
precision:  0.9882352941176471
recall:  0.9875
-----------------------------------------
average=micro
f1 score:  0.9875
precision:  0.9875
recall:  0.9875


## MLP

In [437]:
X_train

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star color,Spectral Class
97,7720,7.92000,1.34000,2.440,Y,F
65,3295,0.00098,0.13200,17.130,R,M
212,13089,788.00000,5.99200,-0.120,W,A
208,17920,0.00111,0.01060,11.660,B,B
142,18290,0.00130,0.00934,12.780,B,B
...,...,...,...,...,...,...
106,24630,363000.00000,63.00000,-5.830,B,O
14,2650,0.00060,0.14000,11.782,R,M
92,4077,0.08500,0.79500,6.228,Y,K
179,24490,248490.00000,1134.50000,-8.240,W,B


In [438]:
clean_data

array([[ 0.        ,  0.        ,  0.        , ..., -0.60067155,
        -0.43252238, -0.19713751],
       [ 0.        ,  1.        ,  0.        , ..., -0.60071842,
        -0.43498677,  1.23075774],
       [ 0.        ,  0.        ,  1.        , ..., -0.59605474,
        -0.42303205, -0.44597426],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.60071792,
        -0.43363421,  0.17106312],
       [ 0.        ,  0.        ,  1.        , ...,  0.86993802,
         1.8791846 , -1.23525333],
       [ 1.        ,  0.        ,  0.        , ...,  0.15091491,
        -0.36181413, -0.99419273]])

In [47]:
Y_train_mlp=pd.DataFrame(y_train)
Y_train_mlp

Unnamed: 0,'class'
18593,0
98800,0
60094,0
42119,1
82618,1
...,...
119879,0
103694,0
860,0
15795,1


In [48]:
# One hot encoding for the outputs
column_trans_y=make_column_transformer(
(OneHotEncoder(sparse=False),["'class'"]),remainder='passthrough')

onehot_y=column_trans_y.fit_transform(Y_train_mlp)

In [49]:
onehot_y.shape

(84401, 2)

In [50]:
clean_data.shape

(84401, 115)

In [51]:
from sklearn.neural_network import MLPClassifier
clf_mlp=MLPClassifier(solver='lbfgs',activation='tanh',alpha=0.001,hidden_layer_sizes=(12,2),random_state=42)
clf_mlp.fit(clean_data,onehot_y)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(activation='tanh', alpha=0.001, hidden_layer_sizes=(12, 2),
              random_state=42, solver='lbfgs')

In [52]:
# 10 fold cross validation
from sklearn.model_selection import cross_val_score
scores=cross_val_score(clf_mlp,clean_data,onehot_y,scoring="accuracy",cv=10)
print("scores are:",scores)
print("mean:" ,scores.mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

scores are: [0.98874541 0.98755924 0.98779621 0.98886256 0.98945498 0.98898104
 0.99028436 0.98767773 0.98767773 0.98921801]
mean: 0.9886257257652925


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [445]:
Y_test_mlp=pd.DataFrame(y_test)
column_trans_ytest=make_column_transformer(
(OneHotEncoder(sparse=False),['Star type']),remainder='passthrough')

onehot_y_test=column_trans_ytest.fit_transform(Y_test_mlp)
onehot_y_test.shape

(80, 6)

In [446]:
X_test.shape

(80, 6)

In [447]:
# confusion matrix 
# find precision recall FPR etc.

# preparing X_test set (Transform the test set, dont fit it)
cols=['Temperature (K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)',
       'Absolute magnitude(Mv)']
temp_xtest=X_test.copy()
test_f=temp_xtest[cols]
feat=scaler.transform(test_f.values)
temp_xtest[cols]=feat
temp_xtest

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star color,Spectral Class
24,0.657930,-0.600718,-0.435227,0.721420,W,B
6,-0.845271,-0.600718,-0.434997,1.239506,R,M
93,-0.591214,-0.600716,-0.432951,0.030315,Y,K
109,2.492717,1.482549,-0.298572,-0.997109,B,O
104,0.071960,0.062139,-0.410775,-1.008773,B,O
...,...,...,...,...,...,...
82,-0.162905,-0.600718,-0.435237,0.905132,W,A
5,-0.823259,-0.600718,-0.435032,1.216177,R,M
56,-0.734345,1.547651,2.977753,-1.592956,R,M
164,-0.163230,0.813773,-0.363854,-1.147772,B,O


In [448]:
X_test_prep=column_trans.transform(temp_xtest)
X_test_prep

array([[ 0.        ,  0.        ,  1.        , ..., -0.60071835,
        -0.43522749,  0.72142002],
       [ 0.        ,  1.        ,  0.        , ..., -0.60071842,
        -0.43499697,  1.23950591],
       [ 0.        ,  0.        ,  0.        , ..., -0.60071631,
        -0.43295079,  0.03031483],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  1.54765088,
         2.97775339, -1.59295616],
       [ 1.        ,  0.        ,  0.        , ...,  0.81377266,
        -0.36385418, -1.14777166],
       [ 1.        ,  0.        ,  0.        , ...,  0.6001194 ,
        -0.42126129, -0.8775505 ]])

In [449]:
y_pred=clf_mlp.predict(X_test_prep)
y_pred.shape

(80, 6)

In [450]:
onehot_y_test.shape

(80, 6)

In [451]:

from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score
print("---------------------------------------")
print("average=macro")
print("f1 score: ",f1_score(onehot_y_test,y_pred,average='macro'))
print("precision: ",precision_score(onehot_y_test,y_pred,average="macro"))
print("recall: ",recall_score(onehot_y_test,y_pred,average='macro'))
print("f1 score: ",f1_score(onehot_y_test,y_pred,average='weighted'))
print("precision: ",precision_score(onehot_y_test,y_pred,average="weighted"))
print("recall: ",recall_score(onehot_y_test,y_pred,average='weighted'))

---------------------------------------
average=macro
f1 score:  1.0
precision:  1.0
recall:  1.0
f1 score:  1.0
precision:  1.0
recall:  1.0


In [452]:
# metrics for svc
y_pred=clf_mlp.predict(X_test_prep)
from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score
print("---------------------------------------")
print("average=macro")
print("f1 score: ",f1_score(y_test,y_pred,average='macro'))
print("precision: ",precision_score(y_test,y_pred,average="macro"))
print("recall: ",recall_score(y_test,y_pred,average='macro'))
print("-----------------------------------------")
print("average=weighted")
print("f1 score: ",f1_score(y_test,y_pred,average='weighted'))
print("precision: ",precision_score(y_test,y_pred,average="weighted"))
print("recall: ",recall_score(y_test,y_pred,average='weighted'))
print("-----------------------------------------")
print("average=micro")
print("f1 score: ",f1_score(y_test,y_pred,average='micro'))
print("precision: ",precision_score(y_test,y_pred,average="micro"))
print("recall: ",recall_score(y_test,y_pred,average='micro'))


---------------------------------------
average=macro


ValueError: Classification metrics can't handle a mix of multiclass and multilabel-indicator targets