In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder

In [44]:
raw_data = pd.read_csv('crx.csv',
                       header = None,
                       na_values='?') #this will convert '?' to NaN
raw_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-


In [45]:
#continous data
cont_att = [1, 2, 7, 10, 13, 14]

#discrete data
dis_att = []
for i in range(1, 16):
    if i not in cont_att:
        dis_att.append(i)

cont_att, dis_att

([1, 2, 7, 10, 13, 14], [3, 4, 5, 6, 8, 9, 11, 12, 15])

In [5]:
raw_data.isna() #checking for missing values

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
686,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
687,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
688,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [4]:
raw_data.isna().sum() #counting missing values

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [46]:
for index in range(15):
    if index in cont_att:
        raw_data[index].fillna(raw_data[index].mean(), inplace = True)
    else:
        raw_data[index].fillna(raw_data[index].mode()[0], inplace = True)
        #filling missing values with mean for continous and mode for discrete

print(raw_data.isna().sum())
        

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64


In [47]:
#normalizing the data
def normalize(x):
    return (x - x.min()) / (x.max() - x.min())

# Apply normalization only to numeric columns
for index in cont_att:
    raw_data[index] = normalize(raw_data[index])

#normalizing the data
raw_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,0.256842,0.000000,u,g,w,v,0.043860,t,t,0.014925,f,g,0.1010,0.00000,+
1,a,0.675489,0.159286,u,g,q,h,0.106667,t,t,0.089552,f,g,0.0215,0.00560,+
2,a,0.161654,0.017857,u,g,q,h,0.052632,t,f,0.000000,f,g,0.1400,0.00824,+
3,b,0.211729,0.055000,u,g,w,v,0.131579,t,t,0.074627,t,g,0.0500,0.00003,+
4,b,0.096541,0.200893,u,g,w,v,0.060000,t,f,0.000000,f,s,0.0600,0.00000,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,0.110226,0.360179,y,p,e,h,0.043860,f,f,0.000000,f,g,0.1300,0.00000,-
686,a,0.134135,0.026786,u,g,c,v,0.070175,f,t,0.029851,t,g,0.1000,0.00394,-
687,a,0.172932,0.482143,y,p,ff,ff,0.070175,f,t,0.014925,t,g,0.1000,0.00001,-
688,b,0.062707,0.007321,u,g,aa,v,0.001404,f,f,0.000000,f,g,0.1400,0.00750,-


In [7]:
#second way for normalizing the data 
from sklearn.preprocessing import MinMaxScaler
min_max = MinMaxScaler()
raw_data[cont_att] = min_max.fit_transform(raw_data[cont_att])
raw_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,0.256842,0.000000,u,g,w,v,0.043860,t,t,0.014925,f,g,0.1010,0.00000,+
1,a,0.675489,0.159286,u,g,q,h,0.106667,t,t,0.089552,f,g,0.0215,0.00560,+
2,a,0.161654,0.017857,u,g,q,h,0.052632,t,f,0.000000,f,g,0.1400,0.00824,+
3,b,0.211729,0.055000,u,g,w,v,0.131579,t,t,0.074627,t,g,0.0500,0.00003,+
4,b,0.096541,0.200893,u,g,w,v,0.060000,t,f,0.000000,f,s,0.0600,0.00000,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,0.110226,0.360179,y,p,e,h,0.043860,f,f,0.000000,f,g,0.1300,0.00000,-
686,a,0.134135,0.026786,u,g,c,v,0.070175,f,t,0.029851,t,g,0.1000,0.00394,-
687,a,0.172932,0.482143,y,p,ff,ff,0.070175,f,t,0.014925,t,g,0.1000,0.00001,-
688,b,0.062707,0.007321,u,g,aa,v,0.001404,f,f,0.000000,f,g,0.1400,0.00750,-


In [8]:
#another way is just make normalized data another dataframe and then mix it with the first datframe
x  = raw_data[cont_att].values
x_scaled = min_max.fit_transform(x)

normalized_data = pd.DataFrame(x_scaled, columns=cont_att, index=raw_data.index)
raw_data[cont_att] = normalized_data
raw_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,0.256842,0.000000,u,g,w,v,0.043860,t,t,0.014925,f,g,0.1010,0.00000,+
1,a,0.675489,0.159286,u,g,q,h,0.106667,t,t,0.089552,f,g,0.0215,0.00560,+
2,a,0.161654,0.017857,u,g,q,h,0.052632,t,f,0.000000,f,g,0.1400,0.00824,+
3,b,0.211729,0.055000,u,g,w,v,0.131579,t,t,0.074627,t,g,0.0500,0.00003,+
4,b,0.096541,0.200893,u,g,w,v,0.060000,t,f,0.000000,f,s,0.0600,0.00000,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,0.110226,0.360179,y,p,e,h,0.043860,f,f,0.000000,f,g,0.1300,0.00000,-
686,a,0.134135,0.026786,u,g,c,v,0.070175,f,t,0.029851,t,g,0.1000,0.00394,-
687,a,0.172932,0.482143,y,p,ff,ff,0.070175,f,t,0.014925,t,g,0.1000,0.00001,-
688,b,0.062707,0.007321,u,g,aa,v,0.001404,f,f,0.000000,f,g,0.1400,0.00750,-


In [38]:
#turning catagorical data to numerical
#first way with ordinal encoder
encoder = OrdinalEncoder(dtype = np.int32)
raw_data[15] = encoder.fit_transform(raw_data[[15]]) #turning the last colum to numerical 
raw_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,0.256842,0.000000,u,g,w,v,0.043860,t,t,0.014925,f,g,0.1010,0.00000,0
1,a,0.675489,0.159286,u,g,q,h,0.106667,t,t,0.089552,f,g,0.0215,0.00560,0
2,a,0.161654,0.017857,u,g,q,h,0.052632,t,f,0.000000,f,g,0.1400,0.00824,0
3,b,0.211729,0.055000,u,g,w,v,0.131579,t,t,0.074627,t,g,0.0500,0.00003,0
4,b,0.096541,0.200893,u,g,w,v,0.060000,t,f,0.000000,f,s,0.0600,0.00000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,0.110226,0.360179,y,p,e,h,0.043860,f,f,0.000000,f,g,0.1300,0.00000,1
686,a,0.134135,0.026786,u,g,c,v,0.070175,f,t,0.029851,t,g,0.1000,0.00394,1
687,a,0.172932,0.482143,y,p,ff,ff,0.070175,f,t,0.014925,t,g,0.1000,0.00001,1
688,b,0.062707,0.007321,u,g,aa,v,0.001404,f,f,0.000000,f,g,0.1400,0.00750,1


In [49]:
one_hot  = OneHotEncoder(sparse = False)
# Convert discrete attributes to strings if they're integers
dis_att_str = [str(col) for col in dis_att]
# One hot encoding
one_hot_data = one_hot.fit_transform(raw_data[dis_att])
one_hot_data = pd.DataFrame(one_hot_data, 
                          columns=one_hot.get_feature_names_out(dis_att_str),  # Use string version
                          index=raw_data.index)
# Concatenate with original data
raw_data = pd.concat([raw_data, one_hot_data], axis=1)
# Drop original discrete columns
raw_data.drop(dis_att, axis=1, inplace=True)
# Resulting dataframe
raw_data



Unnamed: 0,0,1,2,7,10,13,14,3_l,3_u,3_y,...,8_t,9_f,9_t,11_f,11_t,12_g,12_p,12_s,15_+,15_-
0,b,0.256842,0.000000,0.043860,0.014925,0.1010,0.00000,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,a,0.675489,0.159286,0.106667,0.089552,0.0215,0.00560,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,a,0.161654,0.017857,0.052632,0.000000,0.1400,0.00824,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,b,0.211729,0.055000,0.131579,0.074627,0.0500,0.00003,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,b,0.096541,0.200893,0.060000,0.000000,0.0600,0.00000,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,0.110226,0.360179,0.043860,0.000000,0.1300,0.00000,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
686,a,0.134135,0.026786,0.070175,0.029851,0.1000,0.00394,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
687,a,0.172932,0.482143,0.070175,0.014925,0.1000,0.00001,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
688,b,0.062707,0.007321,0.001404,0.000000,0.1400,0.00750,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [39]:
#another way to do one hot encoding is with pandas
one_got = pd.get_dummies(raw_data[dis_att]) #drop_first is used to drop the first column of the one hot encoded data
raw_data = raw_data.drop(dis_att, axis=1) #dropping the original discrete attributes from raw_data
df = raw_data.join(one_got) #joining the one hot encoded data with the original data
df

Unnamed: 0,0,1,2,7,10,13,14,15,3_l,3_u,...,6_z,8_f,8_t,9_f,9_t,11_f,11_t,12_g,12_p,12_s
0,b,0.256842,0.000000,0.043860,0.014925,0.1010,0.00000,0,False,True,...,False,False,True,False,True,True,False,True,False,False
1,a,0.675489,0.159286,0.106667,0.089552,0.0215,0.00560,0,False,True,...,False,False,True,False,True,True,False,True,False,False
2,a,0.161654,0.017857,0.052632,0.000000,0.1400,0.00824,0,False,True,...,False,False,True,True,False,True,False,True,False,False
3,b,0.211729,0.055000,0.131579,0.074627,0.0500,0.00003,0,False,True,...,False,False,True,False,True,False,True,True,False,False
4,b,0.096541,0.200893,0.060000,0.000000,0.0600,0.00000,0,False,True,...,False,False,True,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,0.110226,0.360179,0.043860,0.000000,0.1300,0.00000,1,False,False,...,False,True,False,True,False,True,False,True,False,False
686,a,0.134135,0.026786,0.070175,0.029851,0.1000,0.00394,1,False,True,...,False,True,False,False,True,False,True,True,False,False
687,a,0.172932,0.482143,0.070175,0.014925,0.1000,0.00001,1,False,False,...,False,True,False,False,True,False,True,True,False,False
688,b,0.062707,0.007321,0.001404,0.000000,0.1400,0.00750,1,False,True,...,False,True,False,True,False,True,False,True,False,False


In [41]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=15)
y = df[15]

X_train, y_train, X_test, y_test = train_test_split(X, y , test_size=0.2)