## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

## Loading Data

In [2]:
# Read the raw glass dataset; the path is relative to the notebook's working directory.
DATASET_CSV = 'glass source classification dataset.csv'
main_data = pd.read_csv(filepath_or_buffer=DATASET_CSV)
main_data

Unnamed: 0.1,Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,0,1.52101,13.64,4.49,1.10,71.78,0.06,,Does not exist,Does not exist,building_window glass
1,1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,Does not exist,Does not exist,building_window glass
2,2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,Does not exist,Does not exist,building_window glass
3,3,1.51766,13.21,3.69,1.29,72.61,0.57,,Does not exist,Does not exist,building_window glass
4,4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,Does not exist,Does not exist,building_window glass
...,...,...,...,...,...,...,...,...,...,...,...
209,209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,exists,Does not exist,headlamp glass
210,210,1.51685,14.92,0.00,1.99,73.06,0.00,,exists,Does not exist,headlamp glass
211,211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,exists,Does not exist,headlamp glass
212,212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,exists,Does not exist,headlamp glass


## Checking For And Handling Null Values

In [3]:
# Work on an independent copy: a plain assignment would only bind a second
# name to the same DataFrame, so the in-place imputation that follows would
# silently modify `main_data` and the raw data could not be recovered
# without re-reading the CSV.
updated_data = main_data.copy()
# Count missing values per column to see which columns need imputation.
updated_data.isnull().sum()

Unnamed: 0    0
RI            0
Na            0
Mg            0
Al            0
Si            0
K             0
Ca            6
Ba            0
Fe            0
Type          0
dtype: int64

In [4]:
# Fill the missing 'Ca' entries with the column's most frequent value.
# NOTE(review): 'Ca' is a continuous float column, so mean/median may be a
# better fit than the mode — confirm this strategy is intentional.
impute = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
ca_column = updated_data[['Ca']]  # 2-D (single-column frame), as the imputer expects
updated_data['Ca'] = impute.fit(ca_column).transform(ca_column)

In [5]:
# Re-count missing values per column to verify the imputation left no nulls.
updated_data.isna().sum()

Unnamed: 0    0
RI            0
Na            0
Mg            0
Al            0
Si            0
K             0
Ca            0
Ba            0
Fe            0
Type          0
dtype: int64

## Encoding Categorical Features

In [6]:
# Copy rather than alias: the encoding cells overwrite 'Ba' and 'Fe' in place,
# and a plain assignment would propagate those edits back into `updated_data`
# (both names would refer to the same DataFrame object).
encoded_data = updated_data.copy()
# Inspect dtypes to identify the object (categorical) columns that need encoding.
encoded_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  214 non-null    int64  
 1   RI          214 non-null    float64
 2   Na          214 non-null    float64
 3   Mg          214 non-null    float64
 4   Al          214 non-null    float64
 5   Si          214 non-null    float64
 6   K           214 non-null    float64
 7   Ca          214 non-null    float64
 8   Ba          214 non-null    object 
 9   Fe          214 non-null    object 
 10  Type        214 non-null    object 
dtypes: float64(7), int64(1), object(3)
memory usage: 18.5+ KB


In [7]:
# List the distinct raw labels in 'Ba' before mapping them to numbers.
pd.unique(encoded_data['Ba'])

array(['Does not exist', 'exists'], dtype=object)

In [8]:
# Binary-encode 'Ba': 'Does not exist' -> 0, 'exists' -> 1.
# Run once per freshly built `encoded_data`: values that are not keys of the
# mapping (including already-encoded 0/1) are turned into NaN by `.map`.
ba_codes = {'Does not exist': 0, 'exists': 1}
encoded_data['Ba'] = encoded_data['Ba'].map(ba_codes)

In [9]:
# List the distinct raw labels in 'Fe' before mapping them to numbers.
pd.unique(encoded_data['Fe'])

array(['Does not exist', 'exists'], dtype=object)

In [10]:
# Binary-encode 'Fe': 'Does not exist' -> 0, 'exists' -> 1.
# Run once per freshly built `encoded_data`: values that are not keys of the
# mapping (including already-encoded 0/1) are turned into NaN by `.map`.
fe_codes = {'Does not exist': 0, 'exists': 1}
encoded_data['Fe'] = encoded_data['Fe'].map(fe_codes)

In [11]:
# List the distinct class labels in the target column 'Type'.
pd.unique(encoded_data['Type'])

array(['building_window glass', 'vehicle_window glass', 'container glass',
       'tableware glass', 'headlamp glass'], dtype=object)

In [12]:
# One-hot encode the target column: one indicator column per glass type.
# The dtype is pinned explicitly because the default output of `get_dummies`
# changed from uint8 to bool in pandas 2.0; fixing it keeps the labels
# numeric 0/1 regardless of the installed pandas version.
enc = pd.get_dummies(encoded_data['Type'], dtype=np.uint8)
enc

Unnamed: 0,building_window glass,container glass,headlamp glass,tableware glass,vehicle_window glass
0,1,0,0,0,0
1,1,0,0,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0
...,...,...,...,...,...
209,0,0,1,0,0
210,0,0,1,0,0
211,0,0,1,0,0
212,0,0,1,0,0


## Scaling Values

In [13]:
# Keep only the numeric feature columns: 'Unnamed: 0' and the target 'Type'
# are excluded from scaling.
non_feature_cols = ['Unnamed: 0', 'Type']
temp_data = encoded_data.drop(columns=non_feature_cols)

# Learn each remaining column's min/max so it can be rescaled to [0, 1].
scaler = MinMaxScaler()
scaler.fit(temp_data)

MinMaxScaler()

In [14]:
# Apply the fitted min-max scaling; the result is a NumPy array, so the
# column labels of `temp_data` are not carried over.
data_scaled = scaler.transform(X=temp_data)
data_scaled

array([[0.43283582, 0.43759398, 1.        , ..., 0.24163569, 0.        ,
        0.        ],
       [0.28358209, 0.47518797, 0.80178174, ..., 0.22304833, 0.        ,
        0.        ],
       [0.22080773, 0.42105263, 0.79064588, ..., 0.21840149, 0.        ,
        0.        ],
       ...,
       [0.41703248, 0.54586466, 0.        , ..., 0.27973978, 1.        ,
        0.        ],
       [0.23529412, 0.54887218, 0.        , ..., 0.28345725, 1.        ,
        0.        ],
       [0.26163301, 0.52631579, 0.        , ..., 0.2964684 , 1.        ,
        0.        ]])

In [15]:
# Wrap the scaled values back into a DataFrame. Column labels and the row
# index are taken from `temp_data` (the frame the scaler was applied to)
# instead of a hand-typed list, so the labels cannot drift out of sync with
# the actual column order, and rows stay aligned with `enc` for later joins.
data_scaled = pd.DataFrame(
    data=data_scaled,
    columns=temp_data.columns,
    index=temp_data.index,
)
data_scaled

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,0.432836,0.437594,1.000000,0.252336,0.351786,0.009662,0.241636,0.0,0.0
1,0.283582,0.475188,0.801782,0.333333,0.521429,0.077295,0.223048,0.0,0.0
2,0.220808,0.421053,0.790646,0.389408,0.567857,0.062802,0.218401,0.0,0.0
3,0.285777,0.372932,0.821826,0.311526,0.500000,0.091787,0.241636,0.0,0.0
4,0.275241,0.381955,0.806236,0.295950,0.583929,0.088567,0.245353,0.0,0.0
...,...,...,...,...,...,...,...,...,...
209,0.223003,0.512782,0.000000,0.806854,0.500000,0.012882,0.348513,1.0,0.0
210,0.250219,0.630075,0.000000,0.529595,0.580357,0.000000,0.241636,1.0,0.0
211,0.417032,0.545865,0.000000,0.538941,0.644643,0.000000,0.279740,1.0,0.0
212,0.235294,0.548872,0.000000,0.514019,0.678571,0.000000,0.283457,1.0,0.0


## Splitting Features And Labels

In [16]:
# Place the scaled features and the one-hot label columns side by side
# (column-wise concatenation, rows aligned on the index).
final_data = pd.concat(objs=[data_scaled, enc], axis='columns')
final_data

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,building_window glass,container glass,headlamp glass,tableware glass,vehicle_window glass
0,0.432836,0.437594,1.000000,0.252336,0.351786,0.009662,0.241636,0.0,0.0,1,0,0,0,0
1,0.283582,0.475188,0.801782,0.333333,0.521429,0.077295,0.223048,0.0,0.0,1,0,0,0,0
2,0.220808,0.421053,0.790646,0.389408,0.567857,0.062802,0.218401,0.0,0.0,1,0,0,0,0
3,0.285777,0.372932,0.821826,0.311526,0.500000,0.091787,0.241636,0.0,0.0,1,0,0,0,0
4,0.275241,0.381955,0.806236,0.295950,0.583929,0.088567,0.245353,0.0,0.0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,0.223003,0.512782,0.000000,0.806854,0.500000,0.012882,0.348513,1.0,0.0,0,0,1,0,0
210,0.250219,0.630075,0.000000,0.529595,0.580357,0.000000,0.241636,1.0,0.0,0,0,1,0,0
211,0.417032,0.545865,0.000000,0.538941,0.644643,0.000000,0.279740,1.0,0.0,0,0,1,0,0
212,0.235294,0.548872,0.000000,0.514019,0.678571,0.000000,0.283457,1.0,0.0,0,0,1,0,0


In [17]:
# Split `final_data` back into features and labels by column *name* rather
# than by hard-coded positions: the feature columns are exactly those of
# `data_scaled` and the label columns exactly those of `enc`, so the split
# stays correct if the number or order of columns changes upstream.
feature_columns = data_scaled.columns
label_columns = enc.columns
features_data = final_data.loc[:, feature_columns]
label_data = final_data.loc[:, label_columns]

In [18]:
# Display the feature matrix (the scaled columns selected from `final_data`).
features_data

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,0.432836,0.437594,1.000000,0.252336,0.351786,0.009662,0.241636,0.0,0.0
1,0.283582,0.475188,0.801782,0.333333,0.521429,0.077295,0.223048,0.0,0.0
2,0.220808,0.421053,0.790646,0.389408,0.567857,0.062802,0.218401,0.0,0.0
3,0.285777,0.372932,0.821826,0.311526,0.500000,0.091787,0.241636,0.0,0.0
4,0.275241,0.381955,0.806236,0.295950,0.583929,0.088567,0.245353,0.0,0.0
...,...,...,...,...,...,...,...,...,...
209,0.223003,0.512782,0.000000,0.806854,0.500000,0.012882,0.348513,1.0,0.0
210,0.250219,0.630075,0.000000,0.529595,0.580357,0.000000,0.241636,1.0,0.0
211,0.417032,0.545865,0.000000,0.538941,0.644643,0.000000,0.279740,1.0,0.0
212,0.235294,0.548872,0.000000,0.514019,0.678571,0.000000,0.283457,1.0,0.0


In [19]:
# Display the label matrix (the one-hot 'Type' columns selected from `final_data`).
label_data

Unnamed: 0,building_window glass,container glass,headlamp glass,tableware glass,vehicle_window glass
0,1,0,0,0,0
1,1,0,0,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0
...,...,...,...,...,...
209,0,0,1,0,0
210,0,0,1,0,0
211,0,0,1,0,0
212,0,0,1,0,0
