# Data Preprocessing

### Importing modules

In [1]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can also hide pandas warnings raised by later cells (e.g. deprecations or
# SettingWithCopyWarning) -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

### Importing Dataset

In [2]:
# Load the raw records from 'EGG.csv' (resolved relative to the working directory).
data = pd.read_csv(filepath_or_buffer='EGG.csv')

### Checking Initial Rows

In [3]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,id,age,bp,hemo,insulin,fasting,postprandial,final,cpm,classification,condition
0,0,48,80.0,15.4,160,-0.101,0.122,0.123,1,bradygastria,1
1,1,7,50.0,11.3,158,-0.008,0.302,0.311,3,normal,0
2,2,62,80.0,9.6,170,0.212,0.543,0.522,5,trachygastria,1
3,3,48,70.0,11.2,188,-0.235,0.788,0.712,7,trachygastria,1
4,4,51,80.0,11.6,180,0.286,0.348,0.35,3,normal,0


### Deleting Unwanted Columns

In [4]:
# Remove the row identifier and the 'condition' column in one step.
# drop(..., errors='ignore') returns a new frame and does not fail when the
# columns are already gone, so this cell can be re-run safely (the original
# `del data[...]` raised KeyError on a second execution).
data = data.drop(columns=['id', 'condition'], errors='ignore')

### Checking the shape of our data frame

In [5]:
# Report the frame dimensions as (rows, columns).
data.shape

(1201, 9)

### Checking our data after removing the unwanted columns

In [6]:
# Preview the first five rows again to confirm which columns remain.
data.head(n=5)

Unnamed: 0,age,bp,hemo,insulin,fasting,postprandial,final,cpm,classification
0,48,80.0,15.4,160,-0.101,0.122,0.123,1,bradygastria
1,7,50.0,11.3,158,-0.008,0.302,0.311,3,normal
2,62,80.0,9.6,170,0.212,0.543,0.522,5,trachygastria
3,48,70.0,11.2,188,-0.235,0.788,0.712,7,trachygastria
4,51,80.0,11.6,180,0.286,0.348,0.35,3,normal


### Checking for Null Values in each column

In [7]:
# Count missing values per column (isna is the canonical name for isnull).
data.isna().sum()

age                 0
bp                 12
hemo              129
insulin             0
fasting             0
postprandial        0
final               0
cpm                 0
classification      0
dtype: int64

### Removing rows that contain null values

In [8]:
# Keep only the rows without any missing value. The explicit .copy() makes
# `df` an independent frame, so later column assignments on `df` neither touch
# `data` nor trigger pandas' SettingWithCopyWarning. The original row labels
# are kept (the index is not reset).
df = data.dropna().copy()

### Describing each column in our dataset

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,bp,hemo,insulin,fasting,postprandial,final,cpm
count,1060.0,1060.0,1060.0,1060.0,1060.0,1060.0,1060.0,1060.0
mean,56.037736,81.075472,10.846226,176.398113,0.275673,0.449804,0.415087,3.903774
std,14.313847,16.421443,2.61303,14.388905,0.254632,0.294437,0.3323,3.005532
min,7.0,50.0,4.8,140.0,-0.235,-0.12,-0.219,-1.0
25%,48.0,70.0,9.7,170.0,0.123,0.276,0.244,2.0
50%,59.0,80.0,10.5,180.0,0.305,0.433,0.352,3.0
75%,65.0,90.0,12.1,188.0,0.471,0.643,0.583,5.0
max,82.0,180.0,25.0,198.0,0.834,1.23,1.44,14.0


### Checking information regarding the data

In [10]:
# Print column dtypes, non-null counts and memory usage of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1060 entries, 0 to 1200
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             1060 non-null   int64  
 1   bp              1060 non-null   float64
 2   hemo            1060 non-null   float64
 3   insulin         1060 non-null   int64  
 4   fasting         1060 non-null   float64
 5   postprandial    1060 non-null   float64
 6   final           1060 non-null   float64
 7   cpm             1060 non-null   int64  
 8   classification  1060 non-null   object 
dtypes: float64(5), int64(3), object(1)
memory usage: 82.8+ KB


### Checking correlation

In [11]:
# Pairwise Pearson correlation of the numeric columns only.
# 'classification' is still a string column at this point; pandas >= 2.0 no
# longer drops non-numeric columns silently in corr() and raises instead, so
# the numeric columns are selected explicitly (works on every pandas version).
df.select_dtypes(include='number').corr()

Unnamed: 0,age,bp,hemo,insulin,fasting,postprandial,final,cpm
age,1.0,0.052856,0.009143,0.135582,-0.061901,0.029751,0.018859,0.020893
bp,0.052856,1.0,-0.046845,-0.053127,0.019704,-0.038954,0.001976,0.007839
hemo,0.009143,-0.046845,1.0,-0.089467,0.052664,0.127812,0.060206,0.04984
insulin,0.135582,-0.053127,-0.089467,1.0,0.1616,0.082479,0.063239,0.069143
fasting,-0.061901,0.019704,0.052664,0.1616,1.0,0.473284,0.275888,0.272735
postprandial,0.029751,-0.038954,0.127812,0.082479,0.473284,1.0,0.872661,0.849806
final,0.018859,0.001976,0.060206,0.063239,0.275888,0.872661,1.0,0.977376
cpm,0.020893,0.007839,0.04984,0.069143,0.272735,0.849806,0.977376,1.0


### Checking unique values in age column

In [12]:
# Distinct ages present in the cleaned data, in ascending order.
df['age'].drop_duplicates().sort_values().to_numpy()

array([ 7, 11, 15, 24, 26, 32, 35, 38, 40, 42, 44, 45, 46, 47, 48, 50, 51,
       52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 82], dtype=int64)

### Checking Unique values in BP Column

In [13]:
# Distinct blood-pressure readings present in the cleaned data, in ascending order.
df['bp'].drop_duplicates().sort_values().to_numpy()

array([ 50.,  60.,  70.,  80.,  90., 100., 110., 140., 180.])

### Checking unique values in insulin

In [14]:
# Distinct insulin values present in the cleaned data, in ascending order.
df['insulin'].drop_duplicates().sort_values().to_numpy()

array([140, 158, 160, 165, 170, 174, 176, 178, 180, 183, 186, 188, 190,
       192, 193, 198], dtype=int64)

### Checking unique values in cpm

In [15]:
# Distinct cpm values present in the cleaned data, in ascending order.
df['cpm'].drop_duplicates().sort_values().to_numpy()

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 12, 13, 14],
      dtype=int64)

In [16]:
# Raise the column display limit so the wide crosstab is not truncated.
pd.set_option('display.max_columns', 250)
# Frequency table: rows are insulin values, columns are fasting values.
pd.crosstab(index=df['insulin'], columns=df['fasting'])

fasting,-0.235,-0.219,-0.125,-0.101,-0.008,-0.006,0.120,0.123,0.172,0.176,0.204,0.212,0.232,0.245,0.257,0.280,0.286,0.295,0.305,0.331,0.332,0.333,0.346,0.348,0.367,0.379,0.388,0.390,0.418,0.471,0.485,0.507,0.529,0.533,0.562,0.571,0.636,0.834
insulin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
140,0,0,12,12,12,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0
158,0,0,0,24,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,12,12,0,0,0,0,12,0,0,0,24,0,0
160,12,0,0,7,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0
165,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,24,0,0
170,0,0,0,0,21,12,0,0,0,0,0,12,0,0,0,0,0,0,0,12,12,0,0,0,12,12,0,0,0,0,0,0,0,0,12,0,0,0
174,0,0,0,0,0,0,0,0,0,0,12,0,0,12,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0
176,0,0,0,0,0,0,12,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,12,0,12,0,0,0,0,24,12,0,0,0,0,0,0,0
178,0,0,0,0,0,0,0,0,0,12,0,0,0,0,12,0,12,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
180,24,0,12,0,12,0,0,0,0,0,0,12,0,0,0,0,12,12,0,0,0,0,0,0,0,0,0,0,0,12,0,0,24,0,0,0,0,0
183,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,12,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
# Re-apply the column display limit so this cell also works when run on its own.
pd.set_option('display.max_columns', 250)
# Frequency table: rows are class labels, columns are fasting values.
pd.crosstab(index=df['classification'], columns=df['fasting'])

fasting,-0.235,-0.219,-0.125,-0.101,-0.008,-0.006,0.120,0.123,0.172,0.176,0.204,0.212,0.232,0.245,0.257,0.280,0.286,0.295,0.305,0.331,0.332,0.333,0.346,0.348,0.367,0.379,0.388,0.390,0.418,0.471,0.485,0.507,0.529,0.533,0.562,0.571,0.636,0.834
classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
bradygastria,0,24,24,55,0,24,24,24,24,12,0,0,12,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,0,0,48,0,0
normal,0,0,0,0,57,0,0,0,0,12,24,12,0,24,12,0,24,24,24,24,12,24,12,36,24,24,24,12,24,0,0,12,0,0,24,0,0,0
trachygastria,48,0,0,0,0,0,0,0,0,0,0,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,0,12,48,24,0,0,48,24


### Checking Classification

In [19]:
# Number of rows per class label, most frequent class first.
df['classification'].value_counts(sort=True, ascending=False)

normal           465
bradygastria     319
trachygastria    276
Name: classification, dtype: int64

### Label Encoding the Classification column

In [23]:
from sklearn.preprocessing import LabelEncoder

# Convert the string class labels to integer codes. LabelEncoder numbers the
# classes in sorted order, so here bradygastria -> 0, normal -> 1,
# trachygastria -> 2. `le` is kept so the codes can be mapped back to labels
# via le.classes_ / le.inverse_transform.
le = LabelEncoder()
# Build a new frame with assign() rather than writing into `df` in place:
# `df` was derived from data.dropna(), and column assignment on such a derived
# frame can raise pandas' SettingWithCopyWarning.
df = df.assign(classification=le.fit_transform(df['classification']).astype(int))

In [24]:
# Display the frame to confirm 'classification' now holds integer codes
# (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,age,bp,hemo,insulin,fasting,postprandial,final,cpm,classification
0,48,80.0,15.4,160,-0.101,0.122,0.123,1,0
1,7,50.0,11.3,158,-0.008,0.302,0.311,3,1
2,62,80.0,9.6,170,0.212,0.543,0.522,5,2
3,48,70.0,11.2,188,-0.235,0.788,0.712,7,2
4,51,80.0,11.6,180,0.286,0.348,0.350,3,1
...,...,...,...,...,...,...,...,...,...
1196,65,70.0,11.6,140,-0.101,0.122,0.123,1,0
1197,62,90.0,25.0,180,-0.008,0.302,0.311,3,1
1198,60,80.0,11.2,140,0.212,0.543,0.522,5,2
1199,65,60.0,10.0,180,-0.235,0.788,0.988,9,2
