# **Task 1: Import libraries**

In [6]:
import pandas as pd
import numpy as np

# **Task 2: Normalization**

### *1. Import database*

In [7]:
# Load the cleaned dataset produced by the Data Exploration notebook.
data = pd.read_csv('clean_data.csv')

# Work on an explicit, independent copy: later cells re-assign columns on
# `database`, and a deep copy guarantees the raw frame `data` stays untouched.
columnsNames = data.columns
database = data.copy()

# Preview the first rows of the working copy.
database.head()

Unnamed: 0,diagnosis,ID,imaginary_min,imaginary_avg,real_min,real_avg,gender,age,smoking,age_bin
0,COPD,301-4,-320.61,-300.563531,-495.26,-464.171991,1.0,77,2.0,70-79
1,COPD,302-3,-325.39,-314.75036,-473.73,-469.26314,0.0,72,2.0,70-79
2,COPD,303-3,-323.0,-317.436056,-476.12,-471.897667,1.0,73,3.0,70-79
3,COPD,304-4,-327.78,-317.39967,-473.73,-468.856388,1.0,76,2.0,70-79
4,COPD,305-4,-325.39,-316.155785,-478.52,-472.869783,0.0,65,2.0,60-69


### *2. Check the type of the columns*

In [8]:
# Per-column summary: does the column contain any missing value, and what is its dtype?
null_ = database.isna().any()
dtypes = database.dtypes

# Put both Series side by side; the mapping keys become the column labels.
info = pd.concat({'Null': null_, 'type': dtypes}, axis=1)
print(info)

                Null     type
diagnosis      False   object
ID             False   object
imaginary_min  False  float64
imaginary_avg  False  float64
real_min       False  float64
real_avg       False  float64
gender         False  float64
age            False    int64
smoking        False  float64
age_bin        False   object


### *3. Convert all non-numeric columns to numeric*

In [9]:
# Select every column whose dtype is not numeric.
data_column_category = database.select_dtypes(exclude=[np.number]).columns

# Encode each non-numeric column as integer codes.
from sklearn.preprocessing import LabelEncoder

# One encoder per column, kept in a dict keyed by column name, so that every
# mapping can be inspected (`classes_`) or reversed (`inverse_transform`) later.
# Re-fitting a single shared encoder would only retain the last column's mapping.
# NOTE(review): `ID` looks like a record identifier; encoding (and later scaling)
# it is probably not meaningful -- confirm whether it should be dropped instead.
label_encoders = {}
for column in data_column_category:
    label_encoder = LabelEncoder()
    database[column] = label_encoder.fit_transform(database[column])
    label_encoders[column] = label_encoder

database.head()

Unnamed: 0,diagnosis,ID,imaginary_min,imaginary_avg,real_min,real_avg,gender,age,smoking,age_bin
0,1,95,-320.61,-300.563531,-495.26,-464.171991,1.0,77,2.0,6
1,1,96,-325.39,-314.75036,-473.73,-469.26314,0.0,72,2.0,6
2,1,97,-323.0,-317.436056,-476.12,-471.897667,1.0,73,3.0,6
3,1,98,-327.78,-317.39967,-473.73,-468.856388,1.0,76,2.0,6
4,1,99,-325.39,-316.155785,-478.52,-472.869783,0.0,65,2.0,5


### *4. Normalization*

In [10]:
from sklearn import preprocessing

# Min-max normalization: rescale every column with MinMaxScaler
# (default feature_range is (0, 1)).
scaler = preprocessing.MinMaxScaler()
names = database.columns
d = scaler.fit_transform(database)

# fit_transform returns a bare NumPy array, so restore both the column labels
# and the row index; this keeps `normalization` aligned with `database` even
# if the frame does not carry a default 0..n-1 index.
normalization = pd.DataFrame(d, columns=names, index=database.index)
normalization.head()

Unnamed: 0,diagnosis,ID,imaginary_min,imaginary_avg,real_min,real_avg,gender,age,smoking,age_bin
0,0.333333,0.238693,0.148999,0.268369,0.225783,0.022694,1.0,0.789474,0.5,0.75
1,0.333333,0.241206,0.106453,0.131008,0.262722,0.010852,0.0,0.723684,0.5,0.75
2,0.333333,0.243719,0.127726,0.105004,0.258621,0.004724,1.0,0.736842,1.0,0.75
3,0.333333,0.246231,0.08518,0.105356,0.262722,0.011798,1.0,0.776316,0.5,0.75
4,0.333333,0.248744,0.106453,0.1174,0.254504,0.002463,0.0,0.631579,0.5,0.625


# **Task 3: Standardization**

### *1. Import database*

In [61]:
# Reload the cleaned dataset produced by the Data Exploration notebook so this
# task starts again from the original, un-encoded values.
data = pd.read_csv('clean_data.csv')

# Work on an explicit, independent copy: later cells re-assign columns on
# `database`, and a deep copy guarantees the raw frame `data` stays untouched.
columnsNames = data.columns
database = data.copy()

# Preview the first rows of the working copy.
database.head()

Unnamed: 0,diagnosis,ID,imaginary_min,imaginary_avg,real_min,real_avg,gender,age,smoking,age_bin
0,COPD,301-4,-320.61,-300.563531,-495.26,-464.171991,1.0,77,2.0,70-79
1,COPD,302-3,-325.39,-314.75036,-473.73,-469.26314,0.0,72,2.0,70-79
2,COPD,303-3,-323.0,-317.436056,-476.12,-471.897667,1.0,73,3.0,70-79
3,COPD,304-4,-327.78,-317.39967,-473.73,-468.856388,1.0,76,2.0,70-79
4,COPD,305-4,-325.39,-316.155785,-478.52,-472.869783,0.0,65,2.0,60-69


### *2. Check the type of the columns*

In [62]:
# Per-column summary: does the column contain any missing value, and what is its dtype?
null_ = database.isna().any()
dtypes = database.dtypes

# Put both Series side by side; the mapping keys become the column labels.
info = pd.concat({'Null': null_, 'type': dtypes}, axis=1)
print(info)

                Null     type
diagnosis      False   object
ID             False   object
imaginary_min  False  float64
imaginary_avg  False  float64
real_min       False  float64
real_avg       False  float64
gender         False  float64
age            False    int64
smoking        False  float64
age_bin        False   object


### *3. Convert all non-numeric columns to numeric*

In [63]:
# Select every column whose dtype is not numeric.
data_column_category = database.select_dtypes(exclude=[np.number]).columns

# Encode each non-numeric column as integer codes.
from sklearn.preprocessing import LabelEncoder

# One encoder per column, kept in a dict keyed by column name, so that every
# mapping can be inspected (`classes_`) or reversed (`inverse_transform`) later.
# Re-fitting a single shared encoder would only retain the last column's mapping.
# NOTE(review): `ID` looks like a record identifier; encoding (and later scaling)
# it is probably not meaningful -- confirm whether it should be dropped instead.
label_encoders = {}
for column in data_column_category:
    label_encoder = LabelEncoder()
    database[column] = label_encoder.fit_transform(database[column])
    label_encoders[column] = label_encoder

database.head()

Unnamed: 0,diagnosis,ID,imaginary_min,imaginary_avg,real_min,real_avg,gender,age,smoking,age_bin
0,1,95,-320.61,-300.563531,-495.26,-464.171991,1.0,77,2.0,6
1,1,96,-325.39,-314.75036,-473.73,-469.26314,0.0,72,2.0,6
2,1,97,-323.0,-317.436056,-476.12,-471.897667,1.0,73,3.0,6
3,1,98,-327.78,-317.39967,-473.73,-468.856388,1.0,76,2.0,6
4,1,99,-325.39,-316.155785,-478.52,-472.869783,0.0,65,2.0,5


### *4. Standardization*

In [64]:
# Standardization: use StandardScaler from sklearn.preprocessing and its
# fit_transform() method to standardize every column.

from sklearn import preprocessing

# Keep a handle on the fitted scaler (instead of discarding it) so the same
# transformation can be reversed with inverse_transform() or re-applied to
# other data with transform().
std_scaler = preprocessing.StandardScaler()
std_scale = std_scaler.fit_transform(database)

# fit_transform returns a bare NumPy array, so restore both the column labels
# and the row index to keep `scaled_frame` aligned with `database`.
scaled_frame = pd.DataFrame(std_scale, columns=database.columns, index=database.index)

scaled_frame.head()

Unnamed: 0,diagnosis,ID,imaginary_min,imaginary_avg,real_min,real_avg,gender,age,smoking,age_bin
0,-0.589341,-0.902927,0.635816,0.282133,1.307657,-0.275274,1.22859,1.52414,0.36855,1.419456
1,-0.589341,-0.894245,0.356005,-0.819933,1.612371,-0.509001,-0.813941,1.254506,0.36855,1.419456
2,-0.589341,-0.885563,0.495911,-1.028564,1.578545,-0.629948,1.22859,1.308433,1.717648,1.419456
3,-0.589341,-0.876881,0.2161,-1.025738,1.612371,-0.490327,1.22859,1.470213,0.36855,1.419456
4,-0.589341,-0.868199,0.356005,-0.92911,1.544578,-0.674576,-0.813941,0.877019,0.36855,0.894073


# **Other required techniques**

*Convert non-numeric columns to numeric columns*