**Importing the Datafile**

In [1]:
# import appropriate packages
import numpy as np
import pandas as pd

In [2]:
# read the p1iqr CSV file (a .csv, not an Excel workbook); index_col=0 makes
# the file's first column the row index
dfx = pd.read_csv("../csv_files/p1iqr.csv", index_col=0)
# preview the first 10 rows
dfx.head(10)

Unnamed: 0,C1,C4,C7,C5p,C6p,T4p,T3p,T5p,S1p,S2p,S3p,C2,C3p
0,122.0,0.029074,51.345,3.864345,11.111111,0.640426,0.908876,0.05425,0.004875,0.009199,0.010929,1.0,0
1,254.375,-0.013352,25.936,7.882541,0.0,0.644753,0.898724,0.051395,0.003258,0.011105,0.010876,0.0,1
2,90.0,0.020715,7.378,3.369134,0.0,0.636816,0.90935,0.061764,0.009423,0.006271,0.011403,1.0,1
3,209.0,0.020023,8.526,3.299697,0.0,0.539634,0.91706,0.06163,0.009423,0.007144,0.013319,1.0,1
4,80.0,-0.034895,632.298,3.726269,5.0,0.587413,0.888469,0.04855,0.004518,0.010047,0.011261,1.0,0
5,94.0,-0.057556,197.591,5.724394,0.0,0.643137,0.843465,0.04528,0.003445,0.005334,0.010056,1.0,0
6,128.0,0.004106,5.146,3.463095,0.0,0.765657,0.885455,0.044295,0.005204,0.006727,0.008567,1.0,1
7,91.0,0.04062,103.833,2.421469,0.0,0.817198,0.895778,0.033557,0.004899,0.010689,0.010868,0.0,1
8,100.0,-0.029316,279.6,4.816591,0.0,0.540323,0.908864,0.061905,0.003915,0.008371,0.015797,1.0,1
9,106.0,-0.024925,494.008,3.056608,0.0,0.514286,0.887563,0.050252,0.008868,0.009853,0.016094,1.0,0


In [3]:
# set the two binary columns (C2, C3p) aside in their own dataframe;
# they are concatenated back onto the scaled columns later in the notebook
dfbinary = dfx.filter(items=['C2', 'C3p'])
dfbinary.head(n=5)

Unnamed: 0,C2,C3p
0,1.0,0
1,0.0,1
2,1.0,1
3,1.0,1
4,1.0,0


In [4]:
# remove the binary columns so the working dataframe holds only the
# columns that will be rescaled
dfx = dfx.drop(['C2', 'C3p'], axis='columns')
dfx.head(n=5)

Unnamed: 0,C1,C4,C7,C5p,C6p,T4p,T3p,T5p,S1p,S2p,S3p
0,122.0,0.029074,51.345,3.864345,11.111111,0.640426,0.908876,0.05425,0.004875,0.009199,0.010929
1,254.375,-0.013352,25.936,7.882541,0.0,0.644753,0.898724,0.051395,0.003258,0.011105,0.010876
2,90.0,0.020715,7.378,3.369134,0.0,0.636816,0.90935,0.061764,0.009423,0.006271,0.011403
3,209.0,0.020023,8.526,3.299697,0.0,0.539634,0.91706,0.06163,0.009423,0.007144,0.013319
4,80.0,-0.034895,632.298,3.726269,5.0,0.587413,0.888469,0.04855,0.004518,0.010047,0.011261


In [5]:
# checking for null values: prints the number of nulls in each column,
# as a plain list in column order
print(dfx.isnull().sum(axis=0).tolist())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [6]:
# examining the column datatypes and the number of records
dfx.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 682 entries, 0 to 681
Data columns (total 11 columns):
C1     682 non-null float64
C4     682 non-null float64
C7     682 non-null float64
C5p    682 non-null float64
C6p    682 non-null float64
T4p    682 non-null float64
T3p    682 non-null float64
T5p    682 non-null float64
S1p    682 non-null float64
S2p    682 non-null float64
S3p    682 non-null float64
dtypes: float64(11)
memory usage: 63.9 KB


**Normalizing the Data**

In [7]:
# define the function that will normalize the data 
def normalize(col):
    """Mean-center a numeric column and scale it by its range.

    Computes ``(col - mean) / (max - min)`` element-wise.

    Parameters
    ----------
    col : pandas.Series or numpy.ndarray (or other numeric array-like)
        Values to normalize.

    Returns
    -------
    pandas.Series or numpy.ndarray
        The normalized values. A Series input keeps its index. When every
        value is identical (max == min) the result is all zeros rather than
        the NaN/inf a division by zero would produce.
    """
    mean = np.mean(col)
    # np.min / np.max use the vectorized reductions of the input; the Python
    # built-ins min()/max() iterate element by element and give
    # order-dependent results when a NaN is present (a leading NaN would turn
    # the whole output into NaN).
    low = np.min(col)
    high = np.max(col)
    spread = high - low
    centered = col - mean
    if spread == 0:
        # constant column: multiply by 0 instead of dividing by 0, which also
        # discards any floating-point residue left by the mean
        return centered * 0
    return centered / spread

In [9]:
# run normalize() over every continuous column and gather the results
# into one dataframe (column order follows the `columns` list)
columns = ['C1', 'C4', 'C5p', 'C6p', 'C7', 'T3p', 'T4p', 'T5p', 'S1p', 'S2p', 'S3p']
dfx_norm = pd.DataFrame(
    {name: normalize(dfx[name]) for name in columns},
    columns=columns,
)
dfx_norm.head(n=5)

Unnamed: 0,C1,C4,C5p,C6p,C7,T3p,T4p,T5p,S1p,S2p,S3p
0,-0.014916,0.12483,-0.020758,0.391237,-0.229828,0.138214,0.031337,0.061905,-0.047511,0.017797,-0.036163
1,0.526772,-0.120852,0.507999,-0.230985,-0.268208,0.042365,0.042739,-0.016366,-0.238552,0.153019,-0.040186
2,-0.145862,0.076423,-0.085923,-0.230985,-0.296238,0.142686,0.021828,0.267938,0.489965,-0.189876,-0.000178
3,0.341095,0.072416,-0.095061,-0.230985,-0.294504,0.215479,-0.23419,0.264268,0.489965,-0.127992,0.145308
4,-0.186783,-0.245605,-0.038928,0.049015,0.647673,-0.054455,-0.108321,-0.094369,-0.089669,0.07798,-0.010925


In [10]:
# import the preprocessing package from sklearn
from sklearn import preprocessing

# scale all columns to have a range of 0-1
x = dfx[columns].values
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
# x_scaled is built from a bare array (.values), so it carries no row labels.
# Re-attach dfx's index explicitly: without it dfx_scaled would get a fresh
# 0..n-1 index, and the later axis=1 concat with dfbinary (which still carries
# dfx's index) would misalign rows whenever that index is not exactly 0..n-1.
dfx_scaled = pd.DataFrame(x_scaled, columns=columns, index=dfx.index)
dfx_scaled.head()

Unnamed: 0,C1,C4,C5p,C6p,C7,T3p,T4p,T5p,S1p,S2p,S3p
0,0.458312,0.630418,0.471242,0.622222,0.077442,0.617569,0.534307,0.567375,0.462525,0.532463,0.478776
1,1.0,0.384735,1.0,0.0,0.039063,0.52172,0.545709,0.489104,0.271483,0.667685,0.474754
2,0.327366,0.582011,0.406077,0.0,0.011032,0.62204,0.524798,0.773408,1.0,0.32479,0.514761
3,0.814322,0.578004,0.39694,0.0,0.012766,0.694834,0.26878,0.769738,1.0,0.386674,0.660248
4,0.286445,0.259982,0.453073,0.28,0.954944,0.4249,0.394649,0.411101,0.420367,0.592646,0.504014


In [11]:
# verify that the min and max values for each column are 0 and 1 respectively
# (read the `min` and `max` rows of the summary table)
dfx_scaled.describe()

Unnamed: 0,C1,C4,C5p,C6p,C7,T3p,T4p,T5p,S1p,S2p,S3p
count,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0
mean,0.473228,0.505588,0.492001,0.230985,0.307271,0.479355,0.50297,0.50547,0.510035,0.514666,0.514939
std,0.253023,0.186131,0.21863,0.34428,0.336886,0.208693,0.18531,0.188978,0.19883,0.194852,0.194799
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.306905,0.384735,0.338217,0.0,0.062589,0.375,0.375,0.375,0.375,0.375,0.375
50%,0.396931,0.514892,0.4537,0.0,0.156723,0.504929,0.504247,0.499945,0.480602,0.488368,0.505726
75%,0.584143,0.641226,0.60293,0.4,0.437554,0.625,0.625,0.625,0.625,0.625,0.625
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# concat scaled dataframe with binary dataframe, side by side (axis=1)
# NOTE(review): an axis=1 concat matches rows on index labels, not on
# position -- confirm that dfx_scaled and dfbinary share the same index
# dfx is rebound here: from this cell on it holds the scaled columns plus C2/C3p
dfx = pd.concat([dfx_scaled, dfbinary], axis=1)
dfx.head()

Unnamed: 0,C1,C4,C5p,C6p,C7,T3p,T4p,T5p,S1p,S2p,S3p,C2,C3p
0,0.458312,0.630418,0.471242,0.622222,0.077442,0.617569,0.534307,0.567375,0.462525,0.532463,0.478776,1.0,0
1,1.0,0.384735,1.0,0.0,0.039063,0.52172,0.545709,0.489104,0.271483,0.667685,0.474754,0.0,1
2,0.327366,0.582011,0.406077,0.0,0.011032,0.62204,0.524798,0.773408,1.0,0.32479,0.514761,1.0,1
3,0.814322,0.578004,0.39694,0.0,0.012766,0.694834,0.26878,0.769738,1.0,0.386674,0.660248,1.0,1
4,0.286445,0.259982,0.453073,0.28,0.954944,0.4249,0.394649,0.411101,0.420367,0.592646,0.504014,1.0,0


In [13]:
# verify that new dfx dataframe has the scaled variables
# (the scaled columns should show min 0 / max 1; C2 and C3p are the binary
# columns that were set aside and not rescaled)
dfx.describe()

Unnamed: 0,C1,C4,C5p,C6p,C7,T3p,T4p,T5p,S1p,S2p,S3p,C2,C3p
count,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0
mean,0.473228,0.505588,0.492001,0.230985,0.307271,0.479355,0.50297,0.50547,0.510035,0.514666,0.514939,0.863636,0.463343
std,0.253023,0.186131,0.21863,0.34428,0.336886,0.208693,0.18531,0.188978,0.19883,0.194852,0.194799,0.343426,0.49902
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.306905,0.384735,0.338217,0.0,0.062589,0.375,0.375,0.375,0.375,0.375,0.375,1.0,0.0
50%,0.396931,0.514892,0.4537,0.0,0.156723,0.504929,0.504247,0.499945,0.480602,0.488368,0.505726,1.0,0.0
75%,0.584143,0.641226,0.60293,0.4,0.437554,0.625,0.625,0.625,0.625,0.625,0.625,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# write the combined dataframe (min-max scaled columns + binary columns) to CSV
dfx.to_csv('../csv_files/p1minmax.csv')