In [38]:
import pandas as pd

# Load the raw cardio dataset from the CSV file in the working directory.
csv_path = 'cardio_dataset-original.csv'
data = pd.read_csv(csv_path)

# Step 1 - Handling Outliers

In [9]:
# Keep only the rows whose AGEIR value lies strictly inside
# mean +/- factor * std of the column. The result starts the `data_new`
# chain that the following cells keep narrowing.
factor = 4  # half-width of the accepted band, in standard deviations

ageir = data['AGEIR']
ageir_band = ageir.std() * factor
upper_lim = ageir.mean() + ageir_band
lower_lim = ageir.mean() - ageir_band

data_new = data[ageir.gt(lower_lim) & ageir.lt(upper_lim)]

In [10]:
# Same band filter as the previous cell, applied to TC: the mean and std are
# taken from the already-filtered frame, and rows outside
# mean +/- factor * std (bounds excluded) are dropped.
tc = data_new['TC']
tc_band = tc.std() * factor
upper_lim = tc.mean() + tc_band
lower_lim = tc.mean() - tc_band

data_new = data_new[tc.gt(lower_lim) & tc.lt(upper_lim)]

In [11]:
# Band filter on HDL: statistics come from the current `data_new`, and only
# rows strictly inside mean +/- factor * std survive.
hdl = data_new['HDL']
hdl_band = hdl.std() * factor
upper_lim = hdl.mean() + hdl_band
lower_lim = hdl.mean() - hdl_band

data_new = data_new[hdl.gt(lower_lim) & hdl.lt(upper_lim)]

In [12]:
# Band filter on RISK: rows whose RISK is not strictly inside
# mean +/- factor * std of the current `data_new` are dropped.
# NOTE(review): RISK appears to be the column later used as the regression
# target, so this trims rows by target value — confirm that is intended.
risk = data_new['RISK']
risk_band = risk.std() * factor
upper_lim = risk.mean() + risk_band
lower_lim = risk.mean() - risk_band

data_new = data_new[risk.gt(lower_lim) & risk.lt(upper_lim)]

# Step 2 - Categorical Encoding

In [13]:
# Mark the four coded columns as categorical with a single astype call.
# astype returns a new DataFrame, so nothing is written into `data_new`
# while it is still a boolean-filtered selection of an earlier frame;
# assigning the columns one by one there can raise SettingWithCopyWarning.
categorical_columns = ["SEX", "SMOKE_", "BPMED", "DIAB_noyes"]
data_new = data_new.astype({column: 'category' for column in categorical_columns})
print(data_new.dtypes)

SEX           category
AGEIR            int64
TC               int64
HDL              int64
SMOKE_        category
BPMED         category
DIAB_noyes    category
RISK           float64
dtype: object


In [18]:
# Replace each categorical column with its integer category codes.
# assign() builds a new DataFrame (existing columns keep their position)
# instead of writing into `data_new` in place, which avoids
# SettingWithCopyWarning on the filtered frame.
# astype('category') is a no-op for columns that are already categorical and
# lets this cell be re-run after the codes were assigned; the bare `.cat`
# accessor raises AttributeError on integer columns.
# NOTE(review): missing values are encoded as -1 by `.cat.codes` — confirm
# none of these columns contain NaN.
categorical_columns = ["SEX", "SMOKE_", "BPMED", "DIAB_noyes"]
data_new = data_new.assign(
    **{
        column: data_new[column].astype('category').cat.codes
        for column in categorical_columns
    }
)
data_new.head()

Unnamed: 0,SEX,AGEIR,TC,HDL,SMOKE_,BPMED,DIAB_noyes,RISK
0,0,48,236,66,0,1,0,1.1
1,1,48,260,51,0,1,1,7.0
2,1,44,187,49,1,1,0,7.0
3,0,42,216,57,1,1,0,0.4
4,0,56,156,42,0,1,0,2.2


# Step 3 - Scaling

In [19]:
# Convert the prepared frame to a NumPy array and split it by position:
# columns 0-6 become the model inputs and column 7 becomes the target.
# NOTE: this rebinds `data`, which until now held the raw DataFrame read
# from the CSV; the outlier cells cannot be re-run without reloading it.
dataset = data_new.values  # numpy

feature_count = 7
data, target = dataset[:, :feature_count], dataset[:, feature_count]

print(data)

[[  0.  48. 236. ...   0.   1.   0.]
 [  1.  48. 260. ...   0.   1.   1.]
 [  1.  44. 187. ...   1.   1.   0.]
 ...
 [  1.  65. 212. ...   1.   0.   0.]
 [  1.  66. 184. ...   0.   0.   0.]
 [  0.  45. 203. ...   0.   1.   0.]]


In [20]:
from sklearn.preprocessing import QuantileTransformer

# Quantile-transform every feature column to a normal output distribution.
# The fitted transformer is kept because it is saved to disk at the end.
# NOTE(review): it is fitted on all rows, before the train/test split below.
model_qntl_data = QuantileTransformer(random_state=0, output_distribution='normal')
model_qntl_data.fit(data)
data_scaled = model_qntl_data.transform(data)

In [40]:
import numpy as np
# Quantile-transform the target with its own transformer so predictions can
# be mapped back later; the 1-D target is reshaped into a single column first.
model_qntl_target = QuantileTransformer(random_state=0, output_distribution='normal')
target_column = target.reshape(-1, 1)
target_scaled = model_qntl_target.fit(target_column).transform(target_column)
# Show the range of the transformed target as a quick sanity check.
print(target_scaled.max(), target_scaled.min())

5.19933758270342 -5.199337582605575


# Step 4 - Polynomial Features

In [22]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the scaled features into all polynomial terms up to degree 3,
# leaving out the constant bias column. The fitted expander is saved later.
model_poly = PolynomialFeatures(include_bias=False, degree=3)
data_high = model_poly.fit(data_scaled).transform(data_scaled)

# Step 5 - Training

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation. random_state pins the shuffle so
# the split, and the r2 score computed from it, is the same on every run
# (0 matches the seed used for the quantile transformers).
train_data, test_data, train_target, test_target = train_test_split(
    data_high, target_scaled, test_size=0.1, random_state=0
)

In [36]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit returns the estimator
# itself), then predict the held-out rows.
model = LinearRegression().fit(train_data, train_target)
predicted_target = model.predict(test_data)

In [37]:
from sklearn.metrics import r2_score

# Score the held-out predictions against the true (scaled) target values.
r2 = r2_score(y_true=test_target, y_pred=predicted_target)
print("r2 score:", r2)

r2 score: 0.9115607650538162


# Step 6 - Save all the model files

In [41]:
import joblib

# Persist the regression model and the three fitted preprocessing objects,
# one file each, in the same order as before.
artifacts = {
    'heart_risk_regression.sav': model,
    'model_poly.sav': model_poly,
    'model_qntl_data.sav': model_qntl_data,
    'model_qntl_target.sav': model_qntl_target,
}
for filename, fitted_object in artifacts.items():
    written = joblib.dump(fitted_object, filename)
# Leave the last dump's return value as the cell output.
written

['model_qntl_target.sav']