In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression


In [2]:
df=pd.read_csv("heat.csv")

In [3]:
df.head()

Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Result
0,63,1,66,160,83,160.0,1.8,0.012,negative
1,20,1,94,98,46,296.0,6.75,1.06,positive
2,56,1,64,160,77,270.0,1.99,0.003,negative
3,66,1,70,120,55,270.0,13.87,0.122,positive
4,54,1,64,112,65,300.0,1.08,0.003,negative


In [4]:
# from sklearn.model_selection import train_test_split
# train_set, test_set  = train_test_split(df, test_size=0.2, random_state=42)
# print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
labels = ['positive','negative']
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
print("Encoded labels:", encoded_labels)
encoded_target = label_encoder.fit_transform(df['Result'])

Encoded labels: [1 0]


In [7]:
encoded_target 

array([0, 1, 0, ..., 1, 1, 1])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Result']),  # Features
                                                    encoded_target,                   # Encoded target variable
                                                    test_size=0.2,
                                                    random_state=42)

In [9]:
X_test.head()

Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin
677,76,1,73,114,68,144.0,297.5,0.024
1046,30,0,68,91,61,93.0,3.93,0.003
610,50,1,63,98,57,111.0,2.55,0.006
49,38,0,80,152,78,133.0,1.19,0.003
1284,29,1,81,150,51,100.0,6.48,0.003


In [10]:
trf1 = ColumnTransformer([
    ('impute_Age',SimpleImputer(),[0]),
    ('impute_Troponin',SimpleImputer(strategy='most_frequent'),[7])
],remainder='passthrough')

In [11]:
trf2 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,3))
])

In [12]:
trf3 = SelectKBest(score_func=chi2,k='all')

In [13]:
# trf11 = LinearRegression()
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
trf4=RandomForestClassifier()
# trf4=DecisionTreeClassifier()
# trf4=LogisticRegression()

In [14]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4)
])    

In [15]:
pipe.fit(X_train,y_train)

In [16]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_Age', SimpleImputer(), [0]),
                                 ('impute_Troponin',
                                  SimpleImputer(strategy='most_frequent'),
                                  [7])]),
 'trf2': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 3, None))]),
 'trf3': SelectKBest(k='all', score_func=<function chi2 at 0x00000284D18D60C0>),
 'trf4': RandomForestClassifier()}

In [17]:
from sklearn import set_config
set_config(display='diagram')

In [18]:
y_pred = pipe.predict(X_test)

In [19]:
y_pred

array([1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1])

In [43]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score

from sklearn.preprocessing import PowerTransformer
lr=RandomForestClassifier()
lr.fit(X_train,y_train)
y_pred=lr.predict(X_test)

r2_score(y_test,y_pred)

0.9198202028791836

In [45]:
# lr=LinearRegression()
# np.mean(cross_val_score(lr,X_train,y_train,scoring='r2'))

In [46]:
pt = PowerTransformer(method='box-cox')

X_train_transformed = pt.fit_transform(X_train+0.000001)
X_test_transformed = pt.transform(X_test+0.000001)

pd.DataFrame({'cols':X_train.columns,'box_cox_lambdas':pt.lambdas_})

Unnamed: 0,cols,box_cox_lambdas
0,Age,1.278035
1,Gender,0.147133
2,Heart rate,-0.468275
3,Systolic blood pressure,0.167947
4,Diastolic blood pressure,0.273634
5,Blood sugar,-0.824089
6,CK-MB,-0.376004
7,Troponin,-0.272247


In [48]:
pt = PowerTransformer()
X_transformed2 = pt.fit_transform(X_train)
from sklearn.tree import DecisionTreeRegressor
# lr = LinearRegression()
lr=DecisionTreeClassifier()
np.mean(cross_val_score(lr,X_transformed2,y_train,scoring='r2'))

0.968084704102855

In [50]:
pt = PowerTransformer()
X_transformed2 = pt.fit_transform(X_train)
from sklearn.tree import DecisionTreeClassifier
lr = RandomForestClassifier()
# lr=DecisionTreeClassifier()
np.mean(cross_val_score(lr,X_transformed2,y_train,scoring='r2'))

0.9441118614724211

In [51]:
# from sklearn.metrics import accuracy_score
# accuracy_score(y_test,y_pred)

In [52]:
params = {
    'trf4':[1,2,3,4,5,None]
}

In [53]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))

In [54]:
from sklearn.metrics import precision_score,recall_score,f1_score

In [40]:
# precision_score(y_test,y_pred,average='macro')

ValueError: Classification metrics can't handle a mix of binary and continuous targets