# Naming Columns

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
# Make the Google Drive contents available under /content/drive.
drive.mount('/content/drive')

# Source data file and destination of the labelled copy.
input_path = "/content/drive/My Drive/Rtfp/processed.cleveland.data"
output_path = "/content/drive/My Drive/Rtfp/cleveland_read.csv"

# Headers to attach to the 14 columns of the data file, in file order.
column_names = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target",
]

# Passing `names=` supplies the headers, so every line of the file is read as data.
df = pd.read_csv(input_path, names=column_names)

# Collapse the target to binary: 1 for any value greater than 0, otherwise 0.
df["target"] = df["target"].gt(0).astype(int)

# NOTE(review): this export also writes the row index as an extra first column,
# unlike the later exports in this notebook which pass index=False — confirm
# whether any consumer of cleveland_read.csv relies on that column.
df.to_csv(output_path)


Mounted at /content/drive


# Feature Selection


In [None]:
# Columns kept for modelling; "trestbps" and "restecg" are left out.
selected_features = ["age", "sex", "cp", "thalach", "exang", "oldpeak",
                     "slope", "ca", "thal", "chol", "fbs", "target"]

# Take an explicit copy so df_selected is an independent frame. Without it,
# later in-place edits of df_selected operate on a frame derived from `df`
# and pandas emits SettingWithCopyWarning.
df_selected = df[selected_features].copy()
output_path_selected="/content/drive/My Drive/Rtfp/selected_cleveland.csv"

# Persist the reduced feature set (no index column).
df_selected.to_csv(output_path_selected, index=False)


# Handling Missing Values


In [None]:

# Cells treated as missing: the word "unknown" (any case), empty or
# whitespace-only strings, and a lone "?".
missing_pattern = r'(?i)^unknown$|^\s*$|^\?$'

# Build a new frame instead of mutating in place: df_selected was derived from
# `df` by column selection, and in-place replace/dropna on it raises
# SettingWithCopyWarning. Reassigning also keeps this cell safe to re-run.
df_selected = (
    df_selected
    .replace(missing_pattern, pd.NA, regex=True)
    .dropna()
)

# Overwrite the selected-features export with the cleaned rows.
df_selected.to_csv(output_path_selected, index=False)

# Sanity checks: no placeholder or null cells should remain.
print("Leftover '?' in df_selected:", (df_selected == '?').sum().sum())
print("Null values after cleaning:", df_selected.isnull().sum().sum())
print("Cleaned shape:", df_selected.shape)



Leftover '?' in df_selected: 0
Null values after cleaning: 0
Cleaned shape: (297, 12)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected.replace(r'(?i)^unknown$|^\s*$|^\?$', pd.NA, regex=True, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected.dropna(inplace=True)


# Encoding and Scaling

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd


# Re-assert the column set so this cell does not depend on column order or on
# extra columns left over from earlier experimentation.
selected_features = ["age", "sex", "cp", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "chol", "fbs", "target"]
df_selected = df_selected[selected_features]

# One-hot encode the categorical columns.
df_selected = pd.get_dummies(df_selected, columns=["cp", "thal"])

# If a "?" placeholder survived in `thal`, get_dummies turns it into a
# `thal_?` indicator column. Drop those rows and the indicator itself.
# This must inspect df_selected (the encoded frame) — the raw `df` never has
# this column — and must run before scaling so the scaler is not fitted on
# rows that are about to be removed.
if 'thal_?' in df_selected.columns:
    df_selected = df_selected[df_selected['thal_?'] != 1].drop(columns=['thal_?'])

# `slope` / `ca` are only label-encoded when they are still string-typed.
for col in ["slope", "ca"]:
    if df_selected[col].dtype == 'object':
        df_selected[col] = LabelEncoder().fit_transform(df_selected[col])

# Standardise the continuous features. `scaler` is kept as a notebook-level
# variable because a later cell persists it.
numeric_cols = ["age", "thalach", "oldpeak", "chol"]
scaler = StandardScaler()
df_selected[numeric_cols] = scaler.fit_transform(df_selected[numeric_cols])

# Write the fully processed dataset.
df_selected.to_csv("/content/drive/My Drive/Rtfp/final_cleveland.csv", index=False)

print("Null values after processing:", df_selected.isnull().sum())


Null values after processing: age         0
sex         0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
chol        0
fbs         0
target      0
cp_1.0      0
cp_2.0      0
cp_3.0      0
cp_4.0      0
thal_3.0    0
thal_6.0    0
thal_7.0    0
dtype: int64


In [None]:
import os

import joblib

# Persist the fitted scaler. Create the target directory first so the dump
# does not fail with FileNotFoundError when the `models` folder is missing.
scaler_path = '/content/drive/My Drive/Rtfp/models/scaler.pkl'
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
joblib.dump(scaler, scaler_path)

['/content/drive/My Drive/Rtfp/models/scaler.pkl']