In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Step 1: Load Dataset
file_path = 'Heart.csv'
df = pd.read_csv(file_path)

In [4]:
# Step 2: Inspect Data 
print('Data Info:')
print(df.info())
print('\nData Stats:')
print(df.describe())
print('\nFirst 5 rows:')
print(df.head())

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  303 non-null    int64  
 1   Age         303 non-null    int64  
 2   Sex         303 non-null    int64  
 3   ChestPain   303 non-null    object 
 4   RestBP      303 non-null    int64  
 5   Chol        303 non-null    int64  
 6   Fbs         303 non-null    int64  
 7   RestECG     303 non-null    int64  
 8   MaxHR       303 non-null    int64  
 9   ExAng       303 non-null    int64  
 10  Oldpeak     303 non-null    float64
 11  Slope       303 non-null    int64  
 12  Ca          299 non-null    float64
 13  Thal        301 non-null    object 
 14  AHD         303 non-null    object 
dtypes: float64(2), int64(10), object(3)
memory usage: 35.6+ KB
None

Data Stats:
       Unnamed: 0         Age         Sex      RestBP        Chol         Fbs  \
count  303.000000  303.0

In [5]:
# Step 3: Save Data in Various Formats
df.to_excel('dataset.xlsx', index=False)
df.to_json('dataset.json', orient='records', indent=2)

from sqlalchemy import create_engine
engine = create_engine("sqlite:///:memory:")
df.to_sql('dataset', con=engine, index=False, if_exists='replace')

303

In [6]:
# Step 4: Reload Data to Verify 
df_excel = pd.read_excel('dataset.xlsx')
df_json = pd.read_json('dataset.json')
df_sql = pd.read_sql('dataset', con=engine)

In [7]:
# Step 5: Clean and Prepare Data
# Handle Missing Values
df.fillna(method='ffill', inplace=True)
# Remove Duplicates
df.drop_duplicates(inplace=True)
# Normalize Numerical Data 
scaler = MinMaxScaler()
numerical_cols = df.select_dtypes(include=['number']).columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
# Encode Categorical Data
df = pd.get_dummies(df, drop_first=True)
# Final Dataset
print('Cleaned Data:')
print(df.head())
# Save the cleaned dataset
df.to_csv('cleaned_dataset.csv', index=False)

Cleaned Data:
   Unnamed: 0       Age  Sex    RestBP      Chol  Fbs  RestECG     MaxHR  \
0    0.000000  0.708333  1.0  0.481132  0.244292  1.0      1.0  0.603053   
1    0.003311  0.791667  1.0  0.622642  0.365297  0.0      1.0  0.282443   
2    0.006623  0.791667  1.0  0.245283  0.235160  0.0      1.0  0.442748   
3    0.009934  0.166667  1.0  0.339623  0.283105  0.0      0.0  0.885496   
4    0.013245  0.250000  0.0  0.339623  0.178082  0.0      1.0  0.770992   

   ExAng   Oldpeak  Slope        Ca  ChestPain_nonanginal  \
0    0.0  0.370968    1.0  0.000000                 False   
1    1.0  0.241935    0.5  1.000000                 False   
2    1.0  0.419355    0.5  0.666667                 False   
3    0.0  0.564516    1.0  0.000000                  True   
4    0.0  0.225806    0.0  0.000000                 False   

   ChestPain_nontypical  ChestPain_typical  Thal_normal  Thal_reversable  \
0                 False               True        False            False   
1         

  df.fillna(method='ffill', inplace=True)
