In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np

def remove_outliers(df, threshold=3):
    """Return the rows of *df* with no numeric outliers, based on z-scores.

    A z-score is computed for every value in each numeric column as
    ``|value - column mean| / column standard deviation``. A row is kept
    only when all of its scored values are strictly below *threshold*.
    Non-numeric columns are ignored for scoring but are kept in the result.

    Args:
        df: DataFrame to filter. It is not modified.
        threshold: Z-score at or above which a value counts as an outlier.

    Returns:
        A DataFrame with the same columns as *df*, restricted to the rows
        that pass the filter.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    spread = df[numeric_cols].std()

    # A constant column has std == 0 and a one-row frame has std == NaN.
    # Dividing by either yields NaN z-scores, and ``NaN < threshold`` is
    # False, which would discard every row. Such columns cannot contain an
    # outlier, so they are left out of the scoring.
    scored_cols = spread[spread > 0].index

    centered = df[scored_cols] - df[scored_cols].mean()
    z_scores = (centered / spread[scored_cols]).abs()
    return df[(z_scores < threshold).all(axis=1)]

# Cleaning pipeline: load the raw CSV, drop incomplete rows, drop outlier
# rows, remove the participant ID column, and save the result.

# Load the dataset
data_path = '/content/Medical Data.csv'
medical_data = pd.read_csv(data_path)

print(f"Initial number of rows: {len(medical_data)}\n")

# Step 1: Check for null values and remove rows with any null values
print("Checking for null values...")
print(medical_data.isnull().sum())  # Display the count of null values for each column

# Remove rows with null values
medical_data_cleaned = medical_data.dropna()
print(f"\nRows after removing null values: {len(medical_data_cleaned)}\n")

# Step 2: Identify and remove outliers
# The participant ID is a label, not a measurement. It is numeric, so it
# would otherwise be z-scored with the real variables and could cause rows
# to be dropped purely because of their identifier. Score the data without
# it, then select the surviving rows by index label so this frame still
# carries the ID column (labels are unique: they come from read_csv's
# default index).
# NOTE(review): several remaining numeric columns (e.g. sex, job) look like
# coded categories - confirm whether they should be z-scored at all.
columns_to_drop = ['id']
scoring_frame = medical_data_cleaned.drop(columns=columns_to_drop, errors='ignore')
kept_rows = remove_outliers(scoring_frame).index
medical_data_no_outliers = medical_data_cleaned.loc[kept_rows]
print(f"Rows after removing outliers: {len(medical_data_no_outliers)}\n")

# Step 3: Remove irrelevant columns (participant ID)
# errors='ignore' keeps this step a no-op when the column is absent.
medical_data_final = medical_data_no_outliers.drop(columns=columns_to_drop, errors='ignore')
print(f"Columns after removing irrelevant ones: {list(medical_data_final.columns)}\n")

# Save the cleaned dataset for further use
output_path = '/content/Cleaned_Medical_Data.csv'
medical_data_final.to_csv(output_path, index=False)  # index=False: do not write the row labels
print(f"Cleaned data saved to {output_path}")


Initial number of rows: 886

Checking for null values...
id           0
age          0
year         0
sex          0
glang        0
part         0
job          0
stud_h       0
health       0
psyt         0
jspe         0
qcae_cog     0
qcae_aff     0
amsp         0
erec_mean    0
cesd         0
stai_t       0
mbi_ex       0
mbi_cy       0
mbi_ea       0
dtype: int64

Rows after removing null values: 886

Rows after removing outliers: 830

Columns after removing irrelevant ones: ['age', 'year', 'sex', 'glang', 'part', 'job', 'stud_h', 'health', 'psyt', 'jspe', 'qcae_cog', 'qcae_aff', 'amsp', 'erec_mean', 'cesd', 'stai_t', 'mbi_ex', 'mbi_cy', 'mbi_ea']

Cleaned data saved to /content/Cleaned_Medical_Data.csv
