In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv('1167330_AeryunPark.csv')

In [None]:
# Check dataset shape and structure
print ("Dataset Shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())

Dataset Shape: (1000, 20)

Column names:
['Patient_ID', 'Age', 'Gender', 'Weight_kg', 'Height_cm', 'BMI', 'Disease_Type', 'Severity', 'Physical_Activity_Level', 'Daily_Caloric_Intake', 'Cholesterol_mg/dL', 'Blood_Pressure_mmHg', 'Glucose_mg/dL', 'Dietary_Restrictions', 'Allergies', 'Preferred_Cuisine', 'Weekly_Exercise_Hours', 'Adherence_to_Diet_Plan', 'Dietary_Nutrient_Imbalance_Score', 'Diet_Recommendation']


In [None]:
# Display first few rows
df.head()

Unnamed: 0,Patient_ID,Age,Gender,Weight_kg,Height_cm,BMI,Disease_Type,Severity,Physical_Activity_Level,Daily_Caloric_Intake,Cholesterol_mg/dL,Blood_Pressure_mmHg,Glucose_mg/dL,Dietary_Restrictions,Allergies,Preferred_Cuisine,Weekly_Exercise_Hours,Adherence_to_Diet_Plan,Dietary_Nutrient_Imbalance_Score,Diet_Recommendation
0,P0001,56,Male,58.4,160,22.8,Obesity,Moderate,Moderate,3079,173.3,133,116.3,,Peanuts,Mexican,3.1,96.6,3.1,Balanced
1,P0002,69,Male,101.2,169,35.4,Diabetes,Mild,Moderate,3032,199.2,120,137.1,,Peanuts,Chinese,4.5,63.2,0.6,Low_Carb
2,P0003,46,Female,63.5,173,21.2,Hypertension,Mild,Sedentary,1737,181.0,121,109.6,,Peanuts,Chinese,3.8,57.5,4.6,Low_Sodium
3,P0004,32,Male,58.1,164,21.6,,Mild,Moderate,2657,168.2,144,159.4,,,Mexican,4.3,54.5,0.4,Balanced
4,P0005,60,Male,79.5,197,20.5,Diabetes,Moderate,Sedentary,3496,200.4,172,182.3,Low_Sugar,,Italian,9.8,78.2,4.7,Low_Carb


In [None]:
df.columns.tolist()

['Patient_ID',
 'Age',
 'Gender',
 'Weight_kg',
 'Height_cm',
 'BMI',
 'Disease_Type',
 'Severity',
 'Physical_Activity_Level',
 'Daily_Caloric_Intake',
 'Cholesterol_mg/dL',
 'Blood_Pressure_mmHg',
 'Glucose_mg/dL',
 'Dietary_Restrictions',
 'Allergies',
 'Preferred_Cuisine',
 'Weekly_Exercise_Hours',
 'Adherence_to_Diet_Plan',
 'Dietary_Nutrient_Imbalance_Score',
 'Diet_Recommendation']

In [None]:
df.dtypes

Patient_ID                           object
Age                                   int64
Gender                               object
Weight_kg                           float64
Height_cm                             int64
BMI                                 float64
Disease_Type                         object
Severity                             object
Physical_Activity_Level              object
Daily_Caloric_Intake                  int64
Cholesterol_mg/dL                   float64
Blood_Pressure_mmHg                   int64
Glucose_mg/dL                       float64
Dietary_Restrictions                 object
Allergies                            object
Preferred_Cuisine                    object
Weekly_Exercise_Hours               float64
Adherence_to_Diet_Plan              float64
Dietary_Nutrient_Imbalance_Score    float64
Diet_Recommendation                  object
dtype: object

In [None]:
# Get basic statistics
df.describe()

Unnamed: 0,Age,Weight_kg,Height_cm,BMI,Daily_Caloric_Intake,Cholesterol_mg/dL,Blood_Pressure_mmHg,Glucose_mg/dL,Weekly_Exercise_Hours,Adherence_to_Diet_Plan,Dietary_Nutrient_Imbalance_Score
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,49.857,84.6024,174.817,28.1916,2475.064,199.7179,144.993,136.8676,5.166,74.8843,2.4692
std,18.114267,20.088121,14.33377,8.040136,565.017032,29.080614,20.245712,37.934819,2.847995,14.82638,1.459631
min,18.0,50.0,150.0,13.0,1500.0,150.4,110.0,70.2,0.0,50.0,0.0
25%,35.0,66.6,162.0,22.075,1984.75,174.3,128.0,105.0,2.8,62.0,1.2
50%,50.0,85.2,175.0,27.45,2470.5,199.85,145.0,138.0,5.2,74.2,2.4
75%,66.0,102.0,187.0,33.425,2937.25,224.85,163.0,170.65,7.6,88.2,3.7
max,79.0,119.7,199.0,52.4,3498.0,249.9,179.0,200.0,10.0,100.0,5.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Patient_ID                        1000 non-null   object 
 1   Age                               1000 non-null   int64  
 2   Gender                            1000 non-null   object 
 3   Weight_kg                         1000 non-null   float64
 4   Height_cm                         1000 non-null   int64  
 5   BMI                               1000 non-null   float64
 6   Disease_Type                      796 non-null    object 
 7   Severity                          1000 non-null   object 
 8   Physical_Activity_Level           1000 non-null   object 
 9   Daily_Caloric_Intake              1000 non-null   int64  
 10  Cholesterol_mg/dL                 1000 non-null   float64
 11  Blood_Pressure_mmHg               1000 non-null   int64  
 12  Glucose

In [None]:
# Check for missing values
df.isnull().sum()

Patient_ID                            0
Age                                   0
Gender                                0
Weight_kg                             0
Height_cm                             0
BMI                                   0
Disease_Type                        204
Severity                              0
Physical_Activity_Level               0
Daily_Caloric_Intake                  0
Cholesterol_mg/dL                     0
Blood_Pressure_mmHg                   0
Glucose_mg/dL                         0
Dietary_Restrictions                334
Allergies                           323
Preferred_Cuisine                     0
Weekly_Exercise_Hours                 0
Adherence_to_Diet_Plan                0
Dietary_Nutrient_Imbalance_Score      0
Diet_Recommendation                   0
dtype: int64

In [None]:
# Check missing value patterns
missing_cols = ['Disease_Type', 'Dietary_Restrictions', 'Allergies']
for col in missing_cols:
    print(f"{col} unique value:")
    print(df[col].value_counts(dropna=False))
    print("-"*30)

Disease_Type unique value:
Disease_Type
Hypertension    316
Diabetes        258
Obesity         222
NaN             204
Name: count, dtype: int64
------------------------------
Dietary_Restrictions unique value:
Dietary_Restrictions
Low_Sodium    350
NaN           334
Low_Sugar     316
Name: count, dtype: int64
------------------------------
Allergies unique value:
Allergies
Gluten     346
Peanuts    331
NaN        323
Name: count, dtype: int64
------------------------------


In [None]:
# Change missing values to None
df['Disease_Type'] = df['Disease_Type'].fillna('None')
df['Dietary_Restrictions'] = df['Dietary_Restrictions'].fillna('None')
df['Allergies'] = df['Allergies'].fillna('None')

df.isnull().sum()

Patient_ID                          0
Age                                 0
Gender                              0
Weight_kg                           0
Height_cm                           0
BMI                                 0
Disease_Type                        0
Severity                            0
Physical_Activity_Level             0
Daily_Caloric_Intake                0
Cholesterol_mg/dL                   0
Blood_Pressure_mmHg                 0
Glucose_mg/dL                       0
Dietary_Restrictions                0
Allergies                           0
Preferred_Cuisine                   0
Weekly_Exercise_Hours               0
Adherence_to_Diet_Plan              0
Dietary_Nutrient_Imbalance_Score    0
Diet_Recommendation                 0
dtype: int64

In [13]:
# Check Patient_ID duplicates first (most important)
patient_duplicates = df['Patient_ID'].duplicated().sum()
print(f"Patient_ID Duplicate Count: {patient_duplicates}")
print("*" * 50)

Patient_ID Duplicate Count: 0
**************************************************


In [15]:
# Optional : Quick check for suspicious duplicates
suspicious_cols = ['Weight_kg', 'Height_cm', 'BMI']
for col in suspicious_cols:
    if col in df.columns:
        dup_count = df[col].duplicated().sum()
        print(f"{col}: {dup_count} duplicates")

Weight_kg: 462 duplicates
Height_cm: 950 duplicates
BMI: 686 duplicates


In [16]:
# Check if these duplicates are normal
print("== Duplicate Analysis ==")
total_rows = len(df)
print(f"Total patients: {total_rows}")

cols_to_check = ['Weight_kg', 'Height_cm', 'BMI']
for col in cols_to_check:
    if col in df.columns:
        dup_count = df[col].duplicated().sum()
        percentage = (dup_count / total_rows) * 100
        print(f"{col}: {dup_count} duplicates ({percentage:.1f})")

== Duplicate Analysis ==
Total patients: 1000
Weight_kg: 462 duplicates (46.2)
Height_cm: 950 duplicates (95.0)
BMI: 686 duplicates (68.6)


In [17]:
# Check actual height values and their frequencies
print("== Height Value Analysis ==")
print("Most common heights:")
print(df['Height_cm'].value_counts().head(10))

print(f"\nTotal unique height: {df['Height_cm'].nunique()}")
print(f"Height range: {df['Height_cm'].min()} - {df['Height_cm'].max()}")

== Height Value Analysis ==
Most common heights:
Height_cm
170    28
177    28
190    27
184    27
168    26
198    26
183    26
160    25
157    25
196    25
Name: count, dtype: int64

Total unique height: 50
Height range: 150 - 199


In [20]:
#Check for constant feature (columns where all values are the same)
constant_features = [col for col in df.columns if df[col].nunique() == 1]
print("Constant features:", constant_features)

Constant features: []


In [22]:
# IQR method for outlier detection
def find_outliers_IQR_method(input_df, variable):
    IQR = input_df[variable].quantile(0.75) - input_df[variable].quantile(0.25)
    lower_limit = input_df[variable].quantile(0.25) - (IQR*1.5)
    upper_limit = input_df[variable].quantile(0.75) + (IQR*1.5)
    return lower_limit, upper_limit

# Check outliers for numerical columns
numeriacl_cols = ['Age', 'Weight_kg', 'Height_cm', 'BMI', 'Daily_Caloric_Intake', 'Cholesterol_mg_dL', 
                  'Blood_Pressure_mmHg', 'Clucose_mg_dL', 'Weekly_Exercise_Hours']

for col in numeriacl_cols:
    if col in df.columns:
        lower, upper = find_outliers_IQR_method(df, col)
        outliers_count = len(df[(df[col] < lower) | (df[col] > upper)])
        print(f"{col}: {outliers_count} outliers (range: {lower:.1f} - {upper:.1f})")
        print("*"*50)

Age: 0 outliers (range: -11.5 - 112.5)
**************************************************
Weight_kg: 0 outliers (range: 13.5 - 155.1)
**************************************************
Height_cm: 0 outliers (range: 124.5 - 224.5)
**************************************************
BMI: 4 outliers (range: 5.1 - 50.4)
**************************************************
Daily_Caloric_Intake: 0 outliers (range: 556.0 - 4366.0)
**************************************************
Blood_Pressure_mmHg: 0 outliers (range: 75.5 - 215.5)
**************************************************
Weekly_Exercise_Hours: 0 outliers (range: -4.4 - 14.8)
**************************************************


In [25]:
# Show outlier details with related columns
print("\nBMI Outlier Details:")
outlier_details = bmi_outliers[['Patient_ID', 'BMI', 'Weight_kg', 'Height_cm', 'Age', 'Disease_Type']].copy()
print(outlier_details.sort_values('BMI'))


BMI Outlier Details:
    Patient_ID   BMI  Weight_kg  Height_cm  Age  Disease_Type
455      P0456  51.0      114.8        150   36  Hypertension
323      P0324  51.4      117.3        151   23  Hypertension
838      P0839  52.0      118.5        151   57      Diabetes
830      P0831  52.4      119.5        151   75       Obesity


In [26]:
# Verify BMI calculation in your dataset
print("== BMI Calculation Verification ==")
outlier_patients = ['P0456', 'P0324', 'P0839', 'P0831']

for patient_id in outlier_patients:
    patient_data = df[df['Patient_ID'] == patient_id]
    if not patient_data.empty:
        weight = patient_data['Weight_kg'].iloc[0]
        height_cm = patient_data['Height_cm'].iloc[0]
        height_m = height_cm / 100
        calculated_bmi = weight / (height_m ** 2)
        recorded_bmi = patient_data['BMI'].iloc[0]
        
        print(f"Patient {patient_id}:")
        print(f"  Calculated BMI: {calculated_bmi:.1f}")
        print(f"  Recorded BMI: {recorded_bmi:.1f}")
        print(f"  Match: {abs(calculated_bmi - recorded_bmi) < 0.1}")
        print()

== BMI Calculation Verification ==
Patient P0456:
  Calculated BMI: 51.0
  Recorded BMI: 51.0
  Match: True

Patient P0324:
  Calculated BMI: 51.4
  Recorded BMI: 51.4
  Match: True

Patient P0839:
  Calculated BMI: 52.0
  Recorded BMI: 52.0
  Match: True

Patient P0831:
  Calculated BMI: 52.4
  Recorded BMI: 52.4
  Match: True

