In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
     
# Read the dengue dataset from a CSV file (path is relative to the working
# directory) and preview its first three rows.
df = pd.read_csv(filepath_or_buffer="UpdatedDengue Fever.csv")
df.head(n=3)

Unnamed: 0,sub_id,sex,age,wgt,Height (in meter ),Body Mass Index (BMI),fever,platelet,hemato,wbc,...,s_rash,headache,abdo_pain,vomit,bleed,ns1_anti,igm_anti,igg_anti,hospi,target
0,DEN_0001,M,36,14,,,1,20000,40,6000,...,0,1,1,1,0,0,1,1,1,1
1,DEN_0002,M,48,15,,,1,18000,38,5500,...,0,0,0,1,0,1,0,0,1,1
2,DEN_0003,F,24,12,,,1,13000,45,6300,...,0,1,0,1,0,1,0,0,1,1


In [3]:
# Normalise the column labels: lower-case them and turn every space into an
# underscore. A trailing space in a label therefore becomes a trailing
# underscore (e.g. 'headache_'); later cells rely on these exact names.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_')
)
print(df.columns)

Index(['sub_id', 'sex', 'age', 'wgt', 'height_(in_meter_)',
       'body_mass_index_(bmi)', 'fever', 'platelet', 'hemato', 'wbc', 'j_pain',
       's_rash', 'headache_', 'abdo_pain', 'vomit', 'bleed', 'ns1_anti',
       'igm_anti', 'igg_anti', 'hospi', 'target'],
      dtype='object')


In [4]:
def calculate_height(weight):
    """Return a stand-in height equal to ten times ``weight``.

    NOTE(review): this is a fixed rule of thumb, not a measurement. The value
    is stored in a column labelled "in meter", yet a weight of 14 yields
    140.0 -- confirm the intended unit and whether this rule is acceptable.
    """
    return 10.0 * weight

# Fill the height column element-wise from the weight column.
df['height_(in_meter_)'] = df['wgt'].map(calculate_height)

In [5]:
# Preview the first two rows to check the derived height column.
df.head(n=2)

Unnamed: 0,sub_id,sex,age,wgt,height_(in_meter_),body_mass_index_(bmi),fever,platelet,hemato,wbc,...,s_rash,headache_,abdo_pain,vomit,bleed,ns1_anti,igm_anti,igg_anti,hospi,target
0,DEN_0001,M,36,14,140.0,,1,20000,40,6000,...,0,1,1,1,0,0,1,1,1,1
1,DEN_0002,M,48,15,150.0,,1,18000,38,5500,...,0,0,0,1,0,1,0,0,1,1


In [6]:
def calculate_bmi(row):
    """Return weight divided by height squared for a single record.

    ``row`` must support key lookup for 'wgt' and 'height_(in_meter_)'.

    NOTE(review): the formula matches BMI only if weight is in kilograms and
    height is in metres. The notebook's height values are around 140, which
    gives results near 0.0007 -- confirm the units.
    """
    mass, stature = row['wgt'], row['height_(in_meter_)']
    return mass / stature ** 2

# Evaluate calculate_bmi once per row and store the result in the
# 'body_mass_index_(bmi)' column.
df['body_mass_index_(bmi)'] = df.apply(func=calculate_bmi, axis='columns')

In [7]:
# Preview the first three rows to check the derived BMI column.
df.head(n=3)

Unnamed: 0,sub_id,sex,age,wgt,height_(in_meter_),body_mass_index_(bmi),fever,platelet,hemato,wbc,...,s_rash,headache_,abdo_pain,vomit,bleed,ns1_anti,igm_anti,igg_anti,hospi,target
0,DEN_0001,M,36,14,140.0,0.000714,1,20000,40,6000,...,0,1,1,1,0,0,1,1,1,1
1,DEN_0002,M,48,15,150.0,0.000667,1,18000,38,5500,...,0,0,0,1,0,1,0,0,1,1
2,DEN_0003,F,24,12,120.0,0.000833,1,13000,45,6300,...,0,1,0,1,0,1,0,0,1,1


In [8]:
# One-hot encode 'sex' into a new frame; there the 'sex' column is replaced by
# the indicator columns 'sex_F' and 'sex_M'.
df_encoded = pd.get_dummies(df, columns=['sex'], prefix=['sex'])

# Cast each indicator to integer 1/0 and mirror it onto df, which still keeps
# its original 'sex' column.
for indicator in ('sex_F', 'sex_M'):
    df_encoded[indicator] = df_encoded[indicator].astype(int)
    df[indicator] = df_encoded[indicator]

In [9]:
# Preview the first three rows to check the new sex_F / sex_M columns.
df.head(n=3)

Unnamed: 0,sub_id,sex,age,wgt,height_(in_meter_),body_mass_index_(bmi),fever,platelet,hemato,wbc,...,abdo_pain,vomit,bleed,ns1_anti,igm_anti,igg_anti,hospi,target,sex_F,sex_M
0,DEN_0001,M,36,14,140.0,0.000714,1,20000,40,6000,...,1,1,0,0,1,1,1,1,0,1
1,DEN_0002,M,48,15,150.0,0.000667,1,18000,38,5500,...,0,1,0,1,0,0,1,1,0,1
2,DEN_0003,F,24,12,120.0,0.000833,1,13000,45,6300,...,0,1,0,1,0,0,1,1,1,0


In [10]:
# Age bands in months, stored as (lower, upper) bounds.
age_groups = {
    'infants': (0, 24),            # Age <= 2 years (0-24 months)
    'toddlers': (25, 60),          # 2 years < Age <= 5 years (25-60 months)
    'preschoolers': (61, 84),     # 5 years < Age <= 7 years (61-84 months)
    'school-age': (85, 144)       # 7 years < Age <= 12 years (85-144 months)
}


def assign_age_group(age_in_months):
    """Return the name of the age band that ``age_in_months`` falls into.

    Bands are treated as contiguous half-open intervals
    (previous upper, upper], as the comments on ``age_groups`` describe, so
    a fractional age such as 24.5 months lands in 'toddlers' rather than in
    a gap between the integer bounds. Whole-month ages are classified
    exactly as the inclusive (lower, upper) bounds specify.

    Parameters
    ----------
    age_in_months : int or float
        Age in months.

    Returns
    -------
    str
        One of the keys of ``age_groups``, or 'unknown' when the age is below
        the smallest lower bound, above the largest upper bound, or NaN.
    """
    youngest = min(lower for lower, _ in age_groups.values())
    # `not (x >= y)` is also True for NaN, so missing ages fall through to
    # 'unknown' instead of being matched by the upper-bound scan below.
    if not age_in_months >= youngest:
        return 'unknown'

    # Scan bands from youngest to oldest; the first upper bound that is not
    # exceeded identifies the band.
    bands_by_upper = sorted(age_groups.items(), key=lambda item: item[1][1])
    for group, (_, upper) in bands_by_upper:
        if age_in_months <= upper:
            return group
    return 'unknown'  # Older than the last band
# Label every row with its age band, derived from the 'age' column.
df['Age Groups'] = df['age'].map(assign_age_group)

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,wgt,height_(in_meter_),body_mass_index_(bmi),fever,platelet,hemato,wbc,j_pain,s_rash,...,abdo_pain,vomit,bleed,ns1_anti,igm_anti,igg_anti,hospi,target,sex_F,sex_M
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,...,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,76.024,19.278,192.78,0.000557,0.946,31522.0,31.46,3898.85,0.338,0.202,...,0.342,0.696,0.206,0.428,0.19,0.106,0.622,0.866,0.396,0.604
std,41.421963,4.68921,46.892103,0.000171,0.226244,20296.438279,7.526067,2012.483567,0.473502,0.401894,...,0.474855,0.460443,0.404836,0.495284,0.392694,0.308146,0.485373,0.340993,0.489554,0.489554
min,4.0,6.0,60.0,0.000333,0.0,5000.0,12.0,1100.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,34.0,16.75,167.5,0.000455,1.0,16000.0,26.0,2000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,75.0,19.0,190.0,0.000526,1.0,25000.0,34.0,3600.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
75%,111.0,22.0,220.0,0.000597,1.0,38250.0,37.0,5600.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
max,144.0,30.0,300.0,0.001667,1.0,96000.0,45.0,8000.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Handle the categorical 'Age Groups' column by one-hot encoding it into
# integer (1/0) indicator columns named 'Age_Group_<group>'.
# dtype=int makes every generated indicator an integer directly, so the cell
# does not raise a KeyError when one of the bands has no rows, and an
# 'Age_Group_unknown' column (if any row is 'unknown') is not left as boolean.
df = pd.get_dummies(df, columns=['Age Groups'], prefix='Age_Group', dtype=int)

In [13]:
# Binary symptom indicator columns.
symptom_columns = ['fever', 'j_pain', 's_rash', 'headache_', 'abdo_pain', 'vomit', 'bleed']
# 'Symptom Count' = row-wise total of the symptom indicators.
df['Symptom Count'] = df.loc[:, symptom_columns].sum(axis='columns')

In [14]:
# Blood-count feature: a fixed-weight linear combination of three lab values.
platelet_weight = 0.4
hemato_weight = 0.3
wbc_weight = 0.3

# NOTE(review): the three inputs are on very different scales (df.describe()
# reports means of roughly 31522, 31 and 3899), so the platelet term
# dominates the sum -- confirm this weighting is intended.
platelet_part = platelet_weight * df['platelet']
hemato_part = hemato_weight * df['hemato']
wbc_part = wbc_weight * df['wbc']
df['Blood Health'] = platelet_part + hemato_part + wbc_part

In [15]:
# Symptom patterns: each new column is 1 when both listed symptoms are present
# in the same row (bitwise AND of the two indicator columns), otherwise 0.
symptom_pairs = {
    'Fever_Headache': ('fever', 'headache_'),
    'Fever_JointPain': ('fever', 'j_pain'),
    'Fever_SkinRash': ('fever', 's_rash'),
    'Vomit_Abdo_Pain': ('vomit', 'abdo_pain'),
    'Bleed_Headache': ('bleed', 'headache_'),
}
for pattern_name, (first_symptom, second_symptom) in symptom_pairs.items():
    df[pattern_name] = (df[first_symptom] & df[second_symptom]).astype(int)

In [16]:
# 'Antibody_Response' = element-wise sum of the three antibody test columns.
df['Antibody_Response'] = df['ns1_anti'].add(df['igm_anti']).add(df['igg_anti'])

In [17]:
# Binary hospitalisation flag: 1 when 'hospi' equals 1, otherwise 0.
# 'hospi' is a fully populated 0/1 column (df.describe() shows count 500,
# min 0, max 1), so a notnull() check would mark every row as hospitalised and
# produce a constant feature; comparing against 1 keeps the real signal.
# Missing values, should any appear, compare unequal and therefore map to 0.
df['Hospitalized'] = df['hospi'].eq(1).astype(int)

In [18]:
# Binary symptom indicator columns.
symptom_columns = ['fever', 'j_pain', 's_rash', 'headache_', 'abdo_pain', 'vomit', 'bleed']
# Minimum number of symptoms for a row to be flagged.
threshold = 2
symptom_total = df[symptom_columns].sum(axis=1)
# 1 when the row has at least `threshold` symptoms, otherwise 0.
df['Multiple_Symptoms'] = symptom_total.ge(threshold).astype(int)

In [19]:
# Hematocrit-to-platelet ratio (HPR): 'hemato' divided element-wise by 'platelet'.
# NOTE(review): the recorded values (e.g. 40 / 20000 = 0.002) come from the raw
# columns, so this cell presumably has to run before the columns are
# standardised -- confirm the execution order.
hct_column = 'hemato'
platelet_column = 'platelet'
df['HPR'] = df[hct_column].div(df[platelet_column])

In [23]:
# Preview the first three rows after feature engineering.
# NOTE(review): the saved output of this cell (In [23]) already shows
# standardised values, although the scaling cell (In [21]) sits below it in the
# document -- the cells were run out of order; confirm the intended order.
df.head(n=3)

Unnamed: 0,age,wgt,height_(in_meter_),body_mass_index_(bmi),fever,platelet,hemato,wbc,j_pain,s_rash,...,Blood Health,Fever_Headache,Fever_JointPain,Fever_SkinRash,Vomit_Abdo_Pain,Bleed_Headache,Antibody_Response,Hospitalized,Multiple_Symptoms,HPR
0,-0.967218,-1.12669,-1.12669,0.922033,1,-0.568254,1.135859,1.045104,1,0,...,9812.0,1,1,0,1,0,2,1,1,0.002
1,-0.677227,-0.913221,-0.913221,0.643591,1,-0.666892,0.86985,0.796406,0,0,...,8861.4,0,0,0,0,0,1,1,1,0.002111
2,-1.25721,-1.553628,-1.553628,1.618137,1,-0.913488,1.800882,1.194323,1,0,...,7103.5,1,1,0,0,0,1,1,1,0.003462


In [21]:
# Standardise the continuous columns with a StandardScaler and write the
# scaled values back into df.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed in a later cell, so held-out rows contribute to the scaling
# statistics -- consider fitting on the training rows only.
from sklearn.preprocessing import StandardScaler

continuous_features = ['age', 'wgt', 'height_(in_meter_)', 'body_mass_index_(bmi)', 'platelet', 'hemato', 'wbc']
scaler = StandardScaler()
scaler.fit(df[continuous_features])
df[continuous_features] = scaler.transform(df[continuous_features])

In [27]:
# Remove the columns that are not used as model inputs.
# 'sub_id' and 'sex' are listed as well: they hold strings (an identifier and
# the raw category already encoded as sex_F / sex_M), no other visible cell
# removes them, and the classifier fitted later needs numeric inputs.
# errors='ignore' skips labels that are already gone, so the cell can be re-run
# without a KeyError.
# NOTE(review): errors='ignore' also hides typos in this list -- keep the
# names in sync with the feature-engineering cells.
columns_to_drop = [
    'sub_id',
    'sex',
    'Multiple_Symptoms',
    'Hospitalized',
    'Vomit_Abdo_Pain',
    'Fever_Headache',
    'Symptom Count',
    'Age_Group_toddlers',
    'Age_Group_preschoolers',
    'hospi',
    'igg_anti',
    'igm_anti',
    'headache_',
    's_rash',
]
df = df.drop(columns=columns_to_drop, errors='ignore')

In [28]:
# Preview the first three rows of the reduced feature set.
df.head(n=3)

Unnamed: 0,age,wgt,height_(in_meter_),body_mass_index_(bmi),fever,platelet,hemato,wbc,j_pain,abdo_pain,...,sex_F,sex_M,Age_Group_infants,Age_Group_school-age,Blood Health,Fever_JointPain,Fever_SkinRash,Bleed_Headache,Antibody_Response,HPR
0,-0.967218,-1.12669,-1.12669,0.922033,1,-0.568254,1.135859,1.045104,1,1,...,0,1,0,0,9812.0,1,0,0,2,0.002
1,-0.677227,-0.913221,-0.913221,0.643591,1,-0.666892,0.86985,0.796406,0,0,...,0,1,0,0,8861.4,0,0,0,1,0.002111
2,-1.25721,-1.553628,-1.553628,1.618137,1,-0.913488,1.800882,1.194323,1,0,...,1,0,1,0,7103.5,1,0,0,1,0.003462


In [29]:
# Data Splitting:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

# Separate the label column ('target') from the predictor columns.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42
)

# Random forest classifier with 300 trees; n_jobs=-1 lets it use all processors.
random_model = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)

# Fit on the training rows and predict the held-out rows.
random_model.fit(X_train, y_train)
y_pred = random_model.predict(X_test)

# Accuracy on the TRAINING rows, as a percentage with two decimals. This is
# the score on data the model has already seen, not a generalisation estimate.
random_model_accuracy = round(100 * random_model.score(X_train, y_train), 2)
print(random_model_accuracy, '%')

100.0 %


In [32]:
# Accuracy on the held-out TEST rows, as a percentage with two decimals.
# NOTE(review): df.describe() reports a 'target' mean of 0.866, so always
# predicting 1 would already score about 86.6% on the full data -- compare
# against that baseline.
random_model_accuracy = round(100 * random_model.score(X_test, y_test), 2)
print(random_model_accuracy, '%')

96.0 %
