In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Read the asthma dataset from the working directory and preview its first rows.
DATA_PATH = 'asthma_disease_data.csv'
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,...,DoctorInCharge,Eczema,HayFever,GastroesophagealReflux,LungFunctionFEV1,LungFunctionFVC,Wheezing,ChestTightness,NighttimeSymptoms,ExerciseInduced
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,...,Dr_Confid,0,0,0,1.369051,4.941206,0,1,0,1
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,...,Dr_Confid,0,0,0,2.197767,1.702393,1,0,1,1
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,...,Dr_Confid,0,1,0,1.698011,5.022553,1,1,1,1
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,...,Dr_Confid,0,1,0,3.032037,2.300159,1,1,1,0
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,...,Dr_Confid,0,1,0,3.470589,3.067944,1,1,0,1


In [None]:
# Print the frame's index range, per-column dtypes, and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   PatientID               2392 non-null   int64  
 1   Age                     2392 non-null   int64  
 2   Gender                  2392 non-null   int64  
 3   Ethnicity               2392 non-null   int64  
 4   EducationLevel          2392 non-null   int64  
 5   BMI                     2392 non-null   float64
 6   Smoking                 2392 non-null   int64  
 7   PhysicalActivity        2392 non-null   float64
 8   DietQuality             2392 non-null   float64
 9   SleepQuality            2392 non-null   float64
 10  PollutionExposure       2392 non-null   float64
 11  PollenExposure          2392 non-null   float64
 12  DustExposure            2392 non-null   float64
 13  PetAllergy              2392 non-null   int64  
 14  FamilyHistoryAsthma     2392 non-null   

In [None]:
# Discard every row that contains at least one missing value, then list the
# columns that remain (dropping rows never removes columns).
data = data.dropna(axis=0, how='any')
print(data.columns)

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'PollutionExposure', 'PollenExposure', 'DustExposure', 'PetAllergy',
       'FamilyHistoryAsthma', 'HistoryOfAllergies', 'ShortnessOfBreath',
       'Coughing', 'Diagnosis', 'DoctorInCharge', 'Eczema', 'HayFever',
       'GastroesophagealReflux', 'LungFunctionFEV1', 'LungFunctionFVC',
       'Wheezing', 'ChestTightness', 'NighttimeSymptoms', 'ExerciseInduced'],
      dtype='object')


In [None]:
target_column = 'Diagnosis'

# PatientID is a per-row identifier (5034..7425 in the saved output), not a
# clinical measurement. Leaving it in the feature matrix lets a tree split on
# it and memorise individual rows, so it is excluded from X.
id_columns = ['PatientID']

# Encode a non-numeric target as integer category codes. Checking for
# "not numeric" (rather than `dtype == 'object'`) also covers category and
# dedicated string dtypes, which do not compare equal to 'object'.
if not pd.api.types.is_numeric_dtype(data[target_column]):
    data[target_column] = data[target_column].astype('category').cat.codes

# errors='ignore' keeps the cell working if the identifier column is absent.
X = data.drop(columns=[target_column, *id_columns], errors='ignore')
y = data[target_column]

In [None]:
# Columns with object dtype (typically strings) are treated as categorical.
categorical_cols = X.select_dtypes(include=['object']).columns

# One-hot encode the categorical columns; every other column is passed
# through unchanged.
encoder_step = ('encoder', OneHotEncoder(), categorical_cols)
ct = ColumnTransformer(transformers=[encoder_step], remainder='passthrough')

# NOTE(review): the transformer is fitted on all rows here, i.e. before the
# train/test split performed in the following cell.
X_transformed = ct.fit_transform(X)

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# target because Diagnosis is heavily imbalanced (mean of about 0.05 in the
# saved describe() output); without stratification the share of positive
# cases in the test set is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=42, stratify=y
)

# NOTE(review): Diagnosis looks like a binary 0/1 label, so a classifier
# (e.g. DecisionTreeClassifier) would be the more natural estimator; the
# regressor is kept here so downstream cells keep working unchanged.
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out rows with the fitted tree.
y_pred = regressor.predict(X_test)

In [None]:
# Error of the tree on the held-out rows. When every label and prediction is
# exactly 0 or 1, squared and absolute errors coincide, so the two numbers
# can legitimately be equal.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

# Reference point: the error obtained by always predicting the most frequent
# training label. A model that does not beat this has learned nothing useful.
baseline_value = y_train.mode().iloc[0]
baseline_mae = mean_absolute_error(y_test, [baseline_value] * len(y_test))
print(f"Baseline (most frequent label) MAE: {baseline_mae}")

Mean Squared Error: 0.05219206680584551
0.05219206680584551


In [None]:
# Summary statistics for the numeric columns of the full frame, target included.
data.describe()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,...,Diagnosis,Eczema,HayFever,GastroesophagealReflux,LungFunctionFEV1,LungFunctionFVC,Wheezing,ChestTightness,NighttimeSymptoms,ExerciseInduced
count,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,...,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0
mean,6229.5,42.13796,0.493311,0.669732,1.307274,27.244877,0.141722,5.051786,5.022867,7.019012,...,0.051839,0.19189,0.253344,0.158027,2.548564,3.74127,0.596154,0.503344,0.602425,0.604933
std,690.655244,21.606655,0.50006,0.98612,0.898242,7.201628,0.348838,2.903574,2.90998,1.732475,...,0.221749,0.393869,0.435017,0.364842,0.861809,1.303689,0.49077,0.500093,0.489499,0.488967
min,5034.0,5.0,0.0,0.0,0.0,15.031803,0.0,0.00174,0.003031,4.001437,...,0.0,0.0,0.0,0.0,1.000459,1.500045,0.0,0.0,0.0,0.0
25%,5631.75,23.0,0.0,0.0,1.0,20.968313,0.0,2.578333,2.432043,5.4985,...,0.0,0.0,0.0,0.0,1.824113,2.607489,0.0,0.0,0.0,0.0
50%,6229.5,42.0,0.0,0.0,1.0,27.052202,0.0,5.016881,5.115383,6.975839,...,0.0,0.0,0.0,0.0,2.553244,3.734982,1.0,1.0,1.0,1.0
75%,6827.25,61.0,1.0,1.0,2.0,33.555903,0.0,7.540234,7.544216,8.52695,...,0.0,0.0,1.0,0.0,3.292897,4.864121,1.0,1.0,1.0,1.0
max,7425.0,79.0,1.0,3.0,3.0,39.985611,1.0,9.995809,9.999904,9.996235,...,1.0,1.0,1.0,1.0,3.999719,5.999421,1.0,1.0,1.0,1.0


In [None]:
# Show the column labels of `data`.
data.columns

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'PollutionExposure', 'PollenExposure', 'DustExposure', 'PetAllergy',
       'FamilyHistoryAsthma', 'HistoryOfAllergies', 'ShortnessOfBreath',
       'Coughing', 'Diagnosis', 'DoctorInCharge', 'Eczema', 'HayFever',
       'GastroesophagealReflux', 'LungFunctionFEV1', 'LungFunctionFVC',
       'Wheezing', 'ChestTightness', 'NighttimeSymptoms', 'ExerciseInduced'],
      dtype='object')

In [None]:
# Summary statistics for the numeric columns of the feature frame `X`
# (the target column has already been dropped from it).
X.describe()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,...,Coughing,Eczema,HayFever,GastroesophagealReflux,LungFunctionFEV1,LungFunctionFVC,Wheezing,ChestTightness,NighttimeSymptoms,ExerciseInduced
count,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,...,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0
mean,6229.5,42.13796,0.493311,0.669732,1.307274,27.244877,0.141722,5.051786,5.022867,7.019012,...,0.503344,0.19189,0.253344,0.158027,2.548564,3.74127,0.596154,0.503344,0.602425,0.604933
std,690.655244,21.606655,0.50006,0.98612,0.898242,7.201628,0.348838,2.903574,2.90998,1.732475,...,0.500093,0.393869,0.435017,0.364842,0.861809,1.303689,0.49077,0.500093,0.489499,0.488967
min,5034.0,5.0,0.0,0.0,0.0,15.031803,0.0,0.00174,0.003031,4.001437,...,0.0,0.0,0.0,0.0,1.000459,1.500045,0.0,0.0,0.0,0.0
25%,5631.75,23.0,0.0,0.0,1.0,20.968313,0.0,2.578333,2.432043,5.4985,...,0.0,0.0,0.0,0.0,1.824113,2.607489,0.0,0.0,0.0,0.0
50%,6229.5,42.0,0.0,0.0,1.0,27.052202,0.0,5.016881,5.115383,6.975839,...,1.0,0.0,0.0,0.0,2.553244,3.734982,1.0,1.0,1.0,1.0
75%,6827.25,61.0,1.0,1.0,2.0,33.555903,0.0,7.540234,7.544216,8.52695,...,1.0,0.0,1.0,0.0,3.292897,4.864121,1.0,1.0,1.0,1.0
max,7425.0,79.0,1.0,3.0,3.0,39.985611,1.0,9.995809,9.999904,9.996235,...,1.0,1.0,1.0,1.0,3.999719,5.999421,1.0,1.0,1.0,1.0


In [None]:
# Bind the fitted tree to the name `final_model` (same object, no copy is made).
final_model = regressor