<a href="https://colab.research.google.com/github/Davidsb04/disease_risk_detector/blob/main/Disease_risk_from_daily_habits.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Disease risk from daily habits

#### Base de dados utilizada: https://www.kaggle.com/datasets/mahdimashayekhi/disease-risk-from-daily-habits

###🔢 Numerical Features
|Features|Feature	Description|
|------------------|-------------------|
|age|Age of the individual|
|bmi|Body Mass Index|
|blood_pressure|Systolic blood pressure (mm Hg)|
|cholesterol|Cholesterol level (mg/dL)|
|heart_rate|Resting heart rate (bpm)|
|glucose|Blood glucose level|
|insulin|Blood insulin level|
|calorie_intake|Daily average calorie consumption|
|sugar_intake|Daily sugar intake (grams)|
|screen_time|Daily screen time (hours)|
|stress_level|Self-reported stress level (0-10 scale)|
|mental_health_score|Self-reported mental well-being score (0-10 scale)|
|training_hours|Weekly training/exercise hours|


###🧩 Categorical Features
|Features|Feature	Description|
|------------------|-------------------|
|gender|Male / Female|
|marital_status|Single, Married, Divorced, Widowed|
|diet_type|Vegan, Vegetarian, Omnivore, Keto, Paleo|
|occupation|Job type or employment status|
|sleep_quality|Subjective sleep quality|
|mental_health_support|Access to mental health resources|
|exercise_type|None, Cardio, Strength, Mixed|
|device_usage|Device usage level|
|healthcare_access|Ease of access to healthcare|
|insurance|Has health insurance or not|
|family_history|Family history of disease|
|sunlight_exposure|Daily sunlight exposure (Low/Med/High)|
|pet_owner|Owns pets (Yes/No)|
|caffeine_intake|Caffeine consumption level|
|meals_per_day|Number of meals consumed per day|

## Importando e Tratando os Dados

In [None]:
import pandas as pd

In [None]:
base = pd.read_csv('health_lifestyle_classification.csv')

In [None]:
base.head()

Unnamed: 0,survey_code,age,gender,height,weight,bmi,bmi_estimated,bmi_scaled,bmi_corrected,waist_size,...,sunlight_exposure,meals_per_day,caffeine_intake,family_history,pet_owner,electrolyte_level,gene_marker_flag,environmental_risk_score,daily_supplement_dosage,target
0,1,56,Male,173.416872,56.88664,18.915925,18.915925,56.747776,18.989117,72.16513,...,High,5.0,Moderate,No,Yes,0.0,1.0,5.5,-2.275502,healthy
1,2,69,Female,163.20738,97.799859,36.716278,36.716278,110.148833,36.511417,85.598889,...,High,5.0,High,Yes,No,0.0,1.0,5.5,6.23934,healthy
2,3,46,Male,177.281966,80.687562,25.67305,25.67305,77.019151,25.587429,90.29503,...,High,4.0,Moderate,No,No,0.0,1.0,5.5,5.423737,healthy
3,4,32,Female,172.101255,63.142868,21.31848,21.31848,63.95544,21.177109,100.504211,...,High,1.0,,No,Yes,0.0,1.0,5.5,8.388611,healthy
4,5,60,Female,163.608816,40.0,14.943302,14.943302,44.829907,14.844299,69.02115,...,High,1.0,High,Yes,Yes,0.0,1.0,5.5,0.332622,healthy


In [None]:
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41583 entries, 0 to 41582
Data columns (total 48 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   survey_code               41583 non-null  int64  
 1   age                       41583 non-null  int64  
 2   gender                    41583 non-null  object 
 3   height                    41583 non-null  float64
 4   weight                    41583 non-null  float64
 5   bmi                       41583 non-null  float64
 6   bmi_estimated             41583 non-null  float64
 7   bmi_scaled                41583 non-null  float64
 8   bmi_corrected             41583 non-null  float64
 9   waist_size                41583 non-null  float64
 10  blood_pressure            38360 non-null  float64
 11  heart_rate                35719 non-null  float64
 12  cholesterol               41582 non-null  float64
 13  glucose                   41582 non-null  float64
 14  insuli

In [None]:
for i in base.columns[1:48].to_list():
  if base.dtypes[i] == "object":
    base[i] = base[i].astype('category')


In [None]:
categorical_variables = []
for i in base.columns[1:47].to_list():
  if base.dtypes[i] == 'category':
    categorical_variables.append(i)

In [None]:
len(categorical_variables)

17

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
lb = LabelEncoder()
for var in categorical_variables:
  base[var] = lb.fit_transform(base[var])

## Tratando Valores Nulos

In [None]:
def treat_null_values():
  missing_data = base.isna().sum()

  missing_columns = []
  for index, values in missing_data.items():
    if values != 0:
      missing_columns.append(index)


  null_values = 0
  for i in missing_data:
    if i != 0:
      null_values = null_values + i

  rows = base.shape[0]
  perc_missing_values = null_values / rows * 100

  if perc_missing_values < 5:
    base.dropna(inplace=True)
  else:
    for i in missing_columns:
      base[i] = base[i].fillna((base[i].median()))

In [None]:
treat_null_values()

In [None]:
base.isna().sum()

Unnamed: 0,0
survey_code,0
age,0
gender,0
height,0
weight,0
bmi,0
bmi_estimated,0
bmi_scaled,0
bmi_corrected,0
waist_size,0


## Balanceando os Dados para Treino dos Modelos

In [None]:
PREDICTORS = base.iloc[:, 1:47]
TARGET = base.iloc[:, 47]

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
seed = 100

balancer = SMOTE(random_state = seed)

PREDICTORS_RES, TARGET_RES = balancer.fit_resample(PREDICTORS, TARGET)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(PREDICTORS_RES, TARGET_RES, test_size=0.33, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler

normalizer = MinMaxScaler()
X_train_standardized = normalizer.fit_transform(X_train)
X_test_standardized = normalizer.transform(X_test)

## Padronizando e Normalizando os Dados

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Treinando e Avaliando Modelo LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(class_weight='balanced',random_state=0)

In [None]:
clf = clf.fit(X_train_scaled, y_train)

In [None]:
y_pred = clf.predict(X_test_scaled)

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.6731555785658815

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[6509, 3260],
       [3053, 6493]])

In [None]:
from sklearn.metrics import recall_score
recall_score(y_test, y_pred, pos_label='diseased')

0.66629132971645

## Treinando e Avaliando Modelo RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(n_estimators=300, criterion='entropy', max_depth=10, max_features='sqrt', min_samples_leaf=2, min_samples_split=2, n_jobs=8)

In [None]:
clf = clf.fit(X_train_standardized, y_train)

In [None]:
y_pred = clf.predict(X_test_scaled)

In [None]:
accuracy_score(y_test, y_pred)

0.6376909137975667

In [None]:
confusion_matrix(y_test, y_pred)

array([[3169, 6600],
       [ 398, 9148]])

In [None]:
recall_score(y_test, y_pred, pos_label='diseased')

0.32439348960999076