In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('bmh')

## EDA

:)

"SBP" typically stands for "systolic blood pressure," which is the highest pressure reached in the arteries during each heartbeat.

"DBP" typically stands for "diastolic blood pressure," which is the lowest pressure reached in the arteries between heartbeats.

"STAI": State-Trait Anxiety Inventory score

"PM2.5": concentration of fine particulate matter (particles with diameter of 2.5 micrometers or less) in the air

In [64]:
%ls

Habitat Stress Mattia.ipynb         habitat_random_forest_model.joblib
Stress Slepp Mattia.ipynb           habitat_water_EDA.ipynb
anxiety.ipynb                       heart_disease.ipynb
habitat_energy_EDA.ipynb


In [None]:
# Read the Habitat stress dataset from disk and preview its first rows.
csv_path = '../datasets/habitat/Habitat_Stress.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Hypothesis: "Stresslevel" takes values from 1 to 4, with levels 1-2 corresponding to
# Stress_or_not_after_test == "No" and levels 3-4 to Stress_or_not_after_test == "Yes".
# Count both answers among the rows with Stresslevel >= 3 to check it.
after_test_high_stress = df.loc[df["Stresslevel"] >= 3, "Stress_or_not_after_test"]
print("yes:", (after_test_high_stress == "Yes").sum())
print("no:", (after_test_high_stress == "No").sum())
# Apparently this is not the case. Weird, but let's forget about
# Stress_or_not_after_test and focus only on Stresslevel.

In [None]:
# Show the names of the columns from position 16 onwards.
df.columns[16:].values

In [None]:
# Remove three columns by name first.
unused_columns = ["Setting", "Post", "STAI"]
df = df.drop(columns=unused_columns)

# Then discard everything from position 15 onwards, printing the names being removed.
# NOTE(review): this cell is positional and not re-runnable; a second execution
# raises a KeyError because the named columns are already gone.
trailing_columns = df.columns[15:].values
print(trailing_columns)
df = df.drop(columns=trailing_columns)
df.head()

In [None]:
# Descriptive statistics for the columns kept after the drops.
df.describe()

In [None]:
# Shift the stress level from the range [1, 4] to [0, 3] so that it matches the
# stress sleep dataset. Run this cell only once: every execution subtracts 1 again.
df["Stresslevel"] = df["Stresslevel"] - 1

### Mapping non numerical variables

sex

In [None]:
# Distinct raw values of "Sex" before encoding.
df["Sex"].unique()

In [None]:
# Encode "Sex" as an integer flag: Male -> 1, Female -> 0.
# Labels missing from the dictionary become NaN.
sex_codes = {'Male': 1, 'Female': 0}
df["Sex"] = df["Sex"].map(sex_codes)

In [None]:
# Distinct values after encoding; a NaN here would mean a label was not covered by the mapping.
df["Sex"].unique()

ethnic

In [None]:
# Number of rows per "ethnic" category.
df["ethnic"].value_counts()

In [None]:
# One-hot encode "ethnic": the column is replaced by one indicator column per category.
one_hot_columns = ['ethnic']
df = pd.get_dummies(df, columns=one_hot_columns)
df.head()

Healthcondition

In [None]:
# No dummy variables for this column: the categories are ordered, so they are
# assigned increasing integer codes instead. First list the distinct raw labels.
df["Healthcondition"].unique()

In [None]:
# The health condition labels are strings; map them to integers that keep their order
# (Fair < Good < Very good < Excellent). Labels missing from the dictionary become NaN.
health_codes = {'Excellent': 3, 'Very good': 2, 'Good': 1, 'Fair': 0}
df["Healthcondition"] = df["Healthcondition"].map(health_codes)

In [None]:
# Distinct values after the mapping; a NaN here would mean a label was not covered by the dictionary.
df["Healthcondition"].unique()

medicine

In [None]:
# Distinct raw values of "Medicine" before encoding.
df["Medicine"].unique()

In [None]:
# Encode "Medicine" as an integer flag: Yes -> 1, No -> 0.
yes_no_codes = {'Yes': 1, 'No': 0}
df["Medicine"] = df["Medicine"].map(yes_no_codes)

In [None]:
# Distinct values after encoding; a NaN here would mean a label was not covered by the mapping.
df["Medicine"].unique()

sleep

In [None]:
# Encode "Sleep" as an integer flag: Yes -> 1, No -> 0.
yes_no_codes = {'Yes': 1, 'No': 0}
df["Sleep"] = df["Sleep"].map(yes_no_codes)

caffeine beverage

In [None]:
# Encode "Caffinebeverage" as an integer flag: Yes -> 1, No -> 0.
yes_no_codes = {'Yes': 1, 'No': 0}
df["Caffinebeverage"] = df["Caffinebeverage"].map(yes_no_codes)

In [None]:
# Preview the frame after the categorical columns have been encoded.
df.head()

Check for missing values in the dataset


In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Positional selection: the first eight columns plus the column at position 13.
# NOTE(review): position 13 is presumably the "Stresslevel" target, since later cells
# group df_num by that name; confirm if the column layout changes.
selected_positions = [*range(8), 13]
df_num = df.iloc[:, selected_positions]
df_num.head()

Visualize the distribution of the variables using histograms or density plots.

In [None]:
# One histogram per column of df_num, 50 bins each, on a single 20x15 figure.
hist_options = {"bins": 50, "figsize": (20, 15)}
df_num.hist(**hist_options)

plt.show()

Visualize the relationship between the continuous variables and the categorical target variable using boxplots or violin plots

In [None]:
# Box plots for the columns of df_num on a 3x3 grid of independent axes,
# requesting grouping by the target through by='Stresslevel'.
box_options = dict(
    subplots=True,
    layout=(3, 3),
    sharex=False,
    sharey=False,
    figsize=(20, 15),
    by='Stresslevel',
)
df_num.plot(kind='box', **box_options)
plt.show()

Check for any correlations between the continuous variables using a correlation matrix heatmap.

In [None]:
# Pairwise correlations between the columns of df_num, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(12,8))
corr_matrix = df_num.corr()
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()

No significant correlations

## Model Building

Split the data into training and test sets and scale the features.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

print(df.columns)

# Shorten the one-hot column names produced for "ethnic".
df = df.rename(columns={
    'ethnic_Black or African American': 'ethnic_Black',
    'ethnic_No response': 'ethnic_No',
    'ethnic_White/Caucasian': 'ethnic_Caucasian',
})

# Features: the first 20 columns except position 13; target: the column at position 13.
# NOTE(review): positions 13 and 20 are hard-coded; position 13 is presumably
# "Stresslevel" (the target) -- confirm whenever the column layout changes.
X = df.iloc[:, [n for n in range(20) if n != 13]]
y = df.iloc[:, 13]

print(X.columns)

# Split BEFORE scaling so that the scaler never sees the test rows. Fitting the scaler on
# the full dataset would leak test-set minima/maxima into the preprocessing step.
# The row partition is the same as before: it depends only on the row count and the seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)  # learn min/max from the training rows only
X_test = scaler.transform(X_test)        # reuse the training min/max on the test rows

# Keep X as a scaled array, as before, now using the training-fitted scaler.
X = scaler.transform(X)

### Random Forest (best model)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fixed seed: without it the forest (bootstrap samples and feature subsets) differs on
# every run, so the reported metrics and the saved model would not be reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy of random forest:", accuracy_rf)

# Per-class precision, recall and F1.
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)

#### save the model

In [None]:
import joblib

# Persist the fitted random forest where the backend keeps its model data.
# NOTE(review): only `rf` is written; the MinMaxScaler used on the training features is
# not saved here, so consumers must scale their inputs the same way -- confirm.
filename = '../backend/machine_learning/model_data/habitat_random_forest_model.joblib'
joblib.dump(value=rf, filename=filename)

#### load the model

In [None]:
# Load the saved model back from disk.
loaded_model = joblib.load(filename)

# Use the loaded model to make predictions on the held-out test set.
y_pred_loaded = loaded_model.predict(X_test)

# Round-trip check: the reloaded model must reproduce the predictions of the in-memory
# model. Without this the predictions above are computed and never looked at.
assert np.array_equal(y_pred_loaded, y_pred_rf), \
    "Predictions of the reloaded model differ from those of the in-memory model"