### Loading packages and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures
from sklearn.dummy import DummyRegressor

import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Columns read from the processed CSV files. Names that are commented out
# are not loaded.
columns = [
    # year
    # demographics
    'age',
    'gender',
    # 'sex_birth',
    # 'sexual_pref',
    'race',
    # background
    'fincur',
    'gpa_sr',
    'dependents',
    'alc_any',
    # other indicators
    'exerc',
    # 'degree',
    # educ parents
    'educ_par1',
    'educ_par2',
    # target variables are last
    'flourish',
    'anx_score',
    'deprawsc',
    ]

targets = ['flourish','anx_score','deprawsc']
targets_names = ['Flourish score','Anxiety score','Depression score']

# The two parent-education columns are collapsed into a single
# 'educ_par_max' feature during preprocessing (see _prepare_frame).
parent_educ_cols = ['educ_par1', 'educ_par2']

# Model inputs: every loaded column that is neither a target nor a raw
# parent-education column, plus the derived 'educ_par_max'.
features = [col for col in columns if col not in targets + parent_educ_cols] + ['educ_par_max']
cat_features = ['gender','race']
monotone_features = [feat for feat in features if feat not in cat_features]

# Numeric code -> label mappings applied to the categorical columns.
dict_race = {1 : 'black' , 2 : 'ainaan' , 3 : 'asian' , 4 : 'his' , 5 : 'pi' , 6 : 'mides' , 7 : 'white'}
dict_gender = {1: 'female', 2: 'male', 3: 'trans', 4: 'non_binary'}


def _load_frame(path):
    """Read `columns` from the CSV at `path`, in the order given by `columns`."""
    return pd.read_csv(path, usecols = columns)[columns]


def _prepare_frame(df):
    """Return a new frame with the derived feature and recoded categories.

    * adds 'educ_par_max', the row-wise maximum of the two parent-education
      columns;
    * drops the two raw parent-education columns;
    * replaces the numeric 'race' and 'gender' codes with their labels.

    The input frame is not modified.
    """
    return df.drop(columns=parent_educ_cols).assign(
        educ_par_max=df[parent_educ_cols].max(axis=1),
        race=df['race'].replace(dict_race),
        gender=df['gender'].replace(dict_gender),
    )


# ignore_index=True gives the combined training frame a unique 0..n-1 index
# instead of repeating the row labels of the two source files.
df_train = pd.concat(
    [
        _load_frame('../../data/processed/all_data_train.csv'),
        _load_frame('../../data/processed/all_data_additional_train.csv'),
    ],
    ignore_index=True,
)
df_test = _load_frame('../../data/processed/all_data_validation.csv')

# DataFrame.drop returns a new frame, so the prepared result has to be
# assigned back; otherwise the raw parent-education columns stay in place.
df_train = _prepare_frame(df_train)
df_test = _prepare_frame(df_test)

def var_ols(feat):
    """Return the formula term for `feat`.

    Features listed in `cat_features` are wrapped as ``C(<name>)`` so the
    formula treats them as categorical; any other feature name is returned
    unchanged.
    """
    return f'C({feat})' if feat in cat_features else feat

### Baseline: dummy regressor

In [None]:
# Baseline: a DummyRegressor that always predicts the training mean of the
# target. Rows whose target is missing are excluded from both fit and scoring.
for target in targets:
    mask_train = df_train[target].notna()
    mask_test = df_test[target].notna()

    X_train = df_train.loc[mask_train, features]
    y_train = df_train.loc[mask_train, target]
    X_test = df_test.loc[mask_test, features]
    y_test = df_test.loc[mask_test, target]

    dummy_regr = DummyRegressor(strategy="mean").fit(X_train, y_train)
    dummy_pred = dummy_regr.predict(X_test)

    dummy_mse = mse(y_test, dummy_pred)
    print(f'MSE of dummy model for {target}:\t{dummy_mse:.4f}')


### Multiple linear regression (OLS)

In [None]:
# Fit one OLS model per target on the training rows where that target is
# observed, then report the mean squared error on the validation rows.

# The right-hand side of the formula is identical for every target, so it is
# built once. Building it outside the f-string also avoids reusing the
# enclosing quote character inside a replacement field, which is a
# SyntaxError on Python versions before 3.12.
ols_rhs = ' + '.join(var_ols(feat) for feat in features)

for target in targets:
    model_specification = f'{target} ~ {ols_rhs}'

    train_rows = df_train[df_train[target].notna()]
    test_rows = df_test[df_test[target].notna()]

    ols_model = ols(model_specification, data = train_rows).fit()
    ols_pred = ols_model.predict(test_rows[features])

    # Predictions can be NaN (presumably for rows with missing predictors);
    # those rows are left out of the error computation.
    mask = ols_pred.notna()
    y_val = test_rows[target]
    ols_mse = mse(y_val[mask],ols_pred[mask])
    print(f'MSE of ols model for {target}:\t{ols_mse:.4f}')

In [None]:
# Turn each OLS regression into a binary "unwell" classifier by thresholding
# at the training median of the target, then report the confusion matrix and
# accuracy / precision / recall / F1 on the validation rows.

# Built once outside the loop; this also avoids reusing the enclosing quote
# character inside an f-string replacement field, which is a SyntaxError on
# Python versions before 3.12.
ols_rhs = ' + '.join(var_ols(feat) for feat in features)

for target in targets:
    threshold = df_train[target].median()
    model_specification = f'{target} ~ {ols_rhs}'

    train_rows = df_train[df_train[target].notna()]
    test_rows = df_test[df_test[target].notna()]

    ols_model = ols(model_specification, data = train_rows).fit()
    ols_pred = ols_model.predict(test_rows[features])

    # Keep only the rows for which a prediction is available.
    mask = ols_pred.notna()
    y_val = test_rows[target].copy()

    ols_pred = ols_pred[mask]
    y_val = y_val[mask]

    # "Unwell" (label 1) means at or below the threshold for 'flourish' and
    # strictly above the threshold for the other targets.
    if target == 'flourish':
        preds_unwell = (ols_pred <= threshold).astype(int)
        y_val_unwell = (y_val <= threshold).astype(int)
    else:
        preds_unwell = (ols_pred > threshold).astype(int)
        y_val_unwell = (y_val > threshold).astype(int)

    # Fix the label set so the matrix is always 2x2. Without it, a split in
    # which only one class occurs yields a 1x1 matrix and the indexing below
    # raises IndexError.
    cm = confusion_matrix(y_val_unwell, preds_unwell, labels=[0, 1])

    # Rows are true labels, columns are predicted labels.
    TN = cm[0, 0]
    FP = cm[0, 1]
    FN = cm[1, 0]
    TP = cm[1, 1]

    # The counts are NumPy integers, so a zero denominator produces NaN
    # (with a RuntimeWarning) rather than raising.
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1 = 2 * (precision * recall) / (precision + recall)

    # Create a display object
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)

    # Plot the matrix
    cm_display.plot(cmap=plt.cm.Blues)
    plt.title(f"Confusion Matrix for {target}")
    plt.show()

    print(f'[{accuracy:.4f},{precision:.4f},{recall:.4f},{f1:.4f}]')