# INF367A Applied Machine Learning - Group project

* Elias Hovdenes
* Isak Pall Gestsson
* Magnus Sponnich Brørby

## Exploring Mental Health Data - Competition

### Exploring the dataset:

In [2]:

import matplotlib.pyplot as plt
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split,HalvingGridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler , OneHotEncoder
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

import torch.nn as nn
import dynamic_selection as ds
import torch.nn.functional as F

import numpy as np
import pandas as pd
from scipy import sparse
from dynamic_selection import MaskingPretrainer, GreedyDynamicSelection
from torch.distributions import Distribution
Distribution.set_default_validate_args(False)
#It seems there is an edge case roundoff error where the different values in the tensor sum to a little bit over 1



i prepare data kombineres academic og work pressure til en kolonne pressure. i tillegg finner den kategoriske og numeriske features og lage en preprocessing pipeline for hver av de. for de numeriske verdiene så imputer den missing verdier med gjennomsnittet av den aktuelle kolonnen og så scaler dataene. For de kategoriske verdiene så imputer den de som intreffer mest og så one hot encoder de.

In [3]:


def prepare_data(dataset,preprocessor = None):

    X = pd.DataFrame(dataset)
     

    X = dataset.drop('Depression', axis=1, errors='ignore')

    if 'Depression' in dataset.columns:
        y = dataset['Depression']
    else:
        y = None

    # setter 0 for missing verdier og så summer sammen A og W pressure i en pressure kolonne
    X.fillna({'Academic Pressure': 0}, inplace=True)
    X.fillna({'Work Pressure': 0}, inplace=True)

    X['Pressure'] = X['Academic Pressure'] + X['Work Pressure']

    # dropper 
    X.drop(['Academic Pressure', 'Work Pressure'], axis=1, inplace=True)
    X.drop(['id'],axis=1,inplace=True)

    
    #name_counts = X['Name'].value_counts().to_dict()
    #X['NameCount'] = X['Name'].map(name_counts)


    categorical_features = ['Profession', 'Degree','Name','City','Gender', 'Working Professional or Student', 'Sleep Duration', 'Dietary Habits', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
    numerical_features = X.drop(categorical_features, axis=1).columns

    if preprocessor is None:
       
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())])
        
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))])
        
        # Kombiner preprocessings.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)])
        
        # Tilpass og transformer X
        X_preprocessed = preprocessor.fit_transform(X)
    else:
        # Transform data basert på eksisterende preprocessor
        X_preprocessed = preprocessor.transform(X)

   

    return X_preprocessed, y, preprocessor




    

In [4]:



file_path = "Data/train.csv"
file_path_test = "Data/test.csv"

trainingSet = pd.read_csv(file_path)
X_test = pd.read_csv(file_path_test)

test_ids = X_test['id']
  
df_train, df_val = train_test_split(trainingSet, test_size=0.2, random_state=42)

# Preprocess treningsdata
X_train_preprocessed, y_train, preprocessor = prepare_data(df_train)

# Preprocess valideringsdata
X_val_preprocessed, y_val, _ = prepare_data(df_val, preprocessor=preprocessor)

# Preprocess testdata
X_test_preprocessed, _, _ = prepare_data(X_test, preprocessor=preprocessor)



input_size = X_train_preprocessed.shape[1]
print(f'Input size er: {input_size}')
print(X_train_preprocessed)





Input size er: 709
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1891157 stored elements and shape (112560, 709)>
  Coords	Values
  (0, 0)	1.1781079788457047
  (0, 1)	-1.3608581883143087e-15
  (0, 2)	2.200024066726753e-15
  (0, 3)	1.601307257907144
  (0, 4)	-0.584280048908395
  (0, 5)	-0.7004473126135766
  (0, 6)	-1.4444817551693707
  (0, 58)	1.0
  (0, 84)	1.0
  (0, 444)	1.0
  (0, 564)	1.0
  (0, 648)	1.0
  (0, 650)	1.0
  (0, 679)	1.0
  (0, 702)	1.0
  (0, 705)	1.0
  (0, 708)	1.0
  (1, 0)	1.0973739427544427
  (1, 1)	-1.3608581883143087e-15
  (1, 2)	2.200024066726753e-15
  (1, 3)	0.8116324006532557
  (1, 4)	1.2319167655524366
  (1, 5)	0.7169565746358945
  (1, 6)	1.4069890684340032
  (1, 58)	1.0
  :	:
  (112558, 458)	1.0
  (112558, 627)	1.0
  (112558, 647)	1.0
  (112558, 650)	1.0
  (112558, 669)	1.0
  (112558, 702)	1.0
  (112558, 705)	1.0
  (112558, 707)	1.0
  (112559, 0)	1.258842014936967
  (112559, 1)	-1.3608581883143087e-15
  (112559, 2)	2.200024066726753e-15
  (112559, 

Data Preprocessing

Modell

In [5]:
model = RandomForestClassifier(n_estimators= 100, criterion="gini", max_depth= 40, max_features="sqrt",random_state=42)
model.fit(X_train_preprocessed,y_train)

y_prediction = model.predict(X_val_preprocessed)


print(accuracy_score(y_val,y_prediction))



y_test_predict = model.predict(X_test_preprocessed)


#output = pd.DataFrame({'id': processed_test["id"], 'Depression': y_test_predict})
#output.to_csv('test_predictions.csv', index=False)


KeyboardInterrupt: 

Hyperparameter tuning RF


In [5]:
param_grid = {
    'n_estimators': [20,50,100,200],
    'criterion': ['entropy', 'gini'],
    'max_depth': [5, 10, 20 , 40],
    'max_features': ["sqrt", "log2"],
}


half_grid_search = HalvingGridSearchCV(estimator=model,param_grid=param_grid, factor=3, scoring='accuracy',cv=5)

half_grid_search.fit(X_train_preprocessed,y_train)

print(f"Beste parametere: {half_grid_search.best_params_}")


val_accuracy = half_grid_search.score(X_val_preprocessed,y_val)

print(f"Nøyaktighet på validation settet: {val_accuracy}")

KeyboardInterrupt: 

Konvertere til Tensorer

In [6]:

X_train_preprocessed = X_train_preprocessed.toarray() if sparse.issparse(X_train_preprocessed) else X_train_preprocessed
X_val_preprocessed = X_val_preprocessed.toarray() if sparse.issparse(X_val_preprocessed) else X_val_preprocessed
X_test_preprocessed = X_test_preprocessed.toarray() if sparse.issparse(X_test_preprocessed) else X_test_preprocessed



X_train_tensor = torch.tensor(X_train_preprocessed, dtype=torch.float32)
train_mean = X_train_tensor.mean(dim=0)
X_train_tensor = (X_train_tensor - train_mean)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.long).view(-1, 1)
y_train_tensor_1D = y_train_tensor.squeeze()


X_val_tensor = torch.tensor(X_val_preprocessed, dtype=torch.float32)
val_mean = X_val_tensor.mean(dim=0)
X_val_tensor = (X_val_tensor - val_mean)

y_val_tensor = torch.tensor(y_val.values, dtype=torch.long).view(-1, 1)
y_val_tensor_1D = y_val_tensor.squeeze()

X_test_tensor = torch.tensor(X_test_preprocessed, dtype=torch.float32)
test_mean = X_test_tensor.mean(dim=0)
X_test_tensor = (X_test_tensor - test_mean)





# Lag DataLoaders
batch_size = 128
train_dataset = TensorDataset(X_train_tensor, y_train_tensor_1D)
print(train_dataset[2])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,drop_last=True)


val_dataset = TensorDataset(X_val_tensor, y_val_tensor_1D)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

(tensor([ 6.1297e-01, -7.7948e-10, -9.5571e-09,  2.1958e-02, -8.4374e-01,
        -1.4091e+00,  6.9412e-01, -3.5537e-05, -9.5238e-03, -8.8842e-06,
        -3.1397e-02, -8.8842e-06, -8.8842e-06, -8.8842e-06, -1.7768e-05,
        -8.8842e-06, -2.2486e-02, -2.0247e-02, -2.1047e-02, -8.8842e-06,
        -1.0483e-02, -3.0277e-02, -5.5499e-02, -1.4677e-02, -1.6933e-02,
        -8.8842e-06, -9.6660e-03, -2.3339e-02, -2.0327e-02, -1.0999e-02,
        -2.1109e-02, -8.8842e-06, -9.1773e-03, -1.0830e-02, -8.2356e-03,
        -2.8616e-02, -2.9229e-03, -1.2154e-02, -8.8842e-06, -1.5965e-02,
        -8.8842e-06, -8.8842e-06, -1.2385e-02, -1.3824e-02, -8.2800e-03,
        -8.8842e-06, -8.8842e-06, -8.8842e-06, -1.7768e-05, -2.7754e-02,
        -1.3726e-02, -1.2385e-02, -1.7768e-05, -4.8241e-03, -1.6596e-02,
        -1.2491e-02, -1.0652e-02, -3.5537e-05,  5.6262e-01, -1.3104e-02,
        -1.0368e-02, -2.6652e-05, -8.8842e-06, -8.8842e-06, -2.6652e-05,
        -8.8842e-06, -8.8842e-06, -8.8842e-06, -8.

The predictor and selector

In [14]:
input_size = 709
output_size = 2 
dropout_rate = 0.4
hidden_size = 64

predicting_net= nn.Sequential(
    nn.Linear(2*input_size,hidden_size),
    nn.ReLU(),
    nn.Dropout(dropout_rate),
    nn.Linear(hidden_size,hidden_size),
    nn.ReLU(),
    nn.Dropout(dropout_rate),
    nn.Linear(hidden_size,output_size))

selecting_net = nn.Sequential(
    nn.Linear(2 * input_size, hidden_size),
    nn.ReLU(),
    nn.Dropout(dropout_rate),
    nn.Linear(hidden_size, hidden_size),
    nn.ReLU(),
    nn.Dropout(dropout_rate),
    nn.Linear(hidden_size, input_size ))

vanilla_net = nn.Sequential(
    nn.Linear(input_size,hidden_size),
    nn.ReLU(),
    nn.Dropout(dropout_rate),
    nn.Linear(hidden_size,hidden_size),
    nn.ReLU(),
    nn.Dropout(dropout_rate),
    nn.Linear(hidden_size,output_size)
)
    


total_params = sum(p.numel() for p in vanilla_net.parameters())
print(f'Totalt antall parametere: {total_params}')



Totalt antall parametere: 49730


Training of vanilla network

In [26]:


model = vanilla_net
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Treningssløyfe
num_epochs = 30
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    # skriv average loss
    
    # Utskrift av tap etter hver epoch for å overvåke treningen
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Evaluer modellen på valideringssett
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in val_loader:
        outputs = model(inputs)
        predicted = torch.argmax(outputs, dim=1).float()  # Output form: [batch_size]

        total = labels.size(0)
        correct = (predicted == labels).sum().item()
        
        accuracy = correct / total
    print(f'Valideringssett nøyaktighet: {accuracy * 100:.2f}%')


with torch.no_grad():
    test_predictions = model(X_test_tensor)
    probabilities = torch.sigmoid(test_predictions)

    predicted_classes_vanilla = torch.argmax(test_predictions, dim=1)

output = pd.DataFrame({'id': test_ids, 'Depression': predicted_classes_vanilla.squeeze().int().numpy()})
output.to_csv('NN_prediksjoner.csv', index=False)
print("Testprediksjoner er lagret i submission.csv")


Epoch 1/30, Loss: 0.09229075163602829
Epoch 2/30, Loss: 0.1486276388168335
Epoch 3/30, Loss: 0.10485803335905075
Epoch 4/30, Loss: 0.12454139441251755
Epoch 5/30, Loss: 0.14215493202209473
Epoch 6/30, Loss: 0.07805419713258743
Epoch 7/30, Loss: 0.11141891777515411
Epoch 8/30, Loss: 0.14064852893352509
Epoch 9/30, Loss: 0.1390170156955719
Epoch 10/30, Loss: 0.05337481573224068
Epoch 11/30, Loss: 0.07440768927335739
Epoch 12/30, Loss: 0.11202900856733322
Epoch 13/30, Loss: 0.12533758580684662
Epoch 14/30, Loss: 0.14646399021148682
Epoch 15/30, Loss: 0.10467162728309631
Epoch 16/30, Loss: 0.06769869476556778
Epoch 17/30, Loss: 0.1251729428768158
Epoch 18/30, Loss: 0.15369214117527008
Epoch 19/30, Loss: 0.1674322485923767
Epoch 20/30, Loss: 0.10187596827745438
Epoch 21/30, Loss: 0.08427682518959045
Epoch 22/30, Loss: 0.08991552144289017
Epoch 23/30, Loss: 0.15172234177589417
Epoch 24/30, Loss: 0.060691285878419876
Epoch 25/30, Loss: 0.11675862967967987
Epoch 26/30, Loss: 0.1025868058204650

Pretraining of the predictor

In [8]:
mask_layer = ds.utils.MaskLayer(append=True)
pretrain = MaskingPretrainer(predicting_net, mask_layer)
pretrain.fit(
    train_loader,
    val_loader,
    lr=1e-3,
    nepochs=100,
    loss_fn=nn.CrossEntropyLoss(),
    verbose=True)



--------Epoch 1--------
Val loss = 0.2911

--------Epoch 2--------
Val loss = 0.3117

--------Epoch 3--------
Val loss = 0.2882

--------Epoch 4--------
Val loss = 0.2748

--------Epoch 5--------
Val loss = 0.2788

--------Epoch 6--------
Val loss = 0.2795

--------Epoch 7--------
Val loss = 0.2932

--------Epoch 8--------
Val loss = 0.2754

Stopping early at epoch 8


Joint training

In [9]:
gdfs = GreedyDynamicSelection(selecting_net, predicting_net, mask_layer)
gdfs.fit(
    train_loader,
    val_loader,
    lr=1e-3,
    nepochs=3,
    max_features=35,
    loss_fn=nn.CrossEntropyLoss(),
    verbose=True)
    



Starting training with temp = 1.0000

--------Epoch 1 (1 total)--------
Val loss = 0.1657, Zero-temp loss = 0.1672

--------Epoch 2 (2 total)--------
Val loss = 0.1649, Zero-temp loss = 0.1671

--------Epoch 3 (3 total)--------
Val loss = 0.1674, Zero-temp loss = 0.1699

Stopping temp = 1.0000 at epoch 3

Starting training with temp = 0.5623

--------Epoch 1 (4 total)--------
Val loss = 0.1686, Zero-temp loss = 0.1697

--------Epoch 2 (5 total)--------
Val loss = 0.1683, Zero-temp loss = 0.1692

--------Epoch 3 (6 total)--------
Val loss = 0.1715, Zero-temp loss = 0.1725

Stopping temp = 0.5623 at epoch 3

Starting training with temp = 0.3162

--------Epoch 1 (7 total)--------
Val loss = 0.1710, Zero-temp loss = 0.1713

--------Epoch 2 (8 total)--------
Val loss = 0.1729, Zero-temp loss = 0.1731

--------Epoch 3 (9 total)--------
Val loss = 0.1813, Zero-temp loss = 0.1813

Stopping temp = 0.3162 at epoch 3

Starting training with temp = 0.1778

--------Epoch 1 (10 total)--------
Val lo

Predictions


In [10]:
test_predictions,mask,m   = gdfs.forward(X_test_tensor,max_features =30)

probabilities = torch.sigmoid(test_predictions)

predicted_classes = torch.argmax(test_predictions, dim=1)


binary_predictions = (probabilities >= 0.5).float()

print(binary_predictions)
print(predicted_classes)


tensor([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [0., 1.],
        [1., 0.]])
tensor([0, 0, 0,  ..., 0, 1, 0])


Evaluation

In [None]:
probabilities = torch.sigmoid(test_predictions)

predicted_classes = torch.argmax(test_predictions, dim=1)







tensor([0, 0, 0,  ..., 0, 1, 0])


In [27]:

output = pd.DataFrame({'id': test_ids, 'Depression': predicted_classes.squeeze().int().numpy()})
output.to_csv('Greedy_prediksjoner.csv', index=False)
print("Testprediksjoner er lagret i submission.csv")

Testprediksjoner er lagret i submission.csv
