# INF367A Applied Machine Learning - Group project

* Elias Hovdenes
* Isak Pall Gestsson
* Magnus Sponnich Brørby

## Exploring Mental Health Data - Competition

### Exploring the dataset:

In [1]:
import matplotlib.pyplot as plt
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split,HalvingGridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import numpy as np
import pandas as pd
from scipy import sparse



i prepare data kombineres academic og work pressure til en kolonne pressure. i tillegg finner den kategoriske og numeriske features og lage en preprocessing pipeline for hver av de. for de numeriske verdiene så imputer den missing verdier med gjennomsnittet av den aktuelle kolonnen og så scaler dataene. For de kategoriske verdiene så imputer den de som intreffer mest og så one hot encoder de.

In [2]:


def prepare_data(dataset,preprocessor = None):

    X = pd.DataFrame(dataset)
     

    X = dataset.drop('Depression', axis=1, errors='ignore')

    if 'Depression' in dataset.columns:
        y = dataset['Depression']
    else:
        y = None

    # setter 0 for missing verdier og så summer sammen A og W pressure i en pressure kolonne
    X.fillna({'Academic Pressure': 0}, inplace=True)
    X.fillna({'Work Pressure': 0}, inplace=True)

    X['Pressure'] = X['Academic Pressure'] + X['Work Pressure']

    # dropper 
    X.drop(['Academic Pressure', 'Work Pressure'], axis=1, inplace=True)
    X.drop(['id'],axis=1,inplace=True)

    
    #name_counts = X['Name'].value_counts().to_dict()
    #X['NameCount'] = X['Name'].map(name_counts)


    categorical_features = ['Profession', 'Degree','Name','City','Gender', 'Working Professional or Student', 'Sleep Duration', 'Dietary Habits', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
    numerical_features = X.drop(categorical_features, axis=1).columns

    if preprocessor is None:
       
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())])
        
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))])
        
        # Kombiner preprocessings.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)])
        
        # Tilpass og transformer X
        X_preprocessed = preprocessor.fit_transform(X)
    else:
        # Transform data basert på eksisterende preprocessor
        X_preprocessed = preprocessor.transform(X)

    return X_preprocessed, y, preprocessor




    

In [3]:



file_path = "Data/train.csv"
file_path_test = "Data/test.csv"

trainingSet = pd.read_csv(file_path)
X_test = pd.read_csv(file_path_test)

test_ids = X_test['id']
  
df_train, df_val = train_test_split(trainingSet, test_size=0.2, random_state=42)

# Preprocess treningsdata
X_train_preprocessed, y_train, preprocessor = prepare_data(df_train)

# Preprocess valideringsdata
X_val_preprocessed, y_val, _ = prepare_data(df_val, preprocessor=preprocessor)

# Preprocess testdata
X_test_preprocessed, _, _ = prepare_data(X_test, preprocessor=preprocessor)

input_size = X_train_preprocessed.shape[1]
print(f'Input size er: {input_size}')
print(X_train_preprocessed)





Input size er: 709
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1891157 stored elements and shape (112560, 709)>
  Coords	Values
  (0, 0)	1.1781079788457047
  (0, 1)	-1.3608581883143087e-15
  (0, 2)	2.200024066726753e-15
  (0, 3)	1.601307257907144
  (0, 4)	-0.584280048908395
  (0, 5)	-0.7004473126135766
  (0, 6)	-1.4444817551693707
  (0, 58)	1.0
  (0, 84)	1.0
  (0, 444)	1.0
  (0, 564)	1.0
  (0, 648)	1.0
  (0, 650)	1.0
  (0, 679)	1.0
  (0, 702)	1.0
  (0, 705)	1.0
  (0, 708)	1.0
  (1, 0)	1.0973739427544427
  (1, 1)	-1.3608581883143087e-15
  (1, 2)	2.200024066726753e-15
  (1, 3)	0.8116324006532557
  (1, 4)	1.2319167655524366
  (1, 5)	0.7169565746358945
  (1, 6)	1.4069890684340032
  (1, 58)	1.0
  :	:
  (112558, 458)	1.0
  (112558, 627)	1.0
  (112558, 647)	1.0
  (112558, 650)	1.0
  (112558, 669)	1.0
  (112558, 702)	1.0
  (112558, 705)	1.0
  (112558, 707)	1.0
  (112559, 0)	1.258842014936967
  (112559, 1)	-1.3608581883143087e-15
  (112559, 2)	2.200024066726753e-15
  (112559, 

Data Preprocessing

Modell

In [4]:
model = RandomForestClassifier(n_estimators= 100, criterion="gini", max_depth= 40, max_features="sqrt",random_state=42)
model.fit(X_train_preprocessed,y_train)

y_prediction = model.predict(X_val_preprocessed)


print(accuracy_score(y_val,y_prediction))



y_test_predict = model.predict(X_test_preprocessed)


#output = pd.DataFrame({'id': processed_test["id"], 'Depression': y_test_predict})
#output.to_csv('test_predictions.csv', index=False)


0.9336176261549396


Hyperparameter tuning RF


In [5]:
param_grid = {
    'n_estimators': [20,50,100],
    'criterion': ['entropy', 'gini'],
    'max_depth': [5, 10, 20 , 40],
    'max_features': ["sqrt", "log2"],
}


half_grid_search = HalvingGridSearchCV(estimator=model,param_grid=param_grid, factor=3, scoring='accuracy',cv=5)

half_grid_search.fit(X_train_preprocessed,y_train)

print(f"Beste parametere: {half_grid_search.best_params_}")


val_accuracy = half_grid_search.score(X_val_preprocessed,y_val)

print(f"Nøyaktighet på validation settet: {val_accuracy}")

Beste parametere: {'criterion': 'entropy', 'max_depth': 40, 'max_features': 'sqrt', 'n_estimators': 100}
Nøyaktighet på validation settet: 0.9330845771144278


Neural Network


In [6]:
input_size = 709

class DepressionDetector(nn.Module):
    #The __init__ method initializes the neural network layers and activation functions.
    #nn.Module is initialized using super().__init__() to inherit its functionalities.
    def __init__(self):
        super().__init__()  
        self.layer_1 = nn.Linear(input_size, 64)
        self.layer_2 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = torch.relu(self.layer_1(x))
        out = torch.relu(self.layer_2(out))
        out = self.sigmoid(self.output(out))
        return out    

net = DepressionDetector()


Konvertere til Tensorer

In [7]:

X_train_preprocessed = X_train_preprocessed.toarray() if sparse.issparse(X_train_preprocessed) else X_train_preprocessed
X_val_preprocessed = X_val_preprocessed.toarray() if sparse.issparse(X_val_preprocessed) else X_val_preprocessed
X_test_preprocessed = X_test_preprocessed.toarray() if sparse.issparse(X_test_preprocessed) else X_test_preprocessed


X_train_tensor = torch.tensor(X_train_preprocessed, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_val_tensor = torch.tensor(X_val_preprocessed, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test_preprocessed, dtype=torch.float32)

# Lag DataLoaders
batch_size = 100
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
input_size = X_train_tensor.shape[1]
model = DepressionDetector()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Treningssløyfe
num_epochs = 30
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    # skriv average loss
    
    # Utskrift av tap etter hver epoch for å overvåke treningen
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Evaluer modellen på valideringssett
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in val_loader:
        outputs = model(inputs)
        predicted = (outputs > 0.5).float()  # Konverter til 0 eller 1 basert på terskelverdien 0.5
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    accuracy = correct / total
    print(f'Valideringssett nøyaktighet: {accuracy * 100:.2f}%')


with torch.no_grad():
    test_predictions = model(X_test_tensor)
    test_predictions = (test_predictions > 0.5).float()


Epoch 1/30, Loss: 0.07248366624116898
Epoch 2/30, Loss: 0.2724033296108246
Epoch 3/30, Loss: 0.2520516812801361
Epoch 4/30, Loss: 0.13253477215766907
Epoch 5/30, Loss: 0.14240612089633942
Epoch 6/30, Loss: 0.12472046166658401
Epoch 7/30, Loss: 0.05775958672165871
Epoch 8/30, Loss: 0.08903398364782333
Epoch 9/30, Loss: 0.10771321505308151
Epoch 10/30, Loss: 0.17584064602851868
Epoch 11/30, Loss: 0.12663890421390533
Epoch 12/30, Loss: 0.2351321130990982
Epoch 13/30, Loss: 0.13412968814373016
Epoch 14/30, Loss: 0.05960649624466896
Epoch 15/30, Loss: 0.07568136602640152
Epoch 16/30, Loss: 0.12540262937545776
Epoch 17/30, Loss: 0.2189619094133377
Epoch 18/30, Loss: 0.06711052358150482
Epoch 19/30, Loss: 0.06748362630605698
Epoch 20/30, Loss: 0.07683656364679337
Epoch 21/30, Loss: 0.15260204672813416
Epoch 22/30, Loss: 0.05335124209523201
Epoch 23/30, Loss: 0.06429307162761688
Epoch 24/30, Loss: 0.06694560497999191
Epoch 25/30, Loss: 0.1359492987394333
Epoch 26/30, Loss: 0.05797705426812172


In [9]:

output = pd.DataFrame({'id': test_ids, 'Depression': test_predictions.squeeze().int().numpy()})
output.to_csv('NN_prediksjoner.csv', index=False)
print("Testprediksjoner er lagret i submission.csv")

Testprediksjoner er lagret i submission.csv
