In [7]:
import json
import numpy as np

# Load the JSON file
json_file = "LLMgenerated_outputs.json"  # Replace with your actual file path
with open(json_file, "r") as file:
    data = json.load(file)


    
labels = []
for image_path, output in data.items():  # Iterate over key-value pairs
    if "notdepressed" in image_path.lower():
        labels.append(0)
    elif "depressed" in image_path.lower():
        labels.append(1)

# Save labels to a NumPy file
labels = np.array(labels)
np.save("labels.npy", labels)
print(f"Labels vector saved with {len(labels)} entries.")


Labels vector saved with 10000 entries.


In [8]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import json

# Load JSON file

# Set up device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)

def get_text_embeddings(caption):
    """
    Generate BERT embeddings for a given caption.
    """
    inputs = tokenizer(caption, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.cpu().numpy()

# Generate text embeddings for all captions in JSON data
text_embeddings = {}
for image_path, caption in data.items():  # Iterate over key-value pairs
    try:
        embedding = get_text_embeddings(caption)
        text_embeddings[image_path] = embedding
    except Exception as e:
        print(f"Error processing {image_path}: {e}")

# Save text embeddings as a NumPy file
np.save("llm_text_embeddings_10k.npy", text_embeddings)
print(f"Text embeddings saved for {len(text_embeddings)} items.")





Text embeddings saved for 10000 items.


In [10]:
import numpy as np
import json

# Load embeddings and labels
text_embeddings_file = "text_embeddings.npy"  # BERT-generated text embeddings
image_embeddings_file = "image_embeddings.npy"  # Image embeddings
llm_text_embeddings_file = "llm_text_embeddings_10k.npy"  # LLM-generated text embeddings
labels_file = "labels.npy"  # Labels

# Load data
text_embeddings = np.load(text_embeddings_file, allow_pickle=True).item()
image_embeddings = np.load(image_embeddings_file, allow_pickle=True).item()
llm_text_embeddings = np.load(llm_text_embeddings_file, allow_pickle=True).item()
labels = np.load(labels_file)

# Load JSON to get the image paths
json_file = "LLMgenerated_outputs.json"  # Replace with your actual file path

with open(json_file, "r") as file:
    data = json.load(file)

# Initialize feature matrix and labels
feature_matrix = []
final_labels = []

# Iterate over all data and concatenate embeddings
for idx, (image_path, llm_caption) in enumerate(data.items()):
    if image_path in text_embeddings and image_path in image_embeddings and image_path in llm_text_embeddings:
        # Concatenate all embeddings into a single vector
        combined_vector = np.concatenate((
            text_embeddings[image_path].flatten(),  # Flatten BERT text embedding
            image_embeddings[image_path].flatten(),  # Flatten image embedding
            llm_text_embeddings[image_path].flatten()  # Flatten LLM text embedding
        ))
        feature_matrix.append(combined_vector)
        final_labels.append(labels[idx])  # Ensure labels are aligned with the features

# Convert feature matrix and labels to NumPy arrays
feature_matrix = np.array(feature_matrix)
final_labels = np.array(final_labels)

# Save the feature matrix and labels for future use
np.save("feature_matrix.npy", feature_matrix)
np.save("final_labels.npy", final_labels)

print(f"Feature matrix shape: {feature_matrix.shape}")
print(f"Labels shape: {final_labels.shape}")


Feature matrix shape: (10000, 2304)
Labels shape: (10000,)


In [2]:
!python -m pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/32/93/66826e2f50cefecbb0a44bd1e667316bf0a3c8e78cd1f0cdf52f5b2c5c6f/xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata
  Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.3


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Load feature matrix and labels
feature_matrix = np.load("feature_matrix.npy")
final_labels = np.load("final_labels.npy")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, final_labels, test_size=0.2, random_state=42, stratify=final_labels
)

# Standardize the feature matrix
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Function to evaluate classifiers
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """
    Train the model and evaluate it on the test set.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC AUC": roc_auc_score(y_test, y_prob) if y_prob is not None else "N/A",
        "Confusion Matrix": confusion_matrix(y_test, y_pred),
    }
    return metrics

# List of classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine (SVM)": SVC(probability=True),
    "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
}

# Train and evaluate each classifier
results = {}
for name, clf in classifiers.items():
    print(f"Training {name}...")
    metrics = evaluate_model(clf, X_train, X_test, y_train, y_test)
    results[name] = metrics
    print(f"Results for {name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")
    print()

# Print a summary of all results
print("Summary of Classifier Performance:")
for name, metrics in results.items():
    print(f"{name}: Accuracy = {metrics['Accuracy']:.2f}, F1 Score = {metrics['F1 Score']:.2f}, ROC AUC = {metrics['ROC AUC']}")


Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Results for Logistic Regression:
  Accuracy: 0.578
  Precision: 0.47619047619047616
  Recall: 0.44554455445544555
  F1 Score: 0.46035805626598464
  ROC AUC: 0.5735498413515849
  Confusion Matrix: [[796 396]
 [448 360]]

Training Random Forest...
Results for Random Forest:
  Accuracy: 0.617
  Precision: 0.5921052631578947
  Recall: 0.1670792079207921
  F1 Score: 0.26061776061776065
  ROC AUC: 0.5987425451857266
  Confusion Matrix: [[1099   93]
 [ 673  135]]

Training Support Vector Machine (SVM)...
Results for Support Vector Machine (SVM):
  Accuracy: 0.6275
  Precision: 0.6046511627906976
  Recall: 0.22524752475247525
  F1 Score: 0.32822362488728585
  ROC AUC: 0.6351257766296763
  Confusion Matrix: [[1073  119]
 [ 626  182]]

Training K-Nearest Neighbors (KNN)...
Results for K-Nearest Neighbors (KNN):
  Accuracy: 0.5865
  Precision: 0.48671328671328673
  Recall: 0.4306930693069307
  F1 Score: 0.45699277741300065
  ROC AUC: 0.5891218893281945
  Confusion Matrix: [[825 367]
 [460 348]]



Parameters: { "use_label_encoder" } are not used.



Results for XGBoost:
  Accuracy: 0.6005
  Precision: 0.5080500894454383
  Recall: 0.35148514851485146
  F1 Score: 0.415508412582297
  ROC AUC: 0.6041997184198284
  Confusion Matrix: [[917 275]
 [524 284]]

Summary of Classifier Performance:
Logistic Regression: Accuracy = 0.58, F1 Score = 0.46, ROC AUC = 0.5735498413515849
Random Forest: Accuracy = 0.62, F1 Score = 0.26, ROC AUC = 0.5987425451857266
Support Vector Machine (SVM): Accuracy = 0.63, F1 Score = 0.33, ROC AUC = 0.6351257766296763
K-Nearest Neighbors (KNN): Accuracy = 0.59, F1 Score = 0.46, ROC AUC = 0.5891218893281945
Decision Tree: Accuracy = 0.53, F1 Score = 0.43, ROC AUC = 0.5134404694664096
Gradient Boosting: Accuracy = 0.61, F1 Score = 0.30, ROC AUC = 0.5962548383613528
XGBoost: Accuracy = 0.60, F1 Score = 0.42, ROC AUC = 0.6041997184198284


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
# Load feature matrix and labels
feature_matrix = np.load("feature_matrix.npy")
final_labels = np.load("final_labels.npy")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, final_labels, test_size=0.2, random_state=42, stratify=final_labels
)

# Standardize the feature matrix
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Define hyperparameter grid
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Initialize the model
knn = KNeighborsClassifier()

# Perform Grid Search
grid_search_knn = GridSearchCV(knn, knn_param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)
grid_search_knn.fit(X_train, y_train)

# Best parameters and score
print(f"Best Parameters for KNN: {grid_search_knn.best_params_}")
print(f"Best F1 Score for KNN: {grid_search_knn.best_score_}")


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters for KNN: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
Best F1 Score for KNN: 0.45387862730444517


In [5]:
from sklearn.linear_model import LogisticRegression

# Define hyperparameter grid
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}

# Initialize the model
logistic_regression = LogisticRegression(max_iter=500)

# Perform Grid Search
grid_search_lr = GridSearchCV(logistic_regression, lr_param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)
grid_search_lr.fit(X_train, y_train)

# Best parameters and score
print(f"Best Parameters for Logistic Regression: {grid_search_lr.best_params_}")
print(f"Best F1 Score for Logistic Regression: {grid_search_lr.best_score_}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters for Logistic Regression: {'C': 0.01, 'solver': 'liblinear'}
Best F1 Score for Logistic Regression: 0.462310016807717


In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
dt_param_grid = {
    'criterion': ['gini', 'entropy'],             # Split quality measure
    'max_depth': [None, 10, 20, 30],              # Depth of the tree
    'max_features': [None, 'sqrt', 'log2'],       # Max features for a split
}

# Initialize Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)

# Perform Grid Search
grid_search_dt = GridSearchCV(
    dt, dt_param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1
)
grid_search_dt.fit(X_train, y_train)

# Print the best parameters and best F1 score
print(f"Best Parameters for Decision Tree: {grid_search_dt.best_params_}")
print(f"Best F1 Score for Decision Tree: {grid_search_dt.best_score_}")


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters for Decision Tree: {'criterion': 'gini', 'max_depth': None, 'max_features': None}
Best F1 Score for Decision Tree: 0.4498924334931441
