In [12]:
pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.7 MB 1.2 MB/s eta 0:00:02
   ------ --------------------------------- 0.3/1.7 MB 1.9 MB/s eta 0:00:01
   --------------- ------------------------ 0.7/1.7 MB 3.6 MB/s eta 0:00:01
   -------------------------- ------------- 1.1/1.7 MB 4.9 MB/s eta 0:00:01
   ---------------------------------------  1.7/1.7 MB 6.1 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 6.1 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.18.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\Berry\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Flatten, concatenate, Lambda

In [2]:
df = pd.read_csv('https://media.githubusercontent.com/media/EthanRosehart/DS2_Assignments/refs/heads/main/final_dataset.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1473 entries, 0 to 1472
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Student_ID           1473 non-null   int64  
 1   Answer               1473 non-null   object 
 2   Age                  1473 non-null   int64  
 3   Gender               1473 non-null   object 
 4   CGPA                 1471 non-null   float64
 5   Stress_Level         1473 non-null   int64  
 6   Depression_Score     1473 non-null   int64  
 7   Anxiety_Score        1473 non-null   int64  
 8   Sleep_Quality        1473 non-null   object 
 9   Physical_Activity    1473 non-null   object 
 10  Diet_Quality         1473 non-null   object 
 11  Relationship_Status  1473 non-null   object 
 12  Financial_Stress     1473 non-null   int64  
 13  Negative_Emotion     1473 non-null   bool   
 14  Absences             1473 non-null   int64  
 15  Lates                1473 non-null   i

# First Model Test
* Removed Anxiety Score, Depression Score and Negative emotion (used to create the dataset)
* NLP to process the journal and Neural Network to process the rest
* Predicting Binary for At Risk

# LSTM + Neural Network

In [None]:
# Exclude Anxiety_Score, Depression_Score, and Negative_Emotion to create df_test
df_test = df.drop(columns=['Anxiety_Score', 'Depression_Score', 'Negative_Emotion'])

# Define input features and target
text_data = df['Answer']
structured_data = df.drop(columns=['Answer', 'Anxiety_Score', 'Depression_Score', 'Negative_Emotion', 'At_Risk'])
target = df['At_Risk']

# Split into train and test sets
X_text_train, X_text_test, X_struct_train, X_struct_test, y_train, y_test = train_test_split(
    text_data, structured_data, target, test_size=0.2, random_state=42
)

# Preprocessing for structured data
scaler = StandardScaler()
X_struct_train_scaled = scaler.fit_transform(X_struct_train.select_dtypes(include=['float64', 'int64']))
X_struct_test_scaled = scaler.transform(X_struct_test.select_dtypes(include=['float64', 'int64']))

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_struct_train_encoded = encoder.fit_transform(X_struct_train.select_dtypes(include=['object']))
X_struct_test_encoded = encoder.transform(X_struct_test.select_dtypes(include=['object']))

# Combine scaled and encoded structured data
X_struct_train_final = np.hstack([X_struct_train_scaled, X_struct_train_encoded])
X_struct_test_final = np.hstack([X_struct_test_scaled, X_struct_test_encoded])

# Preprocessing for text data
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_text_train)
X_text_train_seq = tokenizer.texts_to_sequences(X_text_train)
X_text_test_seq = tokenizer.texts_to_sequences(X_text_test)

max_seq_length = 100
X_text_train_padded = tf.keras.preprocessing.sequence.pad_sequences(X_text_train_seq, maxlen=max_seq_length, padding='post')
X_text_test_padded = tf.keras.preprocessing.sequence.pad_sequences(X_text_test_seq, maxlen=max_seq_length, padding='post')

# Define the NLP model
text_input = Input(shape=(max_seq_length,), name='text_input')
embedding = Embedding(input_dim=10000, output_dim=128)(text_input)
lstm = LSTM(64)(embedding)
text_output = Dense(32, activation='relu')(lstm)

# Define the structured data model
struct_input = Input(shape=(X_struct_train_final.shape[1],), name='struct_input')
dense1 = Dense(64, activation='relu')(struct_input)
dense2 = Dense(32, activation='relu')(dense1)
struct_output = Dense(16, activation='relu')(dense2)

# Combine the two models
combined = concatenate([text_output, struct_output])
dense_combined = Dense(64, activation='relu')(combined)
dropout = Dropout(0.5)(dense_combined)
output = Dense(1, activation='sigmoid', name='output')(dropout)

# Build the model
model = Model(inputs=[text_input, struct_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    [X_text_train_padded, X_struct_train_final], y_train,
    validation_data=([X_text_test_padded, X_struct_test_final], y_test),
    epochs=10, batch_size=32
)

# Evaluate the model
loss, accuracy = model.evaluate([X_text_test_padded, X_struct_test_final], y_test)
print(f"Test Accuracy: {accuracy:.2f}")



Epoch 1/10




[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.6681 - loss: 0.6468 - val_accuracy: 0.7390 - val_loss: 0.5809
Epoch 2/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7466 - loss: 0.5756 - val_accuracy: 0.7390 - val_loss: 0.5816
Epoch 3/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7332 - loss: 0.5983 - val_accuracy: 0.7390 - val_loss: 0.5695
Epoch 4/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.7427 - loss: 0.5783 - val_accuracy: 0.7356 - val_loss: 0.5713
Epoch 5/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7509 - loss: 0.5768 - val_accuracy: 0.7322 - val_loss: 0.5799
Epoch 6/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7289 - loss: 0.5935 - val_accuracy: 0.7356 - val_loss: 0.5773
Epoch 7/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━

# BERT + Neural Network

In [22]:
# Load and split the data
df_test = df.drop(columns=['Anxiety_Score', 'Depression_Score', 'Negative_Emotion'])
df_test.to_csv("df_test.csv", index=False)

# Define input features and target
text_data = df['Answer']
structured_data = df.drop(columns=['Answer', 'Anxiety_Score', 'Depression_Score', 'Negative_Emotion', 'At_Risk'])
target = df['At_Risk']

# Split into train and test sets
X_text_train, X_text_test, X_struct_train, X_struct_test, y_train, y_test = train_test_split(
    text_data, structured_data, target, test_size=0.2, random_state=42
)

# Preprocessing for structured data
scaler = StandardScaler()
X_struct_train_scaled = scaler.fit_transform(X_struct_train.select_dtypes(include=['float64', 'int64']))
X_struct_test_scaled = scaler.transform(X_struct_test.select_dtypes(include=['float64', 'int64']))

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_struct_train_encoded = encoder.fit_transform(X_struct_train.select_dtypes(include=['object']))
X_struct_test_encoded = encoder.transform(X_struct_test.select_dtypes(include=['object']))

# Combine scaled and encoded structured data
X_struct_train_final = np.hstack([X_struct_train_scaled, X_struct_train_encoded])
X_struct_test_final = np.hstack([X_struct_test_scaled, X_struct_test_encoded])

# Preprocessing for BERT
bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

def encode_texts(texts, tokenizer, max_length=128):
    return tokenizer(
        list(texts),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="tf"
    )

X_text_train_encoded = encode_texts(X_text_train, tokenizer)
X_text_test_encoded = encode_texts(X_text_test, tokenizer)

# Load BERT model and tokenizer
bert_model_name = "bert-base-uncased"
bert_model = TFBertModel.from_pretrained(bert_model_name)
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Define inputs for the Keras Functional API
text_input_ids = Input(shape=(128,), dtype=tf.int32, name="input_ids")
text_attention_mask = Input(shape=(128,), dtype=tf.int32, name="attention_mask")

# Manually wrap BERT model in a Lambda layer to process inputs
def bert_layer(inputs):
    input_ids, attention_mask = inputs
    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
    return outputs.pooler_output

# Add Lambda layer to preprocess inputs for BERT
bert_output = Lambda(
    bert_layer,
    output_shape=(768,),  # BERT's pooled output is 768-dimensional
)([text_input_ids, text_attention_mask])

text_output = Dense(64, activation="relu")(bert_output)

# Example structured data input
struct_input = Input(shape=(X_struct_train_final.shape[1],), name="struct_input")
struct_output = Dense(32, activation="relu")(struct_input)

# Combine text and structured data outputs
combined = concatenate([text_output, struct_output])
dense_combined = Dense(64, activation="relu")(combined)
dropout = Dropout(0.5)(dense_combined)
output = Dense(1, activation="sigmoid")(dropout)

# Build the model
model = Model(inputs=[text_input_ids, text_attention_mask, struct_input], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Print model summary
model.summary()


# Define the structured data model
struct_input = Input(shape=(X_struct_train_final.shape[1],), name='struct_input')
dense1 = Dense(64, activation='relu')(struct_input)
dense2 = Dense(32, activation='relu')(dense1)
struct_output = Dense(16, activation='relu')(dense2)

# Combine the two models
combined = concatenate([text_output, struct_output])
dense_combined = Dense(64, activation='relu')(combined)
dropout = Dropout(0.5)(dense_combined)
output = Dense(1, activation='sigmoid', name='output')(dropout)

# Build the combined model
model = Model(inputs=[text_input_ids, text_attention_mask, struct_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    [X_text_train_encoded['input_ids'], X_text_train_encoded['attention_mask'], X_struct_train_final],
    y_train,
    validation_data=(
        [X_text_test_encoded['input_ids'], X_text_test_encoded['attention_mask'], X_struct_test_final], y_test
    ),
    epochs=3, batch_size=16
)

# Evaluate the model
loss, accuracy = model.evaluate(
    [X_text_test_encoded['input_ids'], X_text_test_encoded['attention_mask'], X_struct_test_final],
    y_test
)
print(f"Test Accuracy: {accuracy:.2f}")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/3




[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 915ms/step - accuracy: 0.7043 - loss: 0.6220 - val_accuracy: 0.7390 - val_loss: 0.5870
Epoch 2/3
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 889ms/step - accuracy: 0.7266 - loss: 0.5885 - val_accuracy: 0.7390 - val_loss: 0.5708
Epoch 3/3
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 892ms/step - accuracy: 0.7131 - loss: 0.5870 - val_accuracy: 0.7390 - val_loss: 0.5530
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1s/step - accuracy: 0.7233 - loss: 0.5639
Test Accuracy: 0.74
