In [None]:
%%capture
!pip install --upgrade gensim
!gdown --folder 1rl_TJMHtcP-S0fTHNUpr9oi8Ndz2Y5z4 -O project_data
!gdown --folder 1RmC_We2lfsJontwrxlc4dzqEY_mNy-UW -O word2vec_model

In [1]:
!git clone https://github.com/AleksL04/ML-Project.git

Cloning into 'ML-Project'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
Receiving objects: 100% (3/3), 9.71 KiB | 9.71 MiB/s, done.
remote: Total 3 (delta 0), reused 3 (delta 0), pack-reused 0 (from 0)[K


In [2]:
%cd ML-Project

/content/ML-Project


In [3]:
ls -l

total 80
-rw-r--r-- 1 root root 80098 Dec 10 20:57 Ensemble_Models.ipynb


In [None]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
import spacy
from gensim.models import KeyedVectors
from tensorflow.keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Masking
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics.pairwise import cosine_distances

In [None]:
#import gensim.downloader as api
#model = api.load('glove-wiki-gigaword-100')
#save_path = '/content/drive/MyDrive/glove-wiki-gigaword-100.vectors'
#model.save(save_path)

In [None]:
train_df = pd.read_csv('project_data/train.csv')
valid_df = pd.read_csv('project_data/valid.csv')
test_df = pd.read_csv('project_data/test.csv')

In [None]:
train_df.head()
#

Unnamed: 0,text,label
0,states slow to shut down weak teacher educatio...,0
1,drone places fresh kill on steps of white house,1
2,report: majority of instances of people gettin...,1
3,"sole remaining lung filled with rich, satisfyi...",1
4,the gop's stockholm syndrome,0


In [None]:
model_path = 'word2vec_model/glove-wiki-gigaword-100.vectors'
word2vec_dict = KeyedVectors.load(model_path)

In [None]:
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "lemmatizer"])

In [None]:
def process_df(df, nlp):
    df['text'] = df['text'].str.lower()
    tokens_list = []
    for doc in nlp.pipe(df['text'].astype(str), batch_size=1000):
        tokens_list.append([token.text for token in doc if not token.is_space])

    df['text_split'] = tokens_list
    return df

def add_embedding(df, word2vec_dict):
    np.random.seed(42)
    random_vec = np.random.rand(100)*2-1

    embedding_list = []
    for tokens in df['text_split']:
        embedding = []
        for token in tokens:
            if token in word2vec_dict:
                embedding.append(word2vec_dict[token])
            else:
                embedding.append(random_vec)
        embedding_list.append(embedding)
    df['embedding'] = embedding_list
    return df

def prepare_X_y(df, max_len=50, vector_size=100):
    # Initialize a 3D matrix of zeros: (Num_Samples, Max_Time_Steps, Features)
    X = np.zeros((len(df), max_len, vector_size))

    # Fill the matrix
    for i, seq in enumerate(df['embedding']):
        # Truncate if longer than max_len
        length = min(len(seq), max_len)
        if length > 0:
            # Assign the sequence to the matrix (automatic padding at the end)
            X[i, :length, :] = np.array(seq)[:length]
    y = df['label'].values
    return X, y


In [None]:
train_clean = process_df(train_df, nlp)
valid_clean = process_df(valid_df, nlp)
test_clean = process_df(test_df, nlp)


In [None]:
train_clean = add_embedding(train_clean, word2vec_dict)
valid_clean = add_embedding(valid_clean, word2vec_dict)
test_clean = add_embedding(test_clean, word2vec_dict)

In [None]:
train_clean.head()

Unnamed: 0,text,label,text_split,embedding
0,states slow to shut down weak teacher educatio...,0,"[states, slow, to, shut, down, weak, teacher, ...","[[0.13815, 0.45166, 0.93858, 0.055307, 0.70642..."
1,drone places fresh kill on steps of white house,1,"[drone, places, fresh, kill, on, steps, of, wh...","[[-0.89403, 0.29261, 0.35079, 0.23718, -0.2033..."
2,report: majority of instances of people gettin...,1,"[report, :, majority, of, instances, of, peopl...","[[-0.59537, -0.32836, 0.58577, -0.74072, -0.09..."
3,"sole remaining lung filled with rich, satisfyi...",1,"[sole, remaining, lung, filled, with, rich, ,,...","[[0.29567, -0.95131, 0.26483, -0.014256, 0.356..."
4,the gop's stockholm syndrome,0,"[the, gop, 's, stockholm, syndrome]","[[-0.038194, -0.24487, 0.72812, -0.39961, 0.08..."


In [None]:
MAX_LEN = 50
VECTOR_SIZE = 100
X_train, y_train = prepare_X_y(train_clean, MAX_LEN, VECTOR_SIZE)
X_valid, y_valid = prepare_X_y(valid_clean, MAX_LEN, VECTOR_SIZE)
X_test, y_test   = prepare_X_y(test_clean, MAX_LEN, VECTOR_SIZE)

In [None]:
from tensorflow.keras.layers import GlobalMaxPooling1D

In [None]:
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

model = Sequential()

model.add(Bidirectional(LSTM(64, return_sequences=True, dropout=0.2)))

model.add(GlobalMaxPooling1D())

model.add(Dense(16, activation='relu')) # Intermediate dense layer
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 5. Callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=1, min_lr=1e-5)
]

# history = model.fit(
#     X_train, y_train,
#     validation_data=(X_valid, y_valid),
#     epochs=20, # Increased epochs because we have EarlyStopping
#     batch_size=32,
#     callbacks=callbacks
# )

In [None]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8928 - loss: 0.2253
Test Accuracy: 0.8996


In [None]:
# Train your first model as usual
history1 = model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=0, validation_data=(X_valid, y_valid), callbacks=callbacks)
print("Model 1 Trained.")

Model 1 Trained.


In [None]:
import numpy as np

# 1. Get predictions (returns shape (N, 1))
train_preds = model.predict(X_train)

# 2. Force both to be flat 1D arrays
# This prevents the massive grid creation
y_train_flat = y_train.flatten() if hasattr(y_train, 'flatten') else np.array(y_train).flatten()
preds_flat = train_preds.flatten()

# 3. Calculate Error safely
errors = np.abs(y_train_flat - preds_flat)

# 4. Create Weights
sample_weights = 1.0 + (errors * 10.0)

print(f"Weights shape: {sample_weights.shape}") # Should now be (21464,)

[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
Weights shape: (21464,)


In [None]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

# Let's use a slightly different architecture (CNN) for diversity
# CNNs look at local patterns, LSTMs look at sequences. They complement each other.
model2 = Sequential()
model2.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model2.add(GlobalMaxPooling1D())
model2.add(Dense(8, activation='relu'))
model2.add(Dropout(0.5))
model2.add(Dense(1, activation='sigmoid'))

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train Model 2 with the calculated weights
history2 = model2.fit(
    X_train,
    y_train,
    sample_weight=sample_weights, # <--- THIS IS THE KEY
    validation_data=(X_valid, y_valid),
    epochs=20,
    batch_size=32,
    callbacks=callbacks
)

Epoch 1/20
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.4794 - loss: 1.4313 - val_accuracy: 0.4986 - val_loss: 0.6930 - learning_rate: 0.0010
Epoch 2/20
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5371 - loss: 1.3939 - val_accuracy: 0.6648 - val_loss: 0.6198 - learning_rate: 0.0010
Epoch 3/20
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6171 - loss: 1.3600 - val_accuracy: 0.8268 - val_loss: 0.5502 - learning_rate: 0.0010
Epoch 4/20
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7053 - loss: 1.2620 - val_accuracy: 0.7584 - val_loss: 0.5432 - learning_rate: 0.0010
Epoch 5/20
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7194 - loss: 1.1950 - val_accuracy: 0.8226 - val_loss: 0.4554 - learning_rate: 0.0010
Epoch 6/20
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [None]:
import numpy as np

# 1. Get predictions from both models on VALIDATION data
# These are inputs for the meta-learner
val_pred1 = model.predict(X_valid)
val_pred2 = model2.predict(X_valid)

# 2. Stack them side-by-side
# Result shape: (num_valid_samples, 2)
# Row 1 example: [0.98, 0.45] (Model 1 was confident, Model 2 wasn't)
X_meta_train = np.column_stack((val_pred1, val_pred2))

# The target is the original validation labels
y_meta_train = y_valid

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step


In [None]:
# A very simple "Blender" model
meta_model = Sequential()

# Input dim is 2 because we have 2 models feeding into it
meta_model.add(Dense(4,input_dim=2, activation='relu'))
meta_model.add(Dense(1, activation='sigmoid'))

# Compile
meta_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train on the outputs of the previous models
meta_model.fit(X_meta_train, y_meta_train, epochs=100, batch_size=32, verbose=1, callbacks=callbacks)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.4923 - loss: 0.6905 - learning_rate: 0.0010
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5049 - loss: 0.6833 - learning_rate: 0.0010
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5400 - loss: 0.6777 - learning_rate: 0.0010
Epoch 4/100


  current = self.get_monitor_value(logs)
  callback.on_epoch_end(epoch, logs)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5607 - loss: 0.6730 - learning_rate: 0.0010
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6000 - loss: 0.6696 - learning_rate: 0.0010
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6310 - loss: 0.6688 - learning_rate: 0.0010
Epoch 7/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6569 - loss: 0.6644 - learning_rate: 0.0010
Epoch 8/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6745 - loss: 0.6568 - learning_rate: 0.0010
Epoch 9/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7170 - loss: 0.6520 - learning_rate: 0.0010
Epoch 10/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7363 - loss: 0.6472 - learning_rate: 0.0010
Epoch 11/1

<keras.src.callbacks.history.History at 0x794ea365b080>

In [None]:
# 1. Get base predictions on TEST set
test_pred1 = model.predict(X_test)
test_pred2 = model2.predict(X_test)

# 2. Stack them
X_meta_test = np.column_stack((test_pred1, test_pred2))

# 3. Final prediction using the Meta Model
final_predictions = meta_model.predict(X_meta_test)

print("Final Stacked Accuracy:")
meta_model.evaluate(X_meta_test, y_test) # Assuming you have y_test

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Final Stacked Accuracy:
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8956 - loss: 0.2489


[0.2545756697654724, 0.9006211161613464]