In [7]:
# Cell 1: Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel
import torch
import joblib
import random
from tqdm import tqdm
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Set seed for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)


<torch._C.Generator at 0x7f84c0b2e1b0>

In [8]:
# Compile the regular expression for removing special characters and punctuation
special_char_re = re.compile(r'[^a-zA-Z0-9\s]')

# Download necessary NLTK resources if not already downloaded
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Predefine the stop words
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Cleans the input text by converting to lowercase, removing special characters and punctuation,
    tokenizing, removing stop words, and joining the tokens back into a string.
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and punctuation
    text = special_char_re.sub('', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    
    # Join tokens back into a string
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

In [9]:
# Cell 3: Load the labeled dataset and clean it
labeled_df = pd.read_csv('Stormfront_labeled.csv')
print("Labeled dataset loaded. Shape:", labeled_df.shape)

# Extract relevant columns
labeled_df = labeled_df[['Message', 'CM', 'AOPV', 'CDACT', 'TI', 'TTBF']]
print("Relevant columns from labeled dataset extracted. Shape:", labeled_df.shape)

# Clean the messages
labeled_df['Message'] = labeled_df['Message'].apply(clean_text)
print("Messages cleaned.")


Labeled dataset loaded. Shape: (491, 42)
Relevant columns from labeled dataset extracted. Shape: (491, 6)
Messages cleaned.


In [10]:
# Cell 4: Load the unlabeled dataset in chunks and clean it
chunk_size = 100000  # Adjust chunk size as needed
target_unlabeled_size = 5000  # Increased target size for testing
chunks = []

for chunk in pd.read_csv('cleanposts.csv', chunksize=chunk_size):
    # Filter out non-English messages
    chunk = chunk[chunk['lang'] == 'en']
    chunks.append(chunk[['cleanmessage']].rename(columns={'cleanmessage': 'Message'}))
    
    # Check if we have reached the target size
    if sum(len(c) for c in chunks) >= target_unlabeled_size:
        break

unlabeled_df = pd.concat(chunks).head(target_unlabeled_size)
unlabeled_df['Message'] = unlabeled_df['Message'].apply(clean_text)
print("Unlabeled dataset loaded and sampled. Shape:", unlabeled_df.shape)


Unlabeled dataset loaded and sampled. Shape: (5000, 1)


In [11]:
# Cell 5: TF-IDF Vectorization with hyperparameter tuning
from sklearn.metrics import make_scorer
from sklearn.metrics.pairwise import cosine_similarity

# Custom scoring function for TF-IDF Vectorizer
def tfidf_scorer(estimator, X):
    X_transformed = estimator.transform(X)
    score = np.mean(cosine_similarity(X_transformed))
    return score

# Define a function to manually perform TF-IDF Vectorization with hyperparameter tuning
def tune_tfidf_vectorizer(texts):
    param_grid = {
        'max_features': [5000, 10000, 20000],
        'ngram_range': [(1, 1), (1, 2)],
        'min_df': [5, 10],
        'max_df': [0.85, 0.9]
    }
    tfidf = TfidfVectorizer()
    grid_search = GridSearchCV(tfidf, param_grid, cv=3, n_jobs=-1, verbose=2, scoring=make_scorer(tfidf_scorer))
    grid_search.fit(texts)  # Only fit on texts
    return grid_search.best_estimator_

# Perform TF-IDF Vectorization with tuning on the labeled dataset
tfidf_vectorizer = tune_tfidf_vectorizer(labeled_df['Message'])
tfidf_features_labeled = tfidf_vectorizer.fit_transform(labeled_df['Message']).toarray()
tfidf_features_unlabeled = tfidf_vectorizer.transform(unlabeled_df['Message']).toarray()


Fitting 3 folds for each of 24 candidates, totalling 72 fits


  pid = os.fork()
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
Traceback (most recent call last):
  File "/Users/aryan/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'



[CV] END max_df=0.85, max_features=5000, min_df=5, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.85, max_features=5000, min_df=5, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.85, max_features=5000, min_df=5, ngram_range=(1, 1); total time=   0.1s


Traceback (most recent call last):
  File "/Users/aryan/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/Users/aryan/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'



[CV] END max_df=0.85, max_features=10000, min_df=5, ngram_range=(1, 1); total time=   0.2s
[CV] END max_df=0.85, max_features=10000, min_df=5, ngram_range=(1, 1); total time=   0.2s
[CV] END max_df=0.85, max_features=10000, min_df=5, ngram_range=(1, 1); total time=   0.1s


  from pandas.core import (


[CV] END max_df=0.85, max_features=10000, min_df=5, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.85, max_features=10000, min_df=5, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.85, max_features=10000, min_df=5, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.85, max_features=10000, min_df=10, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.85, max_features=10000, min_df=10, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.85, max_features=10000, min_df=10, ngram_range=(1, 1); total time=   0.0s
[CV] END max_df=0.85, max_features=5000, min_df=5, ngram_range=(1, 2); total time=   0.2s


  from pandas.core import (
  from pandas.core import (
Traceback (most recent call last):
  File "/Users/aryan/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

  from pandas.core import (
  from pandas.core import (


[CV] END max_df=0.85, max_features=10000, min_df=10, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.85, max_features=10000, min_df=10, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.85, max_features=10000, min_df=10, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.85, max_features=20000, min_df=5, ngram_range=(1, 1); total time=   0.0s
[CV] END max_df=0.85, max_features=20000, min_df=5, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.85, max_features=5000, min_df=5, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.85, max_features=20000, min_df=5, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.85, max_features=5000, min_df=5, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.85, max_features=20000, min_df=10, ngram_range=(1, 1); total time=   0.1s


Traceback (most recent call last):
  File "/Users/aryan/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/Users/aryan/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

  from pandas.core import (
Traceback (most recent call last):
  File "/Users/aryan/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'



[CV] END max_df=0.85, max_features=20000, min_df=10, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.85, max_features=20000, min_df=5, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.85, max_features=20000, min_df=10, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.85, max_features=20000, min_df=5, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.85, max_features=5000, min_df=10, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.85, max_features=20000, min_df=5, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.9, max_features=5000, min_df=5, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.9, max_features=5000, min_df=5, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.9, max_features=5000, min_df=5, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.85, max_features=20000, min_df=10, ngram_range=(1, 2); total time=   0.3s
[CV] END max_df=0.85, max_features=20000, min_df=10, ngram_range=(1, 2); total time=   0.3s
[

Traceback (most recent call last):
  File "/Users/aryan/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/Users/aryan/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'



[CV] END max_df=0.9, max_features=10000, min_df=5, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.9, max_features=5000, min_df=10, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.9, max_features=5000, min_df=10, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.9, max_features=5000, min_df=10, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.9, max_features=10000, min_df=10, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.9, max_features=10000, min_df=10, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.9, max_features=10000, min_df=10, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.9, max_features=10000, min_df=5, ngram_range=(1, 2); total time=   0.3s
[CV] END max_df=0.9, max_features=20000, min_df=5, ngram_range=(1, 1); total time=   0.0s
[CV] END max_df=0.85, max_features=5000, min_df=10, ngram_range=(1, 2); total time=   0.3s
[CV] END max_df=0.9, max_features=20000, min_df=5, ngram_range=(1, 1); total time=   0.1s
[CV] E

Traceback (most recent call last):
  File "/Users/aryan/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/Users/aryan/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/Users/aryan/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'



[CV] END max_df=0.9, max_features=20000, min_df=5, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.9, max_features=20000, min_df=10, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.85, max_features=5000, min_df=10, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.9, max_features=10000, min_df=10, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.9, max_features=20000, min_df=10, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.9, max_features=20000, min_df=10, ngram_range=(1, 1); total time=   0.1s
[CV] END max_df=0.9, max_features=10000, min_df=10, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.9, max_features=10000, min_df=10, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.9, max_features=20000, min_df=5, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.9, max_features=20000, min_df=5, ngram_range=(1, 2); total time=   0.2s
[CV] END max_df=0.9, max_features=20000, min_df=5, ngram_range=(1, 2); total time=   0.2s
[CV

 nan nan nan nan nan nan]


In [12]:
# Cell 6: BERT Feature Extraction
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = AutoModel.from_pretrained('distilbert-base-uncased')

def get_bert_features(texts, tokenizer, model, batch_size=32):
    features = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Extracting BERT Features"):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=256)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_features = outputs.last_hidden_state.mean(dim=1).numpy()
        features.append(batch_features)
    return np.vstack(features)

# Extract BERT features
bert_features_labeled = get_bert_features(labeled_df['Message'].tolist(), tokenizer, distilbert_model)
bert_features_unlabeled = get_bert_features(unlabeled_df['Message'].tolist(), tokenizer, distilbert_model)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Extracting BERT Features: 100%|██████████| 16/16 [00:49<00:00,  3.12s/it]
Extracting BERT Features: 100%|██████████| 157/157 [07:07<00:00,  2.72s/it]


In [13]:
# Cell 7: Combine TF-IDF and BERT features
labeled_features = np.hstack([tfidf_features_labeled, bert_features_labeled])
unlabeled_features = np.hstack([tfidf_features_unlabeled, bert_features_unlabeled])

# Convert to DataFrame for compatibility with model training
labeled_features_df = pd.DataFrame(labeled_features)
unlabeled_features_df = pd.DataFrame(unlabeled_features)

# Add target columns to the labeled features DataFrame
labeled_features_df[['CM', 'AOPV', 'CDACT', 'TI', 'TTBF']] = labeled_df[['CM', 'AOPV', 'CDACT', 'TI', 'TTBF']]
# Split the labeled data into training and validation sets
train_df, val_df = train_test_split(labeled_features_df, test_size=0.2, random_state=42)
print(f"Training data shape: {train_df.shape}, Validation data shape: {val_df.shape}")

# Save train and validation data to disk
train_df.to_csv('train_df.csv', index=False)
val_df.to_csv('val_df.csv', index=False)


Training data shape: (392, 1562), Validation data shape: (99, 1562)


In [14]:
# Cell 8: Define parameter grids for the models
param_grid_rf = {
    'estimator__n_estimators': [100, 200, 300, 500, 1000],
    'estimator__max_depth': [10, 20, 30, 50],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4, 8],
    'estimator__max_features': ['sqrt', 'log2']
}

param_grid_gb = {
    'estimator__n_estimators': [100, 200, 300, 500, 1000],
    'estimator__max_depth': [3, 5, 7, 9, 12],
    'estimator__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3]
}

# Initialize the models
rf_model = MultiOutputRegressor(RandomForestRegressor(random_state=42))
gb_model = MultiOutputRegressor(GradientBoostingRegressor(random_state=42))


In [15]:
# Cell 9: Hyperparameter tuning using RandomizedSearchCV
rf_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_grid_rf, n_iter=10, cv=3, n_jobs=-1, random_state=42, verbose=2)
gb_search = RandomizedSearchCV(estimator=gb_model, param_distributions=param_grid_gb, n_iter=10, cv=3, n_jobs=-1, random_state=42, verbose=2)

# Load training data from disk
train_df = pd.read_csv('train_df.csv')
X_train = train_df.iloc[:, :-5]
y_train = train_df.iloc[:, -5:].values  # Ensure y_train is a 2D array

print("Starting RandomizedSearchCV for RandomForestRegressor...")
rf_search.fit(X_train, y_train)
print("RandomForestRegressor best parameters:", rf_search.best_params_)

print("Starting RandomizedSearchCV for GradientBoostingRegressor...")
gb_search.fit(X_train, y_train)
print("GradientBoostingRegressor best parameters:", gb_search.best_params_)


Starting RandomizedSearchCV for RandomForestRegressor...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


  pid = os.fork()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


[CV] END estimator__max_depth=30, estimator__max_features=sqrt, estimator__min_samples_leaf=4, estimator__min_samples_split=5, estimator__n_estimators=300; total time=  10.8s
[CV] END estimator__max_depth=30, estimator__max_features=sqrt, estimator__min_samples_leaf=4, estimator__min_samples_split=5, estimator__n_estimators=300; total time=  10.8s
[CV] END estimator__max_depth=30, estimator__max_features=sqrt, estimator__min_samples_leaf=4, estimator__min_samples_split=5, estimator__n_estimators=300; total time=  10.9s
[CV] END estimator__max_depth=10, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=10, estimator__n_estimators=500; total time=  11.6s
[CV] END estimator__max_depth=10, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=10, estimator__n_estimators=500; total time=  11.6s
[CV] END estimator__max_depth=10, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split

In [16]:
# Cell 10: Update models with best parameters and fit MultiOutputRegressor
rf_model = rf_search.best_estimator_
gb_model = gb_search.best_estimator_

# Initialize MultiOutputRegressor for each base model with best parameters
rf_multioutput = rf_model
gb_multioutput = gb_model

# Fit the MultiOutputRegressor models
print("Fitting MultiOutputRegressor for RandomForestRegressor...")
rf_multioutput.fit(X_train, y_train)

print("Fitting MultiOutputRegressor for GradientBoostingRegressor...")
gb_multioutput.fit(X_train, y_train)


Fitting MultiOutputRegressor for RandomForestRegressor...
Fitting MultiOutputRegressor for GradientBoostingRegressor...


In [17]:
# Cell 11: Define Custom Stacking Regressor for MultiOutput
class CustomMultiOutputStackingRegressor:
    def __init__(self, estimators, final_estimator, cv=5, n_jobs=None):
        self.estimators = estimators
        self.final_estimator = final_estimator
        self.cv = cv
        self.n_jobs = n_jobs
        self.multi_output_estimators = [MultiOutputRegressor(estimator) for name, estimator in estimators]

    def fit(self, X, y):
        self.multi_output_estimators_ = [estimator.fit(X, y) for estimator in self.multi_output_estimators]
        meta_features = np.column_stack([estimator.predict(X) for estimator in self.multi_output_estimators_])
        self.final_estimator_ = MultiOutputRegressor(self.final_estimator).fit(meta_features, y)
        return self

    def predict(self, X):
        meta_features = np.column_stack([estimator.predict(X) for estimator in self.multi_output_estimators_])
        return self.final_estimator_.predict(meta_features)


In [18]:
# Cell 12: Define and fit the stacking regressor
stacking_regressor = CustomMultiOutputStackingRegressor(
    estimators=[
        ('rf', rf_model.estimator),
        ('gb', gb_model.estimator)
    ],
    final_estimator=RidgeCV(),
    cv=5,
    n_jobs=-1
)

stacking_regressor.fit(X_train, y_train)


<__main__.CustomMultiOutputStackingRegressor at 0x7f84b1787340>

In [19]:
# Cell 13: Evaluate the stacking regressor
X_val = val_df.iloc[:, :-5]
y_val = val_df.iloc[:, -5:].values  # Ensure y_val is a 2D array

# Ensure the stacking_regressor is fitted
if not hasattr(stacking_regressor, 'final_estimator_'):
    print("Stacking regressor is not fitted. Fit the model before prediction.")
else:
    val_predictions = stacking_regressor.predict(X_val)

    val_mse = mean_squared_error(y_val, val_predictions, multioutput='uniform_average')
    val_r2 = r2_score(y_val, val_predictions, multioutput='uniform_average')
    val_mae = mean_absolute_error(y_val, val_predictions, multioutput='uniform_average')
    print(f'Stacking Regressor Validation MSE: {val_mse}')
    print(f'Stacking Regressor Validation R2: {val_r2}')
    print(f'Stacking Regressor Validation MAE: {val_mae}')

    # Save the trained stacking regressor
    joblib.dump(stacking_regressor, 'ensemblev2.pkl')
    print("Trained stacking regressor model saved as 'ensemblev2'.")




Stacking Regressor Validation MSE: 0.7589455896724522
Stacking Regressor Validation R2: 0.3691344890493595
Stacking Regressor Validation MAE: 0.610267811175733
Trained stacking regressor model saved as 'ensemblev2'.


In [22]:
# Cell 14: Pseudo-labeling logic with iterative threshold approach
pseudo_labels_threshold = 0.9  # Define a threshold for confident predictions

def pseudo_labeling(unlabeled_features, model, threshold):
    pseudo_labels = model.predict(unlabeled_features)
    confidences = np.max(pseudo_labels, axis=1)
    high_confidence_indices = np.where(confidences >= threshold)[0]
    return pseudo_labels[high_confidence_indices], high_confidence_indices

# Apply pseudo-labeling
pseudo_labels, high_confidence_indices = pseudo_labeling(unlabeled_features_df.values, stacking_regressor, pseudo_labels_threshold)

# Create a DataFrame with high confidence pseudo-labeled data
pseudo_labeled_df = unlabeled_df.iloc[high_confidence_indices].copy()
pseudo_labeled_df[['CM', 'AOPV', 'CDACT', 'TI', 'TTBF']] = pseudo_labels

# Combine pseudo-labeled data with original labeled data
combined_df = pd.concat([labeled_features_df, pseudo_labeled_df])
X_combined = combined_df.iloc[:, :-5].values
y_combined = combined_df.iloc[:, -5:].values

# Check for and handle NaN values
print("Checking for NaN values in X_combined and y_combined...")
if np.isnan(X_combined).any() or np.isnan(y_combined).any():
    print("NaN values detected. Handling missing values...")
    # Impute missing values with the mean of the column
    from sklearn.impute import SimpleImputer
    imputer_X = SimpleImputer(strategy='mean')
    imputer_y = SimpleImputer(strategy='mean')
    X_combined = imputer_X.fit_transform(X_combined)
    y_combined = imputer_y.fit_transform(y_combined)

# Re-train the model with the combined dataset
stacking_regressor.fit(X_combined, y_combined)

# Evaluate the re-trained model
val_predictions = stacking_regressor.predict(X_val)

val_mse = mean_squared_error(y_val, val_predictions, multioutput='uniform_average')
val_r2 = r2_score(y_val, val_predictions, multioutput='uniform_average')
val_mae = mean_absolute_error(y_val, val_predictions, multioutput='uniform_average')
print(f'Updated Stacking Regressor Validation MSE: {val_mse}')
print(f'Updated Stacking Regressor Validation R2: {val_r2}')
print(f'Updated Stacking Regressor Validation MAE: {val_mae}')

# Save the re-trained stacking regressor
joblib.dump(stacking_regressor, 'ensemblev2_retrained.pkl')
print("Re-trained stacking regressor model saved as 'ensemblev2_retrained'.")




Checking for NaN values in X_combined and y_combined...
NaN values detected. Handling missing values...


ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'critic declining standards national health service died given large overdose iron hospital doctor read instructions drugs label properly carys pugh 63 former president patients association wales taken casualty royal glamorgan hospital blunder turned skin brown saturated liver iron fought survival hospital seven weeks mrs pugh suffered heart attack contracted deep vein thrombosis legs chest infection ecoli finally suffered second heart attack killed httpwwwtelegraphcouknewsmain26ixhomehtml'

In [None]:
# Function to predict hate categories for new messages
def predict_hate_categories(model, messages, tokenizer, distilbert_model, tfidf_vectorizer):
    tfidf_features = tfidf_vectorizer.transform(messages).toarray()
    bert_features = get_bert_features(messages, tokenizer, distilbert_model)
    combined_features = np.hstack([tfidf_features, bert_features])
    predictions = model.predict(combined_features)
    return predictions

# Example usage
new_messages = ["I hate people"]
predictions = predict_hate_categories(stacking_regressor, new_messages, tokenizer, distilbert_model, tfidf_vectorizer)
print("Predictions for new messages:", predictions)

# Print predicted vs actual labels for a sample from the validation set
sample_index = np.random.choice(len(X_val), 1)[0]
sample_message = val_df.iloc[sample_index, :-5]
sample_actual = y_val[sample_index]
sample_predicted = val_predictions[sample_index]

print(f"\nValidation Example:\nMessage: {labeled_df.iloc[sample_index]['Message']}")
print(f"Predicted Labels: CM={sample_predicted[0]}, AOPV={sample_predicted[1]}, CDACT={sample_predicted[2]}, TI={sample_predicted[3]}, TTBF={sample_predicted[4]}")
print(f"Actual Labels: CM={sample_actual[0]}, AOPV={sample_actual[1]}, CDACT={sample_actual[2]}, TI={sample_actual[3]}, TTBF={sample_actual[4]}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Define target names
target_names = ['CM', 'AOPV', 'CDACT', 'TI', 'TTBF']

# Function to plot actual vs predicted values
def plot_actual_vs_predicted(y_actual, y_predicted, target_names, title):
    fig, axs = plt.subplots(1, y_actual.shape[1], figsize=(20, 5))
    fig.suptitle(title)
    for i in range(y_actual.shape[1]):
        sns.scatterplot(x=y_actual[:, i], y=y_predicted[:, i], ax=axs[i])
        axs[i].set_xlabel('Actual')
        axs[i].set_ylabel('Predicted')
        axs[i].set_title(f'{target_names[i]}')
    plt.show()

# Function to plot residuals
def plot_residuals(y_actual, y_predicted, target_names, title):
    fig, axs = plt.subplots(1, y_actual.shape[1], figsize=(20, 5))
    fig.suptitle(title)
    for i in range(y_actual.shape[1]):
        residuals = y_actual[:, i] - y_predicted[:, i]
        sns.histplot(residuals, kde=True, ax=axs[i])
        axs[i].set_xlabel('Residuals')
        axs[i].set_title(f'{target_names[i]}')
    plt.show()

# Predict on validation set
val_predictions = stacking_regressor.predict(X_val)

# Visualize actual vs predicted values
plot_actual_vs_predicted(y_val, val_predictions, target_names, 'Actual vs Predicted Values')

# Visualize residuals
plot_residuals(y_val, val_predictions, target_names, 'Residuals Distribution')
