In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import RandomizedSearchCV
import torch
import joblib
import time
import random
from tqdm import tqdm

# Set seed for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load the labeled dataset
labeled_df = pd.read_csv('Stormfront_labeled.csv')
print("Labeled dataset loaded. Shape:", labeled_df.shape)

# Extract relevant columns
labeled_df = labeled_df[['Message', 'CM', 'AOPV', 'CDACT', 'TI', 'TTBF']]
print("Relevant columns from labeled dataset extracted. Shape:", labeled_df.shape)

# Load the unlabeled dataset in chunks and concatenate the required number of rows
chunk_size = 100000  # Adjust chunk size as needed
target_unlabeled_size = 50000  # Reduced target size for testing
chunks = []

for chunk in pd.read_csv('cleanposts.csv', chunksize=chunk_size):
    # Filter out non-English messages
    chunk = chunk[chunk['lang'] == 'en']
    chunks.append(chunk[['cleanmessage']].rename(columns={'cleanmessage': 'Message'}))
    
    # Check if we have reached the target size
    if sum(len(c) for c in chunks) >= target_unlabeled_size:
        break

unlabeled_df = pd.concat(chunks).head(target_unlabeled_size)
print("Unlabeled dataset loaded and sampled. Shape:", unlabeled_df.shape)


Labeled dataset loaded. Shape: (491, 42)
Relevant columns from labeled dataset extracted. Shape: (491, 6)
Unlabeled dataset loaded and sampled. Shape: (50000, 1)


In [15]:
from tqdm import tqdm

# Initialize tokenizer and model for feature extraction
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = AutoModel.from_pretrained('distilbert-base-uncased')

def get_features(texts, tokenizer, model, batch_size=64):
    features = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Extracting Features"):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=256)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_features = outputs.last_hidden_state.mean(dim=1).numpy()
        features.append(batch_features)
    return np.vstack(features)

# Extract features for labeled and unlabeled data in batches
labeled_features = get_features(labeled_df['Message'].tolist(), tokenizer, distilbert_model)
unlabeled_features = get_features(unlabeled_df['Message'].tolist(), tokenizer, distilbert_model)

# Convert to DataFrame for compatibility with model training
labeled_features_df = pd.DataFrame(labeled_features)
unlabeled_features_df = pd.DataFrame(unlabeled_features)

# Add target columns to the labeled features DataFrame
labeled_features_df[['CM', 'AOPV', 'CDACT', 'TI', 'TTBF']] = labeled_df[['CM', 'AOPV', 'CDACT', 'TI', 'TTBF']]
# Split the labeled data into training and validation sets
train_df, val_df = train_test_split(labeled_features_df, test_size=0.2, random_state=42)
print(f"Training data shape: {train_df.shape}, Validation data shape: {val_df.shape}")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Extracting Features: 100%|██████████| 8/8 [00:51<00:00,  6.47s/it]
Extracting Features: 100%|██████████| 782/782 [1:37:40<00:00,  7.49s/it]


Training data shape: (392, 773), Validation data shape: (99, 773)


In [16]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the model
model = RandomForestRegressor(random_state=42)
print("RandomForestRegressor model initialized.")

# Define the parameter grid for RandomForestRegressor
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Initialize RandomizedSearchCV with fewer iterations
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=70, cv=3, n_jobs=1, verbose=2, random_state=42)
print("RandomizedSearchCV initialized with parameter grid.")

# Perform random search on training data
X_train = train_df.iloc[:, :-5]
y_train = train_df.iloc[:, -5:]
random_search.fit(X_train, y_train)
print("Random search completed. Best parameters:", random_search.best_params_)

# Update the model with best parameters
model = random_search.best_estimator_
print("Model updated with best parameters from random search.")


RandomForestRegressor model initialized.
RandomizedSearchCV initialized with parameter grid.
Fitting 3 folds for each of 70 candidates, totalling 210 fits
[CV] END max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=   0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=   0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=   0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.9s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.9s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.9s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=   0.9s
[CV] 

In [17]:
def self_training(model, labeled_data, unlabeled_data, max_iter=10, threshold=0.99):
    for i in range(max_iter):
        print(f"Iteration {i+1} of self-training")
        
        start_time = time.time()
        
        # Train the model on the labeled data
        X_labeled = labeled_data.iloc[:, :-5]
        y_labeled = labeled_data.iloc[:, -5:]
        model.fit(X_labeled, y_labeled)
        print("Model trained on labeled data.")
        
        # Predict on the unlabeled data
        if unlabeled_data.shape[0] == 0:
            print("No more unlabeled data to process.")
            break

        pseudo_labels = model.predict(unlabeled_data)
        print(f"Pseudo-labels predicted for {unlabeled_data.shape[0]} unlabeled samples.")
        
        # Select the pseudo-labeled data with high confidence
        high_confidence_mask = (pseudo_labels >= threshold).all(axis=1)
        high_confidence_pseudo_labels = pseudo_labels[high_confidence_mask]
        high_confidence_unlabeled = unlabeled_data[high_confidence_mask]
        
        if high_confidence_unlabeled.shape[0] == 0:
            print("No high-confidence pseudo-labeled data found.")
            break
        
        # Add the high-confidence pseudo-labeled data to the labeled dataset
        pseudo_labeled_data = high_confidence_unlabeled.copy()
        pseudo_labeled_data[['CM', 'AOPV', 'CDACT', 'TI', 'TTBF']] = high_confidence_pseudo_labels
        labeled_data = pd.concat([labeled_data, pseudo_labeled_data])
        print(f"Added {high_confidence_pseudo_labels.shape[0]} high-confidence pseudo-labeled samples to labeled data.")
        print(f"New size of labeled data: {labeled_data.shape[0]}")
        
        # Remove the high-confidence pseudo-labeled data from the unlabeled dataset
        unlabeled_data = unlabeled_data[~high_confidence_mask]
        print(f"Remaining unlabeled data: {unlabeled_data.shape[0]}")
        
        end_time = time.time()
        iteration_time = end_time - start_time
        print(f"Time taken for iteration {i+1}: {iteration_time:.2f} seconds")
        
    return model

# Perform self-training
trained_model = self_training(model, train_df, unlabeled_features_df)


Iteration 1 of self-training
Model trained on labeled data.
Pseudo-labels predicted for 50000 unlabeled samples.
Added 50000 high-confidence pseudo-labeled samples to labeled data.
New size of labeled data: 50392
Remaining unlabeled data: 0
Time taken for iteration 1: 5.43 seconds
Iteration 2 of self-training
Model trained on labeled data.
No more unlabeled data to process.


In [18]:
# Evaluate the model
X_val = val_df.iloc[:, :-5]
y_val = val_df.iloc[:, -5:]
val_predictions = trained_model.predict(X_val)

val_mse = mean_squared_error(y_val, val_predictions)
val_r2 = r2_score(y_val, val_predictions)
val_mae = mean_absolute_error(y_val, val_predictions)
print(f'Validation MSE: {val_mse}')
print(f'Validation R2: {val_r2}')
print(f'Validation MAE: {val_mae}')

# Save the trained model
joblib.dump(trained_model, 'pretrain_model.pkl')
print("Trained model saved as 'pretrain_model.pkl'.")


Validation MSE: 0.84731870364731
Validation R2: 0.27168180978296697
Validation MAE: 0.697058545950409
Trained model saved as 'pretrain_model.pkl'.


In [19]:
# Function to predict hate categories for new messages
def predict_hate_categories(trained_model, messages, tokenizer, distilbert_model):
    message_features = get_features(messages, tokenizer, distilbert_model)
    predictions = trained_model.predict(message_features)
    return predictions

# Example usage
new_messages = ["I hate people"]
predictions = predict_hate_categories(trained_model, new_messages, tokenizer, distilbert_model)
print("Predictions for new messages:", predictions)


Extracting Features: 100%|██████████| 1/1 [00:00<00:00,  5.96it/s]

Predictions for new messages: [[2.01394519 1.79953333 3.91033963 2.74979685 1.78728537]]





In [20]:
# Print predicted vs actual labels for a sample from the validation set
sample_index = np.random.choice(len(X_val), 1)[0]
sample_message = val_df.iloc[sample_index, :-5]
sample_actual = y_val.iloc[sample_index]
sample_predicted = val_predictions[sample_index]

print(f"\nValidation Example:\nMessage: {labeled_df.iloc[sample_index]['Message']}")
print(f"Predicted Labels: CM={sample_predicted[0]}, AOPV={sample_predicted[1]}, CDACT={sample_predicted[2]}, TI={sample_predicted[3]}, TTBF={sample_predicted[4]}")
print(f"Actual Labels: CM={sample_actual['CM']}, AOPV={sample_actual['AOPV']}, CDACT={sample_actual['CDACT']}, TI={sample_actual['TI']}, TTBF={sample_actual['TTBF']}")



Validation Example:
Message: Boy, wait till the nig-nogs get off work and pass the word to their dads.  
 
Sherman will blush at the way ATL burns this round.
Predicted Labels: CM=2.291321111104616, AOPV=1.7392851852121831, CDACT=4.1120148148013085, TI=3.131229814840586, TTBF=1.7796501851939706
Actual Labels: CM=4.0, AOPV=1.0, CDACT=4.0, TI=2.333333333, TTBF=1.666666667
