In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.stats import pearsonr
from transformers import BertTokenizer, BertModel
import torch

data = pd.read_csv('/content/MTurk_human_suspicion_survey_all.csv')

data = data.dropna(subset=['Input.text', 'Answer.judgement.label'])


# Convert to DataFrame
df = data[['Input.text', 'Answer.judgement.label']]
df.columns = ['text', 'suspicious']

# Initialize BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Function to get BERT embeddings
def get_bert_embeddings(texts):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        # Using the [CLS] token's embedding as representation
        cls_embedding = outputs.last_hidden_state[:, 0, :].numpy().flatten()
        embeddings.append(cls_embedding)
    return np.array(embeddings)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [4]:
# Get BERT embeddings for the dataset
X = get_bert_embeddings(df['text'].values)
y = df['suspicious'].values

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [5]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best estimator
best_clf = grid_search.best_estimator_

In [6]:
# Predictions and evaluation
y_pred_proba = best_clf.predict_proba(X_test)  # Predicted probabilities for each class

# Assuming y_test contains strings that need to be mapped to numerical labels
# Create a mapping of unique labels to numerical values
unique_labels = np.unique(y_test)
label_mapping = {label: i for i, label in enumerate(unique_labels)}

# Convert y_test to numerical labels using the mapping
y_test_numerical = np.array([label_mapping[label] for label in y_test])

# Ensure y_test values are within the valid range of class indices
y_test_adjusted = np.clip(y_test_numerical, 0, y_pred_proba.shape[1] - 1)
true_class_proba = [y_pred_proba[i][y_test_adjusted[i]] for i in range(len(y_test))]

# Calculate overall Pearson's correlation
correlation, _ = pearsonr(true_class_proba, y_test_numerical)  # Use numerical labels for correlation

# Results
accuracy = accuracy_score(y_test, clf.predict(X_test))
conf_matrix = confusion_matrix(y_test, clf.predict(X_test))

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Overall Pearson's Correlation:", correlation)

Accuracy: 0.3537477148080439
Confusion Matrix:
 [[ 31  95   3   6  62]
 [ 31 184  12   5 155]
 [  6  30   3   3  25]
 [  7  39   2   7  27]
 [ 25 151  16   7 162]]
Overall Pearson's Correlation: 0.12324158005158717
