In [13]:
# General Libraries
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import time
import toml
import numpy as np

# Hugging Face Transformers and PyTorch
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import autocast

# Sklearn for Model Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample


In [17]:

#secrets = toml.load(r'C:\Users\aless\Desktop\final project\Final-Project---Luis-Augustin-Ale\.streamlit\secrets.toml')
df = pd.read_csv("hf://datasets/PrkhrAwsti/Twitter_Sentiment_3M/twitter_dataset.csv")

In [18]:
# take off nan, duplicates, 0 lenght tweets, tweets with neutral sentiment (2), convert sentiment value to integer
df=df.dropna()
df=df.drop_duplicates() 
df['tweet_length'] = df['tweet'].apply(lambda x: len(x.split()))
df = df[df['tweet_length'] > 0]
df = df[df['sentiment'] != 2]
df['sentiment'] = df['sentiment'].astype(int)


In [19]:
df.describe()

Unnamed: 0.1,Unnamed: 0,sentiment,tweet_length
count,3131484.0,3131484.0,3131484.0
mean,1565770.0,0.4986304,13.2885
std,904019.9,0.4999982,6.983048
min,0.0,0.0,1.0
25%,782870.8,0.0,7.0
50%,1565742.0,0.0,12.0
75%,2348648.0,1.0,19.0
max,3138702.0,1.0,64.0


In [20]:
print(df['sentiment'].value_counts()) 

sentiment
0    1570031
1    1561453
Name: count, dtype: int64


In [22]:
X = df['tweet']
y = df['sentiment']

# Train-Validation-Test split (70% train, 15% validation, 15% test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Output the sizes of each dataset
print(f"Train size: {len(X_train)}, Validation size: {len(X_val)}, Test size: {len(X_test)}")


Train size: 2192038, Validation size: 469723, Test size: 469723


In [23]:
# Load the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')



In [24]:
# tokenize tweets
def tokenize_tweets(tweets):
    return tokenizer(list(tweets), padding=True, truncation=True, max_length=128, return_tensors="pt")

# Tokenize each dataset
train_encodings = tokenize_tweets(X_train)
val_encodings = tokenize_tweets(X_val)
test_encodings = tokenize_tweets(X_test)

# Check sample tokenized data
print("Sample tokenized input IDs from the train set:", train_encodings['input_ids'][0])


MemoryError: 

In [None]:
# Saving tokenized data as PyTorch tensors
torch.save((train_encodings, torch.tensor(y_train.values)), 'train_encodings.pt')
torch.save((val_encodings, torch.tensor(y_val.values)), 'val_encodings.pt')
torch.save((test_encodings, torch.tensor(y_test.values)), 'test_encodings.pt')

print("Tokenized data saved to .pt files.")


## after model training, load the model to evaluate 

In [None]:
# Load the pre-trained RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

In [5]:
# Modify the classifier to match the saved model structure using a Sequential layer.
model.classifier = torch.nn.Sequential(
    torch.nn.Dropout(0.3),  # Dropout as per the trained model
    torch.nn.Linear(model.config.hidden_size, 2)  # Linear layer for binary classification
)

In [26]:
# Load the fine-tuned model weights
model.load_state_dict(torch.load(r'C:\Users\Agustín\Desktop\4Geeks\Clases\30. Model\2nd_model.pth',map_location=torch.device('cpu')))

  model.load_state_dict(torch.load(r'C:\Users\Agustín\Desktop\4Geeks\Clases\30. Model\2nd_model.pth',map_location=torch.device('cpu')))


<All keys matched successfully>

In [27]:
# Set the model to evaluation mode
model.eval()

print("Model loaded and set to evaluation mode.")

Model loaded and set to evaluation mode.


In [28]:
# Define the device used to process data
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the device (GPU or CPU)
model.to(device)


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [29]:
# Define the batch size
batch_size = 512 #trying to maximize vram usage


In [None]:
# Load the tokenized validation dataset (make sure the path is correct)
val_encodings, val_labels = torch.load(r'C:/Users/aless/Desktop/final project/Final-Project---Luis-Augustin-Ale/notebooks/val_encodings.pt')


In [None]:

# Create the validation DataLoader
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=8)  # adding workers to imrpove cpu usage to avoid bottleneck (8 core = testing with 8 workers )


In [None]:
import time
from torch.cuda.amp import autocast  # Import for mixed precision

# Start timer for inference
start_time = time.time()

# Initialize lists to store predictions and true labels
all_preds = []
all_labels = []
y_proba = []  # Store probabilities for ROC-AUC

# Set model to no_grad mode for inference
with torch.no_grad():
    # Loop through validation DataLoader
    for step, batch in enumerate(val_loader):
        # Move batch to the device (GPU or CPU)
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        # Enable mixed precision for faster computation
        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            # Use only the logits for the [CLS] token (first token) for classification
            cls_logits = logits[:, 0, :]  # Extract the [CLS] token logits for each sequence
            
            # Predicted classes from [CLS] token
            predictions = torch.argmax(cls_logits, dim=-1)
            
            # Probabilities for ROC-AUC from [CLS] token
            probabilities = torch.softmax(cls_logits, dim=-1)[:, 1]  # Probabilities for class 1

        # Store predictions and true labels
        all_preds.extend(predictions.cpu().numpy())  # Store binary predictions for each sequence
        all_labels.extend(labels.cpu().numpy())  # Store actual labels
        y_proba.extend(probabilities.cpu().numpy())  # Store probabilities for ROC-AUC

# Total time taken for inference
total_time = time.time() - start_time
print(f"Total inference time: {total_time:.2f} seconds")


In [None]:
# Check the shape and some example values
print(f"Predictions (all_preds): {all_preds[:5]}")  # Print first 5 predictions
print(f"Labels (all_labels): {all_labels[:5]}")  # Print first 5 labels


In [40]:
# Flatten predictions (ensure it's a 1D array)
all_preds_flat = np.concatenate([pred.flatten() for pred in all_preds])

In [None]:
# Check shapes and values
print(f"Flattened Predictions (all_preds_flat): {all_preds_flat[:5]}")
print(f"Labels (all_labels): {all_labels[:5]}")

In [None]:
# Print a few predictions and their corresponding labels
print("Predictions sample:", all_preds[:10])
print("True labels sample:", all_labels[:10])

# Check distribution of predictions (if the model is predicting only one class)
unique_preds, counts_preds = np.unique(all_preds, return_counts=True)
pred_class_distribution = dict(zip(unique_preds, counts_preds))
print(f"Predicted Class Distribution: {pred_class_distribution}")


In [None]:
# Check the distribution of predictions
unique_preds, counts_preds = np.unique(all_preds, return_counts=True)
predictions_distribution = dict(zip(unique_preds, counts_preds))

print(f"Predictions Distribution: {predictions_distribution}")


In [None]:
# Generate classification report 
print("Classification Report:\n", classification_report(all_labels, all_preds))


In [None]:
# Confusion Matrix
cm = confusion_matrix(all_labels, all_preds)

# Plotting the confusion matrix
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.title("Confusion Matrix")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.show()

In [None]:
# Calculate and print the ROC-AUC score
from sklearn.metrics import roc_auc_score

# Ensure you have the probabilities (y_proba) from the model inference step
roc_score = roc_auc_score(all_labels, y_proba)
print(f"ROC-AUC Score: {roc_score:.4f}")


In [None]:
accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
# Precision, recall, and F1-score for the positive class (class 1)
from sklearn.metrics import precision_score, recall_score, f1_score

precision_class_1 = precision_score(all_labels, all_preds, pos_label=1)
recall_class_1 = recall_score(all_labels, all_preds, pos_label=1)
f1_class_1 = f1_score(all_labels, all_preds, pos_label=1)

print(f"Precision (Positive Class): {precision_class_1:.4f}")
print(f"Recall (Positive Class): {recall_class_1:.4f}")
print(f"F1-Score (Positive Class): {f1_class_1:.4f}")


In [None]:
# Generate F1-score for both classes
from sklearn.metrics import f1_score

f1 = f1_score(all_labels, all_preds, average="weighted")
print(f"F1-Score (Weighted): {f1:.4f}")


In [None]:
# Initialize the tokenizer for the RoBERTa model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def predict_sentiment(phrase):
    # Tokenize the input phrase and prepare it for the model
    inputs = tokenizer(phrase, padding=True, truncation=True, max_length=128, return_tensors="pt")

    # Move inputs to the same device as the model (e.g., GPU or CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Put the model in evaluation mode
    model.eval()

    # Disable gradient calculations for inference
    with torch.no_grad():
        # Forward pass to get model outputs
        outputs = model(**inputs)
        logits = outputs.logits

    # Squeeze the logits to remove extra dimensions (if batch size is 1)
    logits = logits.squeeze()

    # Extract the predicted class (0 for negative, 1 for positive)
    predicted_class = torch.argmax(logits).item()

    # Map the class to a label
    if predicted_class == 1:
        return "Positive"
    else:
        return "Negative"



In [None]:
input_phrase = "had me tea now put me feet up and chill out for a while me thinks "       # IMPUT PHRASE HERE!
result = predict_sentiment(input_phrase)
print(f"Sentiment: {result}")