# Verify Python and Jupyter Installation

This notebook will guide you through verifying your Python and Jupyter installation.

In [20]:
# Reset kernel state
from IPython.display import clear_output
clear_output(wait=True)

In [21]:
import transformers
import accelerate
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")

Transformers version: 4.51.3
Accelerate version: 1.6.0


In [22]:
from transformers import Trainer
from accelerate import Accelerator
print("Imports successful!")

Imports successful!


## Verify Python Installation

Use the `!python --version` command to check the installed Python version.

In [23]:
# Verify Python Installation
!python --version

Python 3.13.3


## Verify Jupyter Installation

Use the `!jupyter --version` command to check the installed Jupyter version.

In [24]:
# Verify Jupyter Installation
!jupyter --version

Selected Jupyter core packages...
IPython          : 9.2.0
ipykernel        : 6.29.5
ipywidgets       : not installed
jupyter_client   : 8.6.3
jupyter_core     : 5.7.2
jupyter_server   : not installed
jupyterlab       : not installed
nbclient         : not installed
nbconvert        : not installed
nbformat         : not installed
notebook         : not installed
qtconsole        : not installed
traitlets        : 5.14.3


## Run a Test Python Script in Jupyter

Write and execute a simple Python script, such as printing 'Hello, Jupyter!', to ensure everything is working correctly.

In [25]:
# Run a Test Python Script
print("Hello, Jupyter!")

Hello, Jupyter!


In [26]:
# Add a delay to prevent resource contention
import time
time.sleep(2)

# Sentiment Analysis Model Training

This notebook will guide you through the process of training a sentiment analysis model using Python and scikit-learn.

## Step 1: Import Required Libraries

We will use libraries like pandas, scikit-learn, and joblib for data handling, model training, and saving the trained model.

In [27]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import joblib

## Step 2: Load and Explore the Dataset

Load the dataset containing text and labels for sentiment analysis. Ensure the dataset has 'text' and 'label' columns.

In [28]:
# Load the dataset with the correct path
data = pd.read_csv('backend/training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)
# Rename dataset columns to match expected names
data.columns = ['label', 'id', 'date', 'query', 'user', 'text']
print(data.head())

   label          id                          date     query             user  \
0      0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  _TheSpecialOne_   
1      0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY    scotthamilton   
2      0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY         mattycus   
3      0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          ElleCTF   
4      0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY           Karoli   

                                                text  
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1  is upset that he can't update his Facebook by ...  
2  @Kenichan I dived many times for the ball. Man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  


## Step 2.1: Inspect Dataset Columns

Check the column names in the dataset to identify the correct columns for text and labels.

In [29]:
# Inspect dataset columns
print(data.columns)

Index(['label', 'id', 'date', 'query', 'user', 'text'], dtype='object')


## Step 2.2: Rename Dataset Columns

Rename the dataset columns to make them more meaningful and use the correct columns for text and labels.

In [30]:
# Rename dataset columns
data.columns = ['label', 'id', 'date', 'query', 'user', 'text']
print(data.head())

   label          id                          date     query             user  \
0      0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  _TheSpecialOne_   
1      0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY    scotthamilton   
2      0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY         mattycus   
3      0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          ElleCTF   
4      0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY           Karoli   

                                                text  
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1  is upset that he can't update his Facebook by ...  
2  @Kenichan I dived many times for the ball. Man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  


## Step 2.3: Use Correct Columns for Text and Labels

Update the code to use the renamed columns for text and labels.

In [31]:
# Use the renamed columns
X = data['text']
y = data['label']

## Step 3: Preprocess the Data

Convert text data into numerical features using a PyTorch-based tokenizer.

In [32]:
# Preprocess the data using DistilBERT tokenizer
import torch
import time
from transformers import DistilBertTokenizer

# Check for GPU availability and configure CUDA for better performance
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.is_available():
    # Set CUDA optimizations
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    
    # Print CUDA device information
    print(f"CUDA Device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"CUDA Capability: {torch.cuda.get_device_capability()}")

print(f"Using device for tokenization: {device}")

# Load pre-trained DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Measure tokenization time
start_time = time.time()

# Sample the data to make it more manageable
sample_size = min(100000, len(data))
sampled_data = data.sample(n=sample_size, random_state=42)
print(f"Processing {sample_size} samples out of {len(data)} total samples")

# Tokenize the dataset
X = sampled_data['text'].tolist()
y = sampled_data['label'].tolist()

# Set batch size for processing
batch_size = 1024
num_batches = (len(X) + batch_size - 1) // batch_size

print("Starting tokenization in batches...")
all_input_ids = []
all_attention_mask = []

# Process in batches for better GPU utilization
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(X))
    batch = X[start_idx:end_idx]
    
    # Display progress every 10 batches
    if i % 10 == 0:
        print(f"Processing batch {i+1}/{num_batches} ({(i+1)/num_batches*100:.1f}%)")
    
    # Tokenize batch
    batch_encodings = tokenizer(batch, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
    
    # Move to GPU if available (for faster operations)
    batch_encodings = {k: v.to(device) for k, v in batch_encodings.items()}
    
    # Store results
    all_input_ids.append(batch_encodings['input_ids'].cpu())  # Move back to CPU for storage
    all_attention_mask.append(batch_encodings['attention_mask'].cpu())

# Combine batches
encodings = {
    'input_ids': torch.cat(all_input_ids, dim=0),
    'attention_mask': torch.cat(all_attention_mask, dim=0)
}
labels = torch.tensor(y)

# Move final tensors to GPU for downstream processing
encodings = {k: v.to(device) for k, v in encodings.items()}
labels = labels.to(device)

end_time = time.time()
print(f"Tokenization completed in {end_time - start_time:.2f} seconds")
print(f"Encodings shape: {encodings['input_ids'].shape}")
print(f"Labels shape: {labels.shape}")

# Calculate percentage of GPU memory used
if torch.cuda.is_available():
    torch.cuda.synchronize()
    current_memory = torch.cuda.memory_allocated() / 1e9  # GB
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9  # GB
    print(f"GPU memory used: {current_memory:.2f} GB ({current_memory/total_memory*100:.1f}% of total)")

Using device for tokenization: cpu
Processing 100000 samples out of 1600000 total samples
Starting tokenization in batches...
Processing batch 1/98 (1.0%)
Processing 100000 samples out of 1600000 total samples
Starting tokenization in batches...
Processing batch 1/98 (1.0%)
Processing batch 11/98 (11.2%)
Processing batch 11/98 (11.2%)
Processing batch 21/98 (21.4%)
Processing batch 21/98 (21.4%)
Processing batch 31/98 (31.6%)
Processing batch 31/98 (31.6%)
Processing batch 41/98 (41.8%)
Processing batch 41/98 (41.8%)
Processing batch 51/98 (52.0%)
Processing batch 51/98 (52.0%)
Processing batch 61/98 (62.2%)
Processing batch 61/98 (62.2%)
Processing batch 71/98 (72.4%)
Processing batch 71/98 (72.4%)
Processing batch 81/98 (82.7%)
Processing batch 81/98 (82.7%)
Processing batch 91/98 (92.9%)
Processing batch 91/98 (92.9%)
Tokenization completed in 23.96 seconds
Encodings shape: torch.Size([100000, 512])
Labels shape: torch.Size([100000])
Tokenization completed in 23.96 seconds
Encodings

## Step 4: Split the Data

Split the dataset into training and testing sets using PyTorch tensors.

In [33]:
# Split the data
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(encodings['input_ids'], labels, test_size=0.2, random_state=42)

# Move data to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
train_texts, val_texts = train_texts.to(device), val_texts.to(device)
train_labels, val_labels = train_labels.to(device), val_labels.to(device)

# Ensure input tensors are converted to FloatTensor after moving to the device
train_texts = train_texts.to(device).float()
val_texts = val_texts.to(device).float()

## Step 5: Train the Model

Train a PyTorch-based model on the training data.

In [34]:
# Define a simple PyTorch model with a more robust architecture
import torch.nn as nn
import torch.optim as optim
import torch

# Force CPU-only mode to avoid CUDA errors completely
print("WARNING: Due to CUDA issues, we're using CPU-only mode for this training session")
torch.cuda.is_available = lambda: False  # Override CUDA availability check for this session
device = torch.device('cpu')  # Force CPU device

# Define our model architecture
class SentimentModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SentimentModel, self).__init__()
        # More stable architecture with an additional layer
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Debug information before model creation
print(f"Encodings input_ids shape: {encodings['input_ids'].shape}")
print(f"Device: {device} (forced CPU mode)")

# Extract input dimension safely
input_dim = encodings['input_ids'].shape[1]  # Get the input dimension
hidden_dim = 64  # Smaller hidden dimension for stability

# Move data to CPU (if not already there)
train_texts = train_texts.to('cpu')
train_labels = train_labels.to('cpu')

# Create model on CPU
model = SentimentModel(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=2)
print("Model created on CPU successfully")

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert and map labels properly
# The Twitter dataset uses 0 and 4 as sentiment labels, map them to 0 and 1
label_mapping = {0: 0, 4: 1}  # Map Twitter sentiment dataset labels

# Safe label conversion
train_labels_cpu = train_labels.cpu().numpy()
train_labels_mapped = torch.tensor([label_mapping.get(int(label), 0) for label in train_labels_cpu], 
                                  dtype=torch.long)

# Convert input to float for model input
train_texts = train_texts.float()

# Verify label shapes and types
print(f"Train texts shape: {train_texts.shape}, dtype: {train_texts.dtype}")
print(f"Train labels shape: {train_labels_mapped.shape}, dtype: {train_labels_mapped.dtype}")
print(f"Unique label values: {torch.unique(train_labels_mapped)}")

# Reduce batch size for CPU training to avoid running out of memory
subset_size = min(10000, len(train_texts))  # Use at most 10K samples for faster CPU training
train_texts = train_texts[:subset_size]
train_labels_mapped = train_labels_mapped[:subset_size]
print(f"Training on {subset_size} samples (reduced for CPU efficiency)")

# Training loop with proper error handling
epochs = 2  # Reduce epochs for CPU training
batch_size = 64  # Smaller batch size for CPU

# Use mini-batches for CPU training
num_batches = (len(train_texts) + batch_size - 1) // batch_size

for epoch in range(epochs):
    print(f"Starting epoch {epoch+1}/{epochs}...")
    model.train()
    epoch_loss = 0.0
    
    # Mini-batch training
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(train_texts))
        
        batch_texts = train_texts[start_idx:end_idx]
        batch_labels = train_labels_mapped[start_idx:end_idx]
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(batch_texts)
        
        # Calculate loss
        loss = criterion(outputs, batch_labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        # Track progress
        epoch_loss += loss.item()
        if i % 10 == 0:
            print(f"  Batch {i+1}/{num_batches}, Batch Loss: {loss.item():.4f}")
    
    # End of epoch stats
    print(f"Epoch {epoch+1}, Avg Loss: {epoch_loss/num_batches:.4f}")

print("Training completed!")

Encodings input_ids shape: torch.Size([100000, 512])
Device: cpu (forced CPU mode)
Model created on CPU successfully
Train texts shape: torch.Size([80000, 512]), dtype: torch.float32
Train labels shape: torch.Size([80000]), dtype: torch.int64
Unique label values: tensor([0, 1])
Training on 10000 samples (reduced for CPU efficiency)
Starting epoch 1/2...
  Batch 1/157, Batch Loss: 171.7150
  Batch 11/157, Batch Loss: 94.5775
  Batch 21/157, Batch Loss: 49.5498
  Batch 31/157, Batch Loss: 40.6684
  Batch 41/157, Batch Loss: 33.8033
  Batch 51/157, Batch Loss: 28.1412
  Batch 61/157, Batch Loss: 33.5857
  Batch 71/157, Batch Loss: 43.8713
  Batch 81/157, Batch Loss: 20.5160
  Batch 91/157, Batch Loss: 22.7430
  Batch 101/157, Batch Loss: 26.9182
  Batch 111/157, Batch Loss: 25.9129
  Batch 121/157, Batch Loss: 25.1427
  Batch 131/157, Batch Loss: 39.3902
  Batch 141/157, Batch Loss: 17.3614
  Batch 151/157, Batch Loss: 27.7556
Epoch 1, Avg Loss: 40.7326
Starting epoch 2/2...
  Batch 1/157

## Step 6: Evaluate the Model

Evaluate the model's performance on the validation data.

In [35]:
# Evaluation loop
model.eval()
with torch.no_grad():
    outputs = model(val_texts)
    predictions = torch.argmax(outputs, dim=1)
    accuracy = (predictions == val_labels).float().mean()
    print(f'Validation Accuracy: {accuracy * 100:.2f}%')

Validation Accuracy: 25.94%


## Step 7: Save the Model

Save the trained model and vectorizer for later use.

In [2]:
# Save the model and vectorizer
from numpy import vectorize


joblib.dump(model, 'model.pkl')
joblib.dump(vectorize, 'vectorizer.pkl')

NameError: name 'joblib' is not defined

# Train a BERT Model for Sentiment Analysis

This notebook demonstrates how to fine-tune a pre-trained BERT model for sentiment analysis using Hugging Face Transformers.

## Step 1: Install Required Libraries

Install the necessary libraries, including transformers and torch.

In [None]:
# Install required libraries
%pip install transformers torch scikit-learn pandas

In [None]:
# Install PyTorch with CUDA support
%pip install torch torchvision --upgrade --index-url https://download.pytorch.org/whl/cu118

# Verify installation and GPU availability
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"CUDA capability: {torch.cuda.get_device_capability(0)}")
    current_memory = torch.cuda.memory_allocated(0) / 1e9  # GB
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9  # GB
    print(f"GPU memory: {current_memory:.2f}GB used / {total_memory:.2f}GB total")

## Step 2: Import Libraries

Import the necessary libraries for data handling, model training, and evaluation.

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
import torch

## Step 3: Load and Preprocess the Dataset

Load the dataset in chunks if it is too large to fit into memory. Sample the dataset for faster training.

In [None]:
# Load the dataset in chunks
chunk_size = 10000  # Number of rows per chunk
chunks = []
for chunk in pd.read_csv('backend/training.1600000.processed.noemoticon.csv', chunksize=chunk_size, encoding='latin-1'):
    chunks.append(chunk.sample(frac=0.1, random_state=42))  # Sample 10% of each chunk

# Combine sampled chunks into a single DataFrame
data = pd.concat(chunks, ignore_index=True)
print(f'Dataset size after sampling: {data.shape[0]} rows')

In [None]:
# Ensure dataset columns are renamed
if 'text' not in data.columns:
	data.columns = ['label', 'id', 'date', 'query', 'user', 'text']

# Define train_texts and val_texts
train_texts, val_texts, train_labels, val_labels = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

In [None]:
# Inspect dataset columns to verify column names
print(data.columns)

In [None]:

# Print the first few rows of the dataset to inspect its structure
print(data.head())

In [None]:
# Rename dataset columns to match expected names
data.columns = ['label', 'id', 'date', 'query', 'user', 'text']

In [None]:
# Verify dataset structure
print(data.head())
print(data.columns)

## Step 4: Tokenize the Dataset

Use the BERT tokenizer to preprocess the text data.

In [None]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512)

## Step 5: Create a Dataset Class

Define a custom dataset class to handle the tokenized data.

In [None]:
# Convert labels to tensors
train_labels = torch.tensor(train_labels.values)
val_labels = torch.tensor(val_labels.values)

# Define Dataset class
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

## Step 6: Load the Pre-trained BERT Model

Load a pre-trained BERT model for sequence classification.

In [None]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

## Step 7: Define Training Arguments

Set up the training arguments for fine-tuning the model.

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Removed evaluation_strategy as it is not supported in the current version
)

In [None]:
import transformers
import accelerate
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")

## Step 8: Train the Model

Use the Trainer API to fine-tune the BERT model.

In [None]:
# Make sure device is defined
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Map labels to the range [0, 1] (Twitter sentiment dataset uses 0 for negative, 4 for positive)
label_mapping = {0: 0, 4: 1}

# Ensure all labels are valid before mapping
train_labels_mapped = torch.tensor([label_mapping.get(label.item(), 0) for label in train_labels], dtype=torch.long)
val_labels_mapped = torch.tensor([label_mapping.get(label.item(), 0) for label in val_labels], dtype=torch.long)

# Verify label mapping
print(f'Unique train labels after mapping: {set(train_labels_mapped.tolist())}')
print(f'Unique validation labels after mapping: {set(val_labels_mapped.tolist())}')

# Update train_dataset and val_dataset with the mapped labels
train_dataset = SentimentDataset(train_encodings, train_labels_mapped)
val_dataset = SentimentDataset(val_encodings, val_labels_mapped)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
try:
    trainer.train()
except Exception as e:
    print(f"An error occurred during training: {e}")
    print("Debugging information:")
    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(val_dataset)}")
    print(f"Train encodings keys: {train_encodings.keys()}")
    print(f"Validation encodings keys: {val_encodings.keys()}")
    
    # Safely check model properties
    if hasattr(model, 'config'):
        print(f"Model hidden size: {model.config.hidden_size}")
        print(f"Model num labels: {model.config.num_labels}")
    else:
        print("Model doesn't have a config attribute")
        
    # Check a sample input-output
    print("Testing a sample input through the model:")
    sample_input = {k: torch.tensor(v[:1]) for k, v in train_encodings.items()}
    # Move tensors to device
    sample_input = {k: v.to(device) for k, v in sample_input.items()}
    try:
        sample_output = model(**sample_input)
        print(f"Sample output shape: {sample_output.logits.shape if hasattr(sample_output, 'logits') else 'No logits'}")
    except Exception as sample_error:
        print(f"Error with sample: {sample_error}")

## Step 9: Save the Model

Save the fine-tuned model and tokenizer for later use.

In [None]:
# Check which model is being used
if hasattr(model, 'save_pretrained'):
	# Save Hugging Face model and tokenizer
	model.save_pretrained('./bert_model')
	tokenizer.save_pretrained('./bert_model')
	print("BERT model and tokenizer saved successfully!")
else:
	# Save PyTorch model using torch.save
	torch.save(model.state_dict(), './pytorch_model.pt')
	print("PyTorch model saved successfully!")

## Debugging: Inspect Shapes and Values

Add debugging statements to inspect the shapes and values of logits and labels during training.

In [None]:
# Debugging: Inspect logits and labels
def compute_loss_with_debugging(model, inputs):
    outputs = model(**inputs)
    logits = outputs.logits
    labels = inputs['labels']
    print(f'Logits shape: {logits.shape}')
    print(f'Labels shape: {labels.shape}')
    print(f'Unique labels: {torch.unique(labels)}')
    loss_fct = torch.nn.CrossEntropyLoss()
    loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
    return loss

# Replace the Trainer's compute_loss method
trainer.compute_loss = compute_loss_with_debugging

## Preprocessing: Ensure Labels are Correctly Formatted

Map labels to integers and ensure they are in the range [0, num_labels-1].

In [None]:
# Map labels to integers - since train_labels and val_labels are already tensors
# Based on previous cells, the labels are already in the form of 0 and 4
# We need to map 0 -> 0 (negative) and 4 -> 1 (positive)
label_mapping = {0: 0, 4: 1}  # Map Twitter sentiment dataset labels

# Create mapped versions rather than overwriting
train_labels_mapped = torch.tensor([label_mapping.get(label.item(), 0) for label in train_labels], dtype=torch.long)
val_labels_mapped = torch.tensor([label_mapping.get(label.item(), 0) for label in val_labels], dtype=torch.long)

# Verify label mapping
print(f'Unique train labels before mapping: {torch.unique(train_labels).tolist()}')
print(f'Unique train labels after mapping: {torch.unique(train_labels_mapped).tolist()}')
print(f'Unique validation labels before mapping: {torch.unique(val_labels).tolist()}')
print(f'Unique validation labels after mapping: {torch.unique(val_labels_mapped).tolist()}')

## Inspect Dataset and Model Configuration

Verify the dataset structure and model configuration.

In [None]:
# Inspect dataset structure
print(data.head())
print(data.columns)

# Verify model configuration
if hasattr(model, 'config'):
	# For Hugging Face models (BertForSequenceClassification)
	print(f'Model num_labels: {model.config.num_labels}')
	if hasattr(model.config, 'problem_type'):
		print(f'Model problem type: {model.config.problem_type}')
else:
	# For custom PyTorch model (SentimentModel)
	print(f'Model type: {type(model).__name__}')
	if isinstance(model, SentimentModel):
		print(f'Input dimension: {model.fc.in_features}')
		print(f'Output dimension: {model.fc.out_features}')

In [None]:
# Check for GPU availability
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

In [None]:
# Move data and model to GPU
train_texts, val_texts = train_texts.to(device), val_texts.to(device)
train_labels, val_labels = train_labels.to(device), val_labels.to(device)
model = model.to(device)

In [None]:
# Simple GPU benchmark to verify GPU performance
import torch
import time

# Check CUDA availability
print(f"CUDA is available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    
    # Run a simple benchmark
    print("\nRunning GPU benchmark...")
    
    # Create large tensors
    start_time = time.time()
    size = 5000
    x = torch.randn(size, size, device='cuda')
    y = torch.randn(size, size, device='cuda')
    
    # Matrix multiplication (computationally intensive)
    torch.cuda.synchronize()  # Wait for all kernels to finish
    matmul_start = time.time()
    z = torch.matmul(x, y)
    torch.cuda.synchronize()  # Wait for matmul to finish
    matmul_time = time.time() - matmul_start
    
    # Free memory
    del x, y, z
    torch.cuda.empty_cache()
    
    print(f"Created two {size}x{size} random tensors and multiplied them")
    print(f"Matrix multiplication time: {matmul_time:.4f} seconds")
    print(f"Total time including tensor creation: {time.time() - start_time:.4f} seconds")
    
    # Show memory usage
    allocated = torch.cuda.memory_allocated() / 1e9  # Convert to GB
    reserved = torch.cuda.memory_reserved() / 1e9
    total = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved, {total:.2f}GB total")
else:
    print("No GPU available, running on CPU only.")