# Sentiment Analysis Training (CPU-only)

This notebook provides a clean implementation of sentiment analysis training using CPU only, avoiding any CUDA-related issues.

In [None]:
# Import necessary libraries
%pip install pandas numpy torch scikit-learn joblib
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, DataLoader
import time
import joblib

## Force CPU Mode

First, we'll force PyTorch to use CPU only to avoid any CUDA errors.

In [None]:
# Keep every computation on the CPU: hide all CUDA devices from this process
# and make PyTorch's own availability probe answer "no".
import os
os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})
torch.cuda.is_available = lambda: False  # later availability checks in this kernel report no CUDA

# Explicit CPU device handle, printed as a sanity check
device = torch.device("cpu")
print(f"Using device: {device}")

## Load & Preprocess Data

We'll load a sample of the Twitter sentiment dataset (the Sentiment140 CSV, read from `backend/`) and prepare it for training.

In [None]:
# Load a random ~100K-row sample of the dataset
print("Loading dataset...")
try:
    # The Sentiment140 training CSV is sorted by label (all negative rows first,
    # then all positive rows), so keeping only the first 100K rows would give the
    # model a single class to learn from. Stream the whole file in chunks and keep
    # a reproducible random fraction of every chunk instead.
    chunk_size = 10000
    sample_fraction = 1 / 16  # 1.6M rows (per the file name) / 16 = ~100K samples
    chunks = []
    for i, chunk in enumerate(pd.read_csv('backend/training.1600000.processed.noemoticon.csv',
                                         encoding='latin-1',
                                         header=None,
                                         chunksize=chunk_size)):
        # A different seed per chunk keeps the sample reproducible without
        # picking the same row offsets inside every chunk.
        chunks.append(chunk.sample(frac=sample_fraction, random_state=42 + i))
    data = pd.concat(chunks, ignore_index=True)

    # Assign column names
    data.columns = ['label', 'id', 'date', 'query', 'user', 'text']
    print(f"Loaded {len(data)} samples")
except Exception as e:
    # Report the problem, then stop here: every later cell needs `data`,
    # so continuing would only fail further down with a less helpful error.
    print(f"Error loading dataset: {e}")
    raise

In [None]:
# Peek at the first rows, then look at how the labels are distributed
preview = data.head()
print("Dataset overview:", preview, sep="\n")

label_counts = data['label'].value_counts()
print("\nClass distribution:", label_counts, sep="\n")

## Feature Extraction with CountVectorizer

We'll use sklearn's CountVectorizer for simpler and faster feature extraction instead of transformer-based models.

In [None]:
# Create train/test split
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42
)

# Map the Twitter sentiment labels (0=negative, 4=positive) to binary (0, 1)
y_train = y_train.map({0: 0, 4: 1})
y_test = y_test.map({0: 0, 4: 1})

# Series.map() turns any label outside {0, 4} into NaN, which the cast to
# torch.long below would silently convert into a meaningless integer.
# Fail loudly here instead.
if y_train.isna().any() or y_test.isna().any():
    raise ValueError("Found sentiment labels other than 0 and 4; cannot map them to binary classes")

# Convert features using CountVectorizer
print("Extracting features using CountVectorizer...")
start_time = time.time()

# Use a limited vocabulary size for efficiency
max_features = 10000
# Ask for float32 counts directly: the default int64 output would make the dense
# matrices below twice as large and require a second, converted copy for PyTorch.
vectorizer = CountVectorizer(max_features=max_features, dtype=np.float32)

# Fit the vocabulary on the training texts only, then reuse it for the test texts
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Convert to PyTorch tensors (on CPU). from_numpy() shares memory with the dense
# float32 array, so no additional copy of the feature matrix is made.
X_train_tensor = torch.from_numpy(X_train_counts.toarray())
X_test_tensor = torch.from_numpy(X_test_counts.toarray())
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

end_time = time.time()
print(f"Feature extraction completed in {end_time - start_time:.2f} seconds")
print(f"Training features shape: {X_train_tensor.shape}")
print(f"Testing features shape: {X_test_tensor.shape}")

## Create PyTorch Datasets and DataLoaders

We'll use PyTorch's Dataset and DataLoader for efficient batching.

In [None]:
class SentimentDataset(Dataset):
    """Pairs a collection of feature vectors with their labels for use with a DataLoader.

    Args:
        features: Indexable collection of per-sample feature vectors.
        labels: Indexable collection of per-sample labels, same length as `features`.

    Raises:
        ValueError: If `features` and `labels` have different lengths.
    """

    def __init__(self, features, labels):
        # A length mismatch would otherwise surface late (IndexError mid-epoch)
        # or not at all (extra feature rows silently ignored).
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the `(features, label)` pair stored at position `idx`."""
        return self.features[idx], self.labels[idx]

# Wrap the train and test tensors in datasets
train_dataset = SentimentDataset(X_train_tensor, y_train_tensor)
test_dataset = SentimentDataset(X_test_tensor, y_test_tensor)

# Batch both splits; only the training split is shuffled
batch_size = 64  # Small batch size for CPU
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Define the Model

We'll create a simple neural network for sentiment classification.

In [None]:
class SentimentClassifier(nn.Module):
    """Feed-forward classifier: two hidden ReLU layers with dropout, then a linear output layer.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the first hidden layer; the second hidden layer is half as wide.
        output_dim: Number of output scores produced per sample.
        dropout_rate: Dropout probability applied after each hidden activation.
    """

    def __init__(self, input_dim, hidden_dim=128, output_dim=2, dropout_rate=0.2):
        super().__init__()
        narrow_dim = hidden_dim // 2
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, narrow_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(narrow_dim, output_dim),
        ]
        # Stored under the attribute name `model`, in this order, so parameter
        # names in the state dict stay "model.<index>.weight" / "model.<index>.bias".
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (unnormalised) output scores for the batch `x`."""
        scores = self.model(x)
        return scores

# Seed PyTorch's global RNG before anything random happens to the model, so that
# weight initialisation is reproducible on a fresh "Restart & Run All".
# Shuffling and dropout draw from the same generator afterwards.
# The value matches the random_state used for the train/test split.
torch.manual_seed(42)

# Create the model
input_dim = X_train_tensor.shape[1]  # Number of features
model = SentimentClassifier(input_dim=input_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # takes raw scores and integer class labels
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Training Function

Define a function to handle the training process.

In [None]:
def train(model, train_loader, criterion, optimizer, epochs=3):
    """Train `model` for `epochs` passes over `train_loader`, evaluating after each pass.

    Args:
        model: Module to optimise; called as `model(features)`.
        train_loader: Sized iterable yielding `(features, labels)` batches.
        criterion: Loss callable invoked as `criterion(outputs, labels)`.
        optimizer: Optimizer that updates the parameters of `model`.
        epochs: Number of passes over `train_loader`.

    Note:
        The per-epoch evaluation calls the notebook-level `evaluate` function on
        the notebook-level `test_loader`; both must exist before `train` is called.
    """
    for epoch in range(epochs):
        # evaluate() at the end of each epoch leaves the model in eval mode, so
        # training mode has to be restored at the start of every epoch. Otherwise
        # dropout is disabled from the second epoch onwards.
        model.train()
        epoch_loss = 0
        epoch_start = time.time()

        for batch_idx, (features, labels) in enumerate(train_loader):
            # Clear gradients left over from the previous step
            optimizer.zero_grad()

            # Forward pass
            outputs = model(features)

            # Calculate loss
            loss = criterion(outputs, labels)
            epoch_loss += loss.item()

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Print progress every 50 batches
            if batch_idx % 50 == 0:
                print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx}/{len(train_loader)}, Loss: {loss.item():.4f}")

        # End of epoch stats (loss averaged over batches)
        epoch_end = time.time()
        print(f"Epoch {epoch+1}/{epochs} completed in {epoch_end - epoch_start:.2f}s, Avg loss: {epoch_loss/len(train_loader):.4f}")

        # Evaluate after each epoch
        evaluate(model, test_loader, criterion)

## Evaluation Function

Define a function to evaluate the model's performance.

In [None]:
def evaluate(model, test_loader, criterion):
    """Score `model` on `test_loader`, print a summary line and return the accuracy.

    Args:
        model: Module to score; it is switched to eval mode and left there.
        test_loader: Sized iterable yielding `(features, labels)` batches.
        criterion: Loss callable invoked as `criterion(outputs, labels)`.

    Returns:
        Fraction of samples whose highest-scoring output index equals the label.
    """
    model.eval()
    test_loss, correct, total = 0, 0, 0

    # No gradients are needed for scoring
    with torch.no_grad():
        for batch_features, batch_labels in test_loader:
            scores = model(batch_features)
            test_loss += criterion(scores, batch_labels).item()

            # Index of the highest score in each row is the predicted class
            predicted = torch.max(scores, dim=1).indices
            hits = (predicted == batch_labels).sum().item()
            correct += hits
            total += batch_labels.size(0)

    # Loss is averaged over batches, accuracy over individual samples
    avg_loss = test_loss / len(test_loader)
    accuracy = correct / total
    print(f"Test set: Average loss: {avg_loss:.4f}, Accuracy: {correct}/{total} ({accuracy*100:.2f}%)")
    return accuracy

## Train the Model

Now let's train the model using our defined functions.

In [None]:
# Train the model
print("Starting training...")
start_time = time.time()

try:
    train(model, train_loader, criterion, optimizer, epochs=3)
    print("Training completed successfully!")
except Exception as e:
    # Report the failure, then re-raise so "Run All" stops here instead of
    # going on to save a partially trained model.
    print(f"Error during training: {e}")
    raise
finally:
    # Always report elapsed time, whether training finished or failed
    end_time = time.time()
    print(f"Total training time: {end_time - start_time:.2f} seconds")

## Save the Model

Save the trained model for later use in your application.

In [None]:
# Save the model and vectorizer

# Defined outside the try block so the class exists even if one of the saves
# below fails (loading an existing model.pkl needs this class to be defined).
class SentimentAnalysisModel:
    """Bundles the trained classifier with its fitted vectorizer for inference on raw text."""

    def __init__(self, model, vectorizer):
        self.model = model
        self.vectorizer = vectorizer

    def predict(self, texts):
        """Return the predicted class index for each input text.

        Args:
            texts: A single string or a list of strings.

        Returns:
            List of ints, one per text (this notebook trains with
            0 = negative and 1 = positive).
        """
        # Convert to list if single string
        if isinstance(texts, str):
            texts = [texts]

        # Transform texts to features
        features = self.vectorizer.transform(texts)
        features_tensor = torch.tensor(features.toarray(), dtype=torch.float32)

        # Get predictions with dropout disabled and no gradient tracking
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(features_tensor)
            _, predictions = torch.max(outputs, dim=1)

        # Convert predictions to list
        return predictions.numpy().tolist()


try:
    # Save PyTorch model weights only
    torch.save(model.state_dict(), 'sentiment_model.pt')

    # Save the vectorizer
    joblib.dump(vectorizer, 'vectorizer.pkl')

    # Create and save the complete model
    # NOTE(review): pickling stores classes by reference, so whatever loads
    # model.pkl must be able to resolve SentimentAnalysisModel and the class of
    # `model` under the module they were defined in here. Confirm the consumer does.
    complete_model = SentimentAnalysisModel(model, vectorizer)
    joblib.dump(complete_model, 'model.pkl')

    print("Model saved successfully!")
except Exception as e:
    # Report and re-raise: a failed save must not look like a successful one
    print(f"Error saving model: {e}")
    raise

## Test Model Inference

Let's test the saved model with some example texts.

In [None]:
# Try the saved pipeline on a handful of hand-written sentences
example_texts = [
    "I absolutely love this movie, it's amazing!",
    "This product is terrible, I want my money back",
    "The service was okay, nothing special",
    "I'm having a great day today!",
    "This is the worst experience I've ever had"
]

try:
    # Reload from disk so the check exercises exactly what was saved
    loaded_model = joblib.load('model.pkl')

    # Class index per example text
    predictions = loaded_model.predict(example_texts)

    # Index 0 is shown as "Negative", anything else as "Positive"
    sentiment_labels = ["Positive" if pred != 0 else "Negative" for pred in predictions]

    # Show each text next to its predicted sentiment
    print("Example sentiment predictions:")
    for text, sentiment in zip(example_texts, sentiment_labels):
        print(f"Text: '{text}'")
        print(f"Sentiment: {sentiment}\n")
except Exception as e:
    print(f"Error testing model: {e}")

## Integration with Flask Backend

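The model saved as `model.pkl` can be loaded by the Flask backend (app.py) to provide sentiment analysis services. The backend is already configured to use this model. Because `model.pkl` is a pickle, the backend must be able to import (or define) the `SentimentAnalysisModel` and `SentimentClassifier` classes under the same module path used here before calling `joblib.load`.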

In [None]:
# Closing status messages, one per line
print(
    "Your sentiment analysis model is ready to be used by the Flask backend.",
    "The backend can now provide both sentiment analysis and text summarization services.",
    sep="\n",
)