# Step 1: Import libraries 

In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm
from wordcloud import WordCloud
import seaborn as sns

In [96]:
def setup_environment():
    """Initialize environment settings and check GPU"""
    torch.manual_seed(42)
    np.random.seed(42)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')
    return device

In [97]:
# ## 2. Data Loading
def load_data():
    """Load and combine datasets from Kaggle paths"""
    # Load individual datasets
    education = pd.read_csv("/kaggle/input/sentiment-analysis-evaluation-dataset/Education.csv")
    finance = pd.read_csv("/kaggle/input/sentiment-analysis-evaluation-dataset/Finance.csv")
    politics = pd.read_csv("/kaggle/input/sentiment-analysis-evaluation-dataset/Politics.csv")
    sports = pd.read_csv("/kaggle/input/sentiment-analysis-evaluation-dataset/Sports.csv")
    
    # Create source labels
    education['source'] = 'Education'
    finance['source'] = 'Finance'
    politics['source'] = 'Politics'
    sports['source'] = 'Sports'
    
    # Combine all dataframes
    df = pd.concat([education, finance, politics, sports], ignore_index=True)
    
    # Shuffle the dataset
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    return df

# Execute data loading
df = load_data()
print("Dataset loaded successfully!")
print("\nFirst few rows:")
display(df.head())
print("\nDataset Info:")
display(df.info())

# Display category distribution
print("\nSamples per category:")
display(df['source'].value_counts())
print("Available columns in the dataframe:")
print(df.columns.tolist())

Dataset loaded successfully!

First few rows:


Unnamed: 0,Text,Label,source
0,Education policies should be informed by resea...,positive,Education
1,Sports endorsements by celebrities and athlete...,positive,Sports
2,The pursuit of short-term profits can lead to ...,negative,Finance
3,Sports broadcasting has become saturated with ...,negative,Sports
4,Credit rating agencies are susceptible to conf...,negative,Finance



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    209 non-null    object
 1   Label   209 non-null    object
 2   source  209 non-null    object
dtypes: object(3)
memory usage: 5.0+ KB


None


Samples per category:


source
Sports       56
Politics     53
Education    52
Finance      48
Name: count, dtype: int64

Available columns in the dataframe:
['Text', 'Label', 'source']


In [98]:
# ## 3. Exploratory Data Analysis

def plot_sentiment_distributions(df):
    """Plot overall and category-wise sentiment distributions."""
    # Overall distribution
    plt.figure(figsize=(12, 6))
    sns.countplot(data=df, x='Label')
    plt.title('Distribution of Sentiments Across All Categories')
    plt.show()


def create_word_clouds(df):
    """Create word clouds for each sentiment category."""
    plt.figure(figsize=(15, 5))
    
    for idx, sentiment in enumerate(['positive', 'negative', 'neutral']):
        plt.subplot(1, 3, idx + 1)
        text = ' '.join(df[df['Label'] == sentiment]['Text'])  # Updated column names
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(f'{sentiment.capitalize()} Sentiment Word Cloud')
        plt.axis('off')
    
    plt.tight_layout()
    plt.show()

In [101]:
# ## 4. Data Preparation

# %%
class SentimentDataset(Dataset):
    """Custom Dataset class for BERT input"""
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Prepare data for modeling
sentiment_map = {'positive': 0, 'neutral': 1, 'negative': 2}
df['label'] = df['positive', 'negative', 'neutral'].map(sentiment_map)

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].values, df['label'].values, 
    test_size=0.2, random_state=42
)

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', 
    num_labels=3
).to(device)

# Create datasets
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

print("Data preparation completed!")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

KeyError: ('positive', 'negative', 'neutral')

In [None]:
# ## 5. Model Training

# %%
def train_model(model, train_loader, val_loader, device, epochs=3):
    """Train the model and return training history"""
    optimizer = AdamW(model.parameters(), lr=2e-5)
    
    train_losses = []
    val_losses = []
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        total_train_loss = 0
        
        for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_train_loss += loss.item()
            
            loss.backward()
            optimizer.step()
        
        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        
        # Validation phase
        model.eval()
        total_val_loss = 0
        
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                total_val_loss += outputs.loss.item()
        
        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        
        print(f'Epoch {epoch + 1}:')
        print(f'Average training loss: {avg_train_loss:.4f}')
        print(f'Average validation loss: {avg_val_loss:.4f}')
    
    return train_losses, val_losses

# Execute training
train_losses, val_losses = train_model(model, train_loader, val_loader, device)

# %%
def plot_training_history(train_losses, val_losses):
    """Plot training and validation loss curves"""
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Plot training history
plot_training_history(train_losses, val_losses)

In [None]:
# ## 6. Model Evaluation

# %%
def evaluate_model(model, val_loader, device):
    """Evaluate model performance"""
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    return all_preds, all_labels

# Execute evaluation
predictions, true_labels = evaluate_model(model, val_loader, device)

# %%
def print_classification_metrics(true_labels, predictions):
    """Print classification report"""
    print("Classification Report:")
    print(classification_report(true_labels, predictions, 
                              target_names=['Positive', 'Neutral', 'Negative']))

# Print metrics
print_classification_metrics(true_labels, predictions)

# %%
def plot_confusion_matrix(true_labels, predictions):
    """Plot confusion matrix"""
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(true_labels, predictions)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Positive', 'Neutral', 'Negative'],
                yticklabels=['Positive', 'Neutral', 'Negative'])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# Plot confusion matrix
plot_confusion_matrix(true_labels, predictions)

In [None]:
# ## 7. Save Model and Make Predictions

# %%
# Save the model
model_save_path = 'sentiment_model'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

# %%
def predict_sentiment(text, model, tokenizer, device):
    """Predict sentiment for a given text"""
    model.eval()
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        pred = torch.argmax(outputs.logits, dim=1)
    
    sentiment_map = {0: 'positive', 1: 'neutral', 2: 'negative'}
    return sentiment_map[pred.item()]

# Test the prediction function
example_texts = [
    "This product exceeded my expectations!",
    "The service was okay, nothing special.",
    "I'm very disappointed with this purchase."
]

print("Example predictions:")
for text in example_texts:
    prediction = predict_sentiment(text, model, tokenizer, device)
    print(f"\nText: {text}")
    print(f"Predicted sentiment: {prediction}")