In [1]:
# Imports
import os
import sqlite3
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.optim import AdamW
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive so that later cells can reach files
# under /content/drive/MyDrive (the SQLite database path used below).
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell would fail elsewhere — confirm the notebook is Colab-only.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Create and Populate SQLite Database
def create_and_populate_database(csv_file_path, db_name, chunk_size=10000):
    """Rebuild the ``tweets`` table of a SQLite file from a CSV of tweets.

    Any existing ``tweets`` table is dropped first, so calling this twice
    leaves a single copy of the data rather than duplicating rows.

    Args:
        csv_file_path: Path to the comma-separated source file. It is decoded
            as ISO-8859-1, its first line is treated as a header and replaced
            by the fixed column names, and malformed lines are skipped.
        db_name: Path of the SQLite database file to create or reuse.
        chunk_size: Number of CSV rows read and inserted per batch; bounds the
            amount of data held in memory at once.

    Raises:
        Any error raised by ``pandas.read_csv``, ``DataFrame.to_sql`` or
        ``sqlite3``. The connection is closed before the error propagates.
    """
    column_names = ['Sentiment', 'Id', 'Date', 'Flag', 'User', 'Tweet']

    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()

        # Drop and create table
        cursor.execute("DROP TABLE IF EXISTS tweets")
        cursor.execute('''
            CREATE TABLE tweets (
                Sentiment INTEGER,
                Id BIGINT,
                Date TEXT,
                Flag TEXT,
                User TEXT,
                Tweet TEXT
            )
        ''')
        conn.commit()
        print("tweets table has been reset.")

        # Stream the CSV in chunks so the whole file never sits in memory.
        # NOTE(review): header=0 discards the first line of the file. If the
        # CSV has no header row, that line is real data — confirm the format.
        for chunk in pd.read_csv(
            csv_file_path,
            delimiter=',',
            chunksize=chunk_size,
            encoding='ISO-8859-1',
            names=column_names,
            header=0,
            on_bad_lines='skip'
        ):
            # Normalize column names; SQLite matches column names
            # case-insensitively, so these still map onto the table above.
            chunk.columns = [col.lower().strip() for col in chunk.columns]

            # Insert data into database
            chunk.to_sql('tweets', conn, if_exists='append', index=False)

        print("Data successfully loaded into tweets.db!")
    finally:
        # Always release the handle, even when reading or inserting fails.
        conn.close()


In [None]:
# Data Retrieval
def get_data_from_database(db_name='tweets.db', sample_size=100000):
    """Load tweets and their sentiment from SQLite and draw a fixed random sample.

    Args:
        db_name: Path of the SQLite database containing the ``tweets`` table.
        sample_size: Maximum number of rows to return. When the table holds
            fewer rows, all of them are returned (shuffled) instead of
            raising. ``None`` returns the full table unsampled.

    Returns:
        DataFrame with the columns ``Tweet`` and ``Sentiment``. Sampling uses
        ``random_state=42``, so repeated calls return the same rows.
    """
    # Alias the columns explicitly: SQLite leaves the name of an un-aliased
    # result column unspecified, and downstream code indexes by these names.
    query = "SELECT Tweet AS Tweet, Sentiment AS Sentiment FROM tweets"

    conn = sqlite3.connect(db_name)
    try:
        data = pd.read_sql_query(query, conn)
    finally:
        # Release the handle even if the query fails (e.g. missing table).
        conn.close()

    if sample_size is None or data.empty:
        return data

    # Clamp so a small table does not make DataFrame.sample raise ValueError.
    n_rows = min(sample_size, len(data))
    return data.sample(n=n_rows, random_state=42)

In [None]:
# Check data in the database
db_path = '/content/drive/MyDrive/tweets.db'
conn = sqlite3.connect(db_path)
try:
    # Peek at the first ten rows to confirm the table was populated.
    df = pd.read_sql_query("SELECT * FROM tweets LIMIT 10", conn)
finally:
    # Close the handle so the database file is not left open by this cell.
    conn.close()
print(df)

In [None]:
# Data Preprocessing
def data_process(data, tokenizer, max_len=128):
    """Tokenize tweets and binarize their sentiment labels.

    Args:
        data: DataFrame with a ``Tweet`` column (text) and a ``Sentiment``
            column.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask`` entries.
        max_len: Length passed to the tokenizer as ``max_length``; texts are
            truncated and padded to it.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)`` of int64 tensors with
        one entry per input row, in row order. A label is 0 when ``Sentiment``
        equals 0 and 1 for any other value.
    """
    input_ids, attention_masks, labels = [], [], []

    # Iterate the two columns directly: DataFrame.iterrows() builds a Series
    # per row, which is needlessly slow for the sample sizes used here.
    for tweet, sentiment in zip(data['Tweet'], data['Sentiment']):
        # A NULL tweet in the database arrives as None/NaN; encode it as an
        # empty string so one bad row cannot abort the whole preprocessing run.
        text = "" if pd.isna(tweet) else str(tweet)
        encoded = tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append(0 if sentiment == 0 else 1)

    return (
        torch.tensor(input_ids, dtype=torch.long),
        torch.tensor(attention_masks, dtype=torch.long),
        torch.tensor(labels, dtype=torch.long),
    )

In [None]:
# Custom Dataset Class
class SentimentDataset(Dataset):
    """Map-style dataset over pre-tokenized examples.

    Holds three parallel, index-aligned containers and serves one
    ``(input_ids, attention_mask, label)`` triple per index.
    """

    def __init__(self, input_ids, attention_masks, labels):
        # The containers are stored as given; nothing is copied or validated.
        self.input_ids, self.attention_masks, self.labels = input_ids, attention_masks, labels

    def __len__(self):
        # The label container defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        ids = self.input_ids[idx]
        mask = self.attention_masks[idx]
        label = self.labels[idx]
        return ids, mask, label

In [None]:
# BERT-LSTM Model Definition
class BERT_LSTM(nn.Module):
    """BERT encoder followed by an LSTM and a linear classification head.

    BERT runs under ``torch.no_grad()``, so it acts as a fixed feature
    extractor; only the LSTM and the linear layer receive gradients.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        hidden_size: Size of the LSTM hidden state.
        num_classes: Number of output classes.
    """

    def __init__(self, bert_model_name='bert-base-uncased', hidden_size=128, num_classes=2):
        super(BERT_LSTM, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_size, num_classes)
        # Used by predict_proba() only. It has no parameters, so keeping it
        # does not affect saved state dicts.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return raw class logits of shape ``(batch, num_classes)``.

        Logits, not probabilities, are returned because ``nn.CrossEntropyLoss``
        applies log-softmax itself; feeding it softmax output squashes the
        gradients. ``argmax`` over logits gives the same predictions as over
        probabilities. Use ``predict_proba`` when probabilities are needed.
        """
        with torch.no_grad():
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        lstm_output, _ = self.lstm(bert_output.last_hidden_state)

        if attention_mask is None:
            pooled_output = lstm_output[:, -1, :]
        else:
            # Pool at the last attended token rather than the last position:
            # with sequences padded to a fixed length, position -1 is padding
            # and the LSTM state there has been diluted by the pad steps.
            # Assumes padding is on the right — TODO confirm tokenizer setting.
            last_token = attention_mask.long().sum(dim=1).clamp(min=1) - 1
            batch_index = torch.arange(lstm_output.size(0), device=lstm_output.device)
            pooled_output = lstm_output[batch_index, last_token]

        dropout_output = self.dropout(pooled_output)
        return self.fc(dropout_output)

    def predict_proba(self, input_ids, attention_mask):
        """Return class probabilities (softmax over the logits from ``forward``)."""
        return self.softmax(self.forward(input_ids, attention_mask))

In [None]:
# Save Model Checkpoint
def save_checkpoint(model, optimizer, epoch, checkpoint_path="checkpoint.pth"):
    """Write the model state, optimizer state and epoch index to disk.

    Args:
        model: Object whose ``state_dict()`` is stored under ``model_state_dict``.
        optimizer: Object whose ``state_dict()`` is stored under
            ``optimizer_state_dict``.
        epoch: Zero-based index of the epoch that just finished.
        checkpoint_path: Destination file passed to ``torch.save``.
    """
    state = dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        epoch=epoch,
    )
    torch.save(state, checkpoint_path)
    # The message is one-based, matching the epoch numbers shown during training.
    print(f"Checkpoint saved at epoch {epoch + 1}.")


In [None]:
# Load Model Checkpoint
def load_checkpoint(model, optimizer, checkpoint_path="checkpoint.pth"):
    """Restore model and optimizer state from a checkpoint file, if one exists.

    Args:
        model: Module to restore in place from ``model_state_dict``.
        optimizer: Optimizer to restore in place from ``optimizer_state_dict``.
        checkpoint_path: File previously written with the keys
            ``model_state_dict``, ``optimizer_state_dict`` and ``epoch``.

    Returns:
        Tuple ``(model, optimizer, epoch)`` where ``epoch`` is the zero-based
        index of the last completed epoch, or ``-1`` when no checkpoint file
        exists so that training starts from epoch 0.
    """
    if not os.path.exists(checkpoint_path):
        return model, optimizer, -1

    # Load onto the CPU first: a checkpoint written on a GPU runtime would
    # otherwise fail to deserialize on a CPU-only one. load_state_dict then
    # copies the values onto whatever device the model already lives on.
    # NOTE: torch.load unpickles the file, so only open trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Checkpoint loaded. Resuming from epoch {epoch + 1}.")
    return model, optimizer, epoch

In [None]:
# Training Function
def train_model(model, dataloader, optimizer, criterion, device, epoch, total_epochs):
    """Run one training epoch and report the mean batch loss and accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_masks)``.
        dataloader: Iterable of ``(input_ids, attention_masks, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.
        epoch: Zero-based index of the current epoch (progress-bar label only).
        total_epochs: Total number of epochs (progress-bar label only).

    Returns:
        Tuple ``(mean_loss, accuracy)``: the average per-batch loss and the
        fraction of correctly classified samples. ``(0.0, 0.0)`` when the
        dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{total_epochs}", leave=False)
    for input_ids, attention_masks, labels in progress_bar:
        input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)
        optimizer.zero_grad()

        outputs = model(input_ids, attention_masks)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
        num_batches += 1

        # total_loss sums per-batch mean losses, so the running average must
        # divide by the number of batches, not by the number of samples.
        progress_bar.set_postfix({
            'Loss': f"{total_loss / num_batches:.4f}",
            'Accuracy': f"{correct / max(total, 1):.4f}"
        })

    if num_batches == 0 or total == 0:
        # Nothing was processed; avoid dividing by zero.
        return 0.0, 0.0

    return total_loss / num_batches, correct / total

In [None]:
# Evaluation Function
def evaluate_model(model, dataloader, criterion, device):
    """Score the model on a dataloader without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_masks)``.
        dataloader: Sized iterable of ``(input_ids, attention_masks, labels)``
            batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy, report)``: the summed batch losses
        divided by ``len(dataloader)``, the fraction of correct predictions,
        and the dictionary produced by ``classification_report``.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0, 0, 0
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_masks, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_masks)
            loss_sum += criterion(outputs, labels).item()

            preds = outputs.argmax(dim=1)
            predicted.extend(preds.cpu().numpy())
            expected.extend(labels.cpu().numpy())

            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

    report = classification_report(expected, predicted, output_dict=True)
    mean_loss = loss_sum / len(dataloader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy, report


In [None]:
    # Step 1: Create and populate database
    # Rebuilds the tweets table from the CSV; the existing table is dropped first.
    # NOTE(review): CSV_FILE_PATH is not defined anywhere in the visible notebook,
    # and DB_NAME is only assigned in the later "Main Function" cell, so this cell
    # raises NameError when the notebook is run top to bottom on a fresh kernel.
    # Define both before this call — TODO confirm the intended CSV location.
    create_and_populate_database(CSV_FILE_PATH, DB_NAME)

In [None]:
# Main Function
if __name__ == "__main__":
    # Configurations
    DB_NAME = '/content/drive/MyDrive/tweets.db'
    BERT_MODEL_NAME = 'bert-base-uncased'
    MAX_LEN = 128
    BATCH_SIZE = 32
    SAMPLE_SIZE = 100000
    EPOCHS = 3
    SEED = 42  # same value already used for sampling and splitting
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    CHECKPOINT_PATH = "checkpoint.pth"

    # Seed torch so weight initialization, dropout and batch shuffling are
    # repeatable across runs.
    torch.manual_seed(SEED)

    # Step 1: Retrieve and preprocess the data
    data = get_data_from_database(DB_NAME, sample_size=SAMPLE_SIZE)
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
    input_ids, attention_masks, labels = data_process(data, tokenizer, MAX_LEN)

    # Step 2: Split data into train and validation sets (80/20); no separate
    # test set is held out here.
    train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
        input_ids, attention_masks, labels, test_size=0.2, random_state=SEED
    )

    train_dataset = SentimentDataset(train_inputs, train_masks, train_labels)
    val_dataset = SentimentDataset(val_inputs, val_masks, val_labels)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Step 3: Define the model, optimizer, and loss function
    model = BERT_LSTM(BERT_MODEL_NAME).to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()

    # Step 4: Load checkpoint if exists
    # start_epoch is the last completed epoch, or -1 when starting fresh.
    model, optimizer, start_epoch = load_checkpoint(model, optimizer, CHECKPOINT_PATH)

    # Step 5: Train the model, resuming after the last completed epoch
    for epoch in range(start_epoch + 1, EPOCHS):
        train_loss, train_acc = train_model(model, train_loader, optimizer, criterion, DEVICE, epoch, EPOCHS)
        val_loss, val_acc, val_report = evaluate_model(model, val_loader, criterion, DEVICE)
        print(f"\nEpoch {epoch + 1}/{EPOCHS}")
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        # Save checkpoint at the end of each epoch
        save_checkpoint(model, optimizer, epoch, CHECKPOINT_PATH)

    # Save final model
    torch.save(model.state_dict(), "bert_lstm_sentiment_final.pth")
    print("Final model saved.")