In [2]:
import re
import string
import pandas as pd
import numpy as np
import torch 
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from sklearn.model_selection import train_test_split

# Load the data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# train_df['keyword'].unique()
# we're going to drop the location and id because I dont really think its necessary

train_df = train_df.drop(columns=["id", "location"])

# Drop any rows with missing values in the 'text' or 'target' columns
train_df = train_df.dropna(subset=['text', 'target'])

In [4]:
# functions for cleaning the text
# clean the urls

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

def remove_punct(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# class for converting pandas dataframes to pytorch tensors
class TwitterDataset(Dataset):
    def __init__(self, texts, targets=None, vocab=None, max_len=100):
        # Clean text data
        self.texts = [remove_punct(remove_URL(text)) for text in texts]
        self.targets = targets if targets is not None else None
        self.max_len = max_len

        # Build or assign vocabulary
        if vocab is None:
            self.vocab = self.build_vocab(self.texts)
        else:
            self.vocab = vocab

    def build_vocab(self, texts):
        counter = Counter()
        for text in texts:
            counter.update(text.split())
        return {word: idx + 1 for idx, (word, _) in enumerate(counter.most_common())}  # Reserve 0 for padding

    def text_to_sequence(self, text):
        return [self.vocab.get(word, 0) for word in text.split()]

    def pad_sequence(self, sequence):
        if len(sequence) > self.max_len:
            return sequence[:self.max_len]
        else:
            return sequence + [0] * (self.max_len - len(sequence))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        sequence = self.text_to_sequence(text)
        sequence = self.pad_sequence(sequence)
        sequence = torch.tensor(sequence, dtype=torch.long)
        
        if self.targets is not None:
            label = torch.tensor(self.targets[idx], dtype=torch.long)
            return sequence, label
        else:
            return sequence


In [5]:
X = train_df["text"].values
y = train_df["target"].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# create the dataset and data instances
train_dataset = TwitterDataset(X_train, y_train)
val_dataset = TwitterDataset(X_val, y_val, vocab=train_dataset.vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=32)
val_loader = DataLoader(val_dataset, batch_size=32)

sample_batch = next(iter(train_loader))
print("Sample text tensor:", sample_batch[0][:2])  # Text sequences
print("Sample labels:", sample_batch[1][:2])       # Corresponding labels

Sample text tensor: tensor([[1567,    1,    6, 1731, 1568,  796,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [ 715,  716,  549, 1837, 5246, 1495, 3026, 5247, 9399,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0

In [8]:
import torch.nn as nn
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_classes):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return out

vocab_size = len(train_dataset.vocab) + 1  # +1 for padding index 0
embed_size = 128
hidden_size = 128
num_classes = 2  # Binary classification (0 or 1)

model = TextClassifier(vocab_size, embed_size, hidden_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


Using device: cuda


In [9]:
# Training parameters
num_epochs = 5

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for texts, labels in train_loader:
        # Move data to the GPU
        texts = texts.to(device)
        labels = labels.to(device)
        
        # Forward pass
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    # Print training loss after each epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")
    
    # Validation loop
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for texts, labels in val_loader:
            # Move validation data to the GPU
            texts = texts.to(device)
            labels = labels.to(device)
            
            outputs = model(texts)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    accuracy = 100 * correct / total
    print(f"Validation Accuracy: {accuracy:.2f}%")


Epoch [1/5], Loss: 0.6870
Validation Accuracy: 58.17%
Epoch [2/5], Loss: 0.6849
Validation Accuracy: 58.17%
Epoch [3/5], Loss: 0.6846
Validation Accuracy: 58.17%
Epoch [4/5], Loss: 0.6840
Validation Accuracy: 58.17%
Epoch [5/5], Loss: 0.6840
Validation Accuracy: 58.17%
