In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torchtext
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import spacy

def load_data(file_path):
    """Load the label and text columns of a CSV file into a DataFrame.

    Spreadsheet columns B and D (zero-based positions 1 and 3) are read
    and named ``label`` and ``text``. Because ``names`` is passed without
    ``header``, pandas treats the first row of the file as data. Rows with
    a missing label are dropped.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A DataFrame with the columns ``label`` and ``text``.
    """
    # The columns must be selected by position: when ``names`` is supplied,
    # string ``usecols`` are matched against ``names``, so the letters
    # 'B'/'D' raise "Usecols do not match columns".
    data = pd.read_csv(file_path, usecols=[1, 3], names=['label', 'text'])
    data = data.dropna(subset=['label'])
    return data


def split_data(data, train_file, test_file):
    """Write an 80/20 train/test partition of ``data`` to two CSV files.

    The partition uses ``random_state=42``. Both files are written without
    the index column; the training file is written first.

    Args:
        data: Dataset to partition.
        train_file: Destination of the training portion.
        test_file: Destination of the held-out portion.
    """
    partitions = train_test_split(data, test_size=0.2, random_state=42)
    for frame, destination in zip(partitions, (train_file, test_file)):
        frame.to_csv(destination, index=False)

# Read the raw dataset and persist its train/test partition as CSV files;
# the files are read back from disk by TabularDataset further down.
data = load_data('your_data.csv')
split_data(data, 'train.csv', 'test.csv')

# spaCy pipeline whose tokenizer is used by `tokenizer` below.
spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text):
    """Split ``text`` with the spaCy tokenizer and return the token strings."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Text field: spaCy tokenization, lower-casing and per-example lengths.
# ``batch_first=True`` yields (batch, seq_len) tensors, matching the
# ``batch_first=True`` LSTM and ``pack_padded_sequence`` calls in Classifier.
TEXT = Field(tokenize=tokenizer, lower=True, include_lengths=True, batch_first=True)
# Labels bypass the vocabulary lookup.
# NOTE(review): assumes the label column holds integer class ids - confirm.
LABEL = Field(sequential=False, use_vocab=False)

# TabularDataset assigns fields to CSV columns by position, so the order has
# to match the files written by split_data: label first, then text.
fields = [('label', LABEL), ('text', TEXT)]

# ``skip_header=True`` keeps the "label,text" header row (written by
# ``to_csv``) out of the examples.
train_data, test_data = TabularDataset.splits(
    path='', train='train.csv', test='test.csv', format='csv', fields=fields,
    skip_header=True)

TEXT.build_vocab(train_data, max_size=10000, min_freq=2)
# Built only so that len(LABEL.vocab) is available when sizing the model.
# NOTE(review): this vocabulary presumably also counts special tokens, so it
# may be larger than the number of classes - confirm.
LABEL.build_vocab(train_data)

# Bucket examples of similar length together; sorting within each batch is
# what allows the sequences to be packed inside the model.
train_loader, test_loader = BucketIterator.splits(
    (train_data, test_data), batch_size=64, sort_key=lambda x: len(x.text), sort_within_batch=True)

class Classifier(nn.Module):
    """LSTM text classifier: embedding -> stacked LSTM -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of logits produced per example.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied between LSTM layers, to the
            embeddings and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return the logits for a padded batch of token ids.

        Args:
            text: Token ids, batch-first: (batch, seq_len).
            text_lengths: Unpadded length of every sequence, shape (batch,).

        Returns:
            Tensor of shape (batch, output_dim).
        """
        embedded = self.dropout(self.embedding(text))
        # Packing makes the LSTM stop at each sequence's true length, so the
        # final hidden state is not polluted by padding. Lengths must live on
        # the CPU; ``enforce_sorted=False`` accepts batches in any order.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.to('cpu'), batch_first=True, enforce_sorted=False)
        # Only the final hidden state is needed; the per-step outputs are
        # discarded instead of being unpacked.
        _, (hidden, _) = self.rnn(packed_embedded)
        # hidden[-1] is the last layer's final state for every sequence.
        hidden = self.dropout(hidden[-1, :, :])
        return self.fc(hidden)

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: Module called as ``model(text, text_lengths)``.
        iterator: Sized iterable of batches exposing ``text`` (a pair of
            tokens and lengths) and ``label``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(predictions, labels)``.

    Returns:
        The sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    batch_losses = []
    for batch in iterator:
        optimizer.zero_grad()
        tokens, lengths = batch.text
        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(iterator)

def evaluate(model, iterator, criterion):
    """Compute the average batch loss over ``iterator`` without training.

    The model is switched to evaluation mode and gradients are disabled.

    Args:
        model: Module called as ``model(text, text_lengths)``.
        iterator: Sized iterable of batches exposing ``text`` (a pair of
            tokens and lengths) and ``label``.
        criterion: Loss called as ``criterion(predictions, labels)``.

    Returns:
        The sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)
            batch_losses.append(criterion(logits, batch.label).item())
    return sum(batch_losses) / len(iterator)

# Classifier arguments, in order: vocabulary size, embedding_dim=100,
# hidden_dim=256, output size, n_layers=2, dropout=0.5.
# NOTE(review): the output size is len(LABEL.vocab); confirm it equals the
# number of classes expected by CrossEntropyLoss.
model = Classifier(len(TEXT.vocab), 100, 256, len(LABEL.vocab), 2, 0.5)

# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

num_epochs = 10
# Lowest evaluation loss seen so far; starts at +inf so the first epoch
# always produces a checkpoint.
best_valid_loss = float('inf')

for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    # NOTE(review): the "validation" loss is computed on test_loader, so the
    # checkpoint below is selected on the test split - confirm this is intended.
    valid_loss = evaluate(model, test_loader, criterion)

    # Checkpoint the weights whenever the evaluation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tValidation Loss: {valid_loss:.3f}')



In [None]:
# NOTE(review): the next line is a notebook shell command, not Python syntax;
# it presumably only works when run inside an IPython/Jupyter cell - confirm.
# It installs torch and torchvision using the cu111 wheel index.
pip install torch torchvision -f https://download.pytorch.org/whl/cu111/torch_stable.html
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Re-create the model, optimizer and loss, with model and loss moved to `device`.
model = Classifier(len(TEXT.vocab), 100, 256, len(LABEL.vocab), 2, 0.5).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().to(device)
# NOTE(review): `batch` is not defined at this level in the visible code; the
# lines below look like the per-batch body meant for the loops in
# train()/evaluate(), moving inputs and labels to `device` - confirm.
text, text_lengths = batch.text
text, text_lengths = text.to(device), text_lengths.to(device)
predictions = model(text, text_lengths).squeeze(1)
loss = criterion(predictions, batch.label.to(device))
