In [13]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from typing import List

from torchtext.vocab import GloVe

# Read the test and train splits (in that order) from their parquet files
test_df, train_df = (
    pd.read_parquet(parquet_name)
    for parquet_name in (
        'test-00000-of-00001.parquet',
        'train-00000-of-00001.parquet',
    )
)


In [4]:

# Quick sanity check: print the last rows of the train split, then the test split
for split_df in (train_df, test_df):
    print(split_df.tail())

                                                     text  label
119995  Pakistan's Musharraf Says Won't Quit as Army C...      0
119996  Renteria signing a top-shelf deal Red Sox gene...      1
119997  Saban not going to Dolphins yet The Miami Dolp...      1
119998  Today's NFL games PITTSBURGH at NY GIANTS Time...      1
119999  Nets get Carter from Raptors INDIANAPOLIS -- A...      1
                                                   text  label
7595  Around the world Ukrainian presidential candid...      0
7596  Void is filled with Clement With the supply of...      1
7597  Martinez leaves bitter Like Roger Clemens did ...      1
7598  5 of arthritis patients in Singapore take Bext...      2
7599  EBay gets into rentals EBay plans to buy the a...      2


In [5]:
# Training set: the 'text' column is the input, the 'label' column the target
X_train, y_train = train_df['text'], train_df['label']

# Test set: same two columns
X_test, y_test = test_df['text'], test_df['label']


In [10]:
from transformers import BertTokenizer

# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
# Its token ids are what the datasets below carry under the 'input_ids' key.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(texts, tokenizer, max_length=128):
    """Tokenize a batch of texts into fixed-length PyTorch tensors.

    Args:
        texts: A single string or a collection of strings. Objects exposing
            ``.tolist()`` (pandas Series, NumPy arrays) are converted with it;
            any other iterable is materialized with ``list``.
        tokenizer: Callable that accepts a list of strings plus the keyword
            arguments ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` (for example a Hugging Face tokenizer).
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into characters by list().
        batch = [texts]
    elif hasattr(texts, 'tolist'):
        batch = texts.tolist()
    else:
        batch = list(texts)

    return tokenizer(
        batch,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

# Encode the train and test texts with the same tokenizer and default max_length
train_tokens, test_tokens = (
    tokenize(split_texts, tokenizer) for split_texts in (X_train, X_test)
)


In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer output with class labels.

    Args:
        tokens: Mapping from field name (e.g. ``'input_ids'``) to an indexable
            container holding one entry per example.
        labels: Indexable sequence of integer class labels, one per example.
    """

    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every token field for example ``idx`` plus a ``'labels'`` tensor."""
        item = {key: val[idx] for key, val in self.tokens.items()}
        # Force int64: nn.CrossEntropyLoss expects int64 class indices, and the
        # label column may be stored with a narrower integer dtype (e.g. int32).
        item['labels'] = torch.as_tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap each split in a TextDataset; labels are handed over as plain arrays
# (requires train_tokens, test_tokens, y_train and y_test from earlier cells)
train_dataset = TextDataset(tokens=train_tokens, labels=y_train.values)
test_dataset = TextDataset(tokens=test_tokens, labels=y_test.values)

# Batches of 32; only the training loader shuffles
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32)


In [18]:
class TextCNN(nn.Module):
    """Convolutional text classifier: embedding -> parallel convs -> max-over-time -> linear.

    Args:
        vocab_size: Number of rows in the embedding table. Ignored when
            ``pretrained_embeddings`` is given.
        embed_dim: Width of each embedding vector. Ignored when
            ``pretrained_embeddings`` is given; the width of that matrix is
            used instead so the convolution kernels always match the embeddings.
        num_filters: Number of output channels of every convolution.
        filter_sizes: Iterable of kernel heights (tokens spanned by a filter).
        num_classes: Number of output logits.
        pretrained_embeddings: Optional 2-D float tensor used to initialise
            the embedding table.
        freeze_embeddings: Whether the pretrained table is kept fixed during
            training. Only used together with ``pretrained_embeddings``.
    """

    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, num_classes, pretrained_embeddings=None, freeze_embeddings=True):
        super().__init__()
        if pretrained_embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=freeze_embeddings)
        else:
            self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Take the kernel width from the layer itself. If the caller's embed_dim
        # disagreed with the pretrained matrix, the convolutions would not span
        # the full embedding and forward() would fail.
        kernel_width = self.embedding.embedding_dim

        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (size, kernel_width)) for size in filter_sizes
        ])

        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)

    def forward(self, x):
        """Map a (batch_size, seq_length) tensor of token ids to (batch_size, num_classes) logits.

        ``seq_length`` must be at least as large as the biggest filter size.
        """
        x = self.embedding(x)  # (batch_size, seq_length, embed_dim)
        x = x.unsqueeze(1)  # (batch_size, 1, seq_length, embed_dim): add the channel axis

        # Each kernel spans the whole embedding width, so the last axis collapses
        # to size 1 and is squeezed away: (batch_size, num_filters, seq_length - size + 1)
        conv_outputs = [F.relu(conv(x)).squeeze(3) for conv in self.convs]

        # Max pooling over time: keep the strongest response of every filter
        pooled_outputs = [
            F.max_pool1d(conv_output, conv_output.size(2)).squeeze(2)
            for conv_output in conv_outputs
        ]

        # Concatenate the pooled features of all filter sizes
        x = torch.cat(pooled_outputs, 1)

        # Fully connected layer producing the class logits
        x = self.fc(x)

        return x

# NOTE: if this line raises "BadZipFile: File is not a zip file", the cached
# GloVe archive is most likely a truncated download. Delete it from the
# torchtext cache directory (default: .vector_cache) and re-run the cell.
glove = GloVe(name='6B', dim=300)
embed_dim = glove.vectors.size(1)  # width of the loaded GloVe vectors

# The data loaders yield ids produced by the BERT tokenizer, so the embedding
# table must be indexed by the BERT vocabulary. Passing glove.vectors directly
# would look every id up in GloVe's own, unrelated word order.
vocab_size = len(tokenizer)

# Start from small random vectors, then copy the GloVe vector of every BERT
# token that GloVe also knows. Sub-word pieces and special tokens usually have
# no GloVe entry and keep their random initialisation.
embedding_matrix = torch.empty(vocab_size, embed_dim).uniform_(-0.25, 0.25)
for token, token_id in tokenizer.get_vocab().items():
    glove_index = glove.stoi.get(token)
    if glove_index is not None:
        embedding_matrix[token_id] = glove.vectors[glove_index]

# Padding carries no information, so its row starts at zero.
if tokenizer.pad_token_id is not None:
    embedding_matrix[tokenizer.pad_token_id] = 0.0

num_filters = 100
filter_sizes = [3, 4, 5]
num_classes = 4  # Number of classes in the classification task

# freeze_embeddings=False lets the rows that had no GloVe match be learned.
model = TextCNN(
    vocab_size,
    embed_dim,
    num_filters,
    filter_sizes,
    num_classes,
    pretrained_embeddings=embedding_matrix,
    freeze_embeddings=False,
)

BadZipFile: File is not a zip file

In [None]:
# Cross-entropy over the class logits, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        inputs, labels = batch['input_ids'], batch['labels']

        # Clear the gradients of the previous step before back-propagating
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean batch loss of this epoch
    print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}')


In [None]:
# Measure classification accuracy over every batch of test_loader
model.eval()
correct = 0
total = 0
with torch.no_grad():  # inference only, so no autograd bookkeeping
    for batch in test_loader:
        inputs = batch['input_ids']
        labels = batch['labels']
        outputs = model(inputs)
        # Predicted class = index of the largest logit. No `.data` is needed
        # under no_grad, and not assigning to `_` leaves IPython's last-output
        # variable intact.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

if total == 0:
    # An empty loader would otherwise fail with ZeroDivisionError.
    print('Accuracy: n/a (test_loader produced no examples)')
else:
    print(f'Accuracy: {100 * correct / total}%')
