In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [49]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
class DataPreprocessing():
    def __init__(self, train_data_path, test_data_path, text_col, label_col):
        self.train_data_path = train_data_path
        self.test_data_path = test_data_path
        self.text_col = text_col
        self.label_col = label_col
        self.train_label = None
        self.test_label = None
        
    def read_data(self):
        self.train_data = pd.read_csv(self.train_data_path)
        self.train_label = self.train_data['label'].values
        
        self.test_data = pd.read_csv(self.test_data_path)
        self.test_label = self.test_data['label'].values
        
    def drop_id_column(self):
        if 'id' in self.train_data.columns:
            self.train_data.drop(columns=['id'], inplace=True)
        if 'id' in self.test_data.columns:
            self.test_data.drop(columns=['id'], inplace=True)

    def replace_turkish_chars(self, text):
        turkish_chars = {
            'ç': 'c',
            'Ç': 'C',
            'ğ': 'g',
            'Ğ': 'G',
            'ı': 'i',
            'İ': 'I',
            'ö': 'o',
            'Ö': 'O',
            'ş': 's',
            'Ş': 'S',
            'ü': 'u',
            'Ü': 'U'
        }
        for turkish_char, english_char in turkish_chars.items():
            text = text.replace(turkish_char, english_char)
        return text
        
    def clean_tweet(self, text):
        # Remove all URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove all mentions (@User) and Hashtags (#hashtag) with text
        text = re.sub(r'@\w+', '', text)  # Remove words that start with the '@' symbol.
        text = re.sub(r'#\w+', '', text)  # Remove words that start with the '#' symbol.
        # Unnecessary punctuation marks.
        text = re.sub(r'[^a-zA-Z0-9çÇğĞıİöÖşŞüÜ\s.,;:!\'^\?]', '', text)
        # Remove extra space.
        text = re.sub(r'\s+', ' ', text).strip()
        # Convert lowercase.
        text = text.lower()
        # Replace Turkish Characters.
        text = self.replace_turkish_chars(text)
        return text
        
    def apply_clean_tweet(self):
        """
        # Remove NaN values
        self.train_data.dropna(subset=[self.text_col], inplace=True)
        self.test_data.dropna(subset=[self.text_col], inplace=True)
        """
        self.train_data['cleaned_text'] = self.train_data[self.text_col].apply(self.clean_tweet)
        self.test_data['cleaned_text'] = self.test_data[self.text_col].apply(self.clean_tweet)

    def tokenization(self):
        self.train_data['token_text'] = self.train_data['cleaned_text'].apply(lambda x: word_tokenize(x))
        self.test_data['token_text'] = self.test_data['cleaned_text'].apply(lambda x: word_tokenize(x))
    
    def remove_stopwords(self):
        stopWords = set(stopwords.words('turkish'))
        self.train_data['tokens_no_stopwords'] = self.train_data['token_text'].apply(lambda x: [word for word in x if word.lower() not in stopWords])
        self.test_data['tokens_no_stopwords'] = self.test_data['token_text'].apply(lambda x: [word for word in x if word.lower() not in stopWords])

    def join_column(self):
        self.train_data['joined_token_no_stopwords'] = self.train_data['tokens_no_stopwords'].apply(lambda x: ' '.join(x))
        self.test_data['joined_token_no_stopwords'] = self.test_data['tokens_no_stopwords'].apply(lambda x: ' '.join(x))

    def tfidf_feature_extraction(self):
        self.join_column()
        tfidf_vectorizer = TfidfVectorizer(max_features=5000)
        X_tfidf = tfidf_vectorizer.fit_transform(self.train_data['joined_token_no_stopwords'])
        # Convert TF-IDF matrix to numpy array.
        return X_tfidf.toarray()
        
    def exec(self):    
        self.read_data()
        self.drop_id_column()
        self.apply_clean_tweet()
        self.tokenization()
        self.remove_stopwords()
        return self.tfidf_feature_extraction(), self.test_data


In [50]:
from sklearn.model_selection import train_test_split

dataPreprocessing = DataPreprocessing('dataset/train.csv', 'dataset/test.csv', 'text', 'label')

train_data, test_data = dataPreprocessing.exec()

X = train_data
y = dataPreprocessing.train_label

# Veriyi eğitim ve test setlerine ayır
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# PyTorch Tensor'larına dönüştür
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

print("Eğitim verisi boyutu:", X_train_tensor.shape)

Eğitim verisi boyutu: torch.Size([33918, 5000])


In [51]:
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, learning_rate=0.001):
        super(SimpleNN, self).__init__()
        # Define layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
        # Define loss function and optimizer
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
    
    def forward(self, x):
        # Forward pass
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out
    
    def fit(self, X_train, y_train, epoch_size=100):
        # Set the mode to train
        self.train()
        
        # Training Loop
        for epoch in range(epoch_size):
            # Forward pass
            outputs = self(X_train)
            loss = self.criterion(outputs, y_train)
            
            # Backpropagation) and Optimizer
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            
            # Print the loss every 10 epochs.
            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{epoch_size}], Loss: {loss.item():.4f}')
    
    def evaluate(self, X_test, y_test):
        # Set the mode to evaluation
        # Equivalent to model.train(False)
        self.eval()
        
        # Test verisiyle tahmin yapma
        # Make prediction with test data
        with torch.no_grad():
            test_outputs = self(X_test)
            _, predicted = torch.max(test_outputs, 1)
            accuracy = (predicted == y_test).float().mean()
            print(f"Accuracy: {accuracy:.4f}")
        return accuracy
        
    def download(self, PATH):
        # torch.save(model.state_dict(), PATH)
        torch.save(self.state_dict(), PATH)

In [52]:
# Model Parameters
input_dim = X_train_tensor.shape[1]  # Input Layer Dimension
hidden_dim = 64  # Hidden Layer Dimension
output_dim = len(set(y))  # Output Layer Length (or Label Size)

# Define Model
model = SimpleNN(input_dim, hidden_dim, output_dim)

# Train Model
model.fit(X_train_tensor, y_train_tensor, epoch_size=100)

# Evaluate Model
model.evaluate(X_test_tensor, y_test_tensor)

Epoch [10/100], Loss: 0.6715
Epoch [20/100], Loss: 0.6341
Epoch [30/100], Loss: 0.5880
Epoch [40/100], Loss: 0.5364
Epoch [50/100], Loss: 0.4846
Epoch [60/100], Loss: 0.4380
Epoch [70/100], Loss: 0.3994
Epoch [80/100], Loss: 0.3689
Epoch [90/100], Loss: 0.3449
Epoch [100/100], Loss: 0.3257
Accuracy: 0.8117


tensor(0.8117)

In [58]:
model.download("models/twitter_hatefull_speech_detection_model.pth")