In [1]:
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import spacy
import numpy as np
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class TextDataset(Dataset):
    def __init__(self, csv_path):
        self.df = pd.read_csv(csv_path)
        self.df.dropna(inplace=True, how='any')
        self.vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 1))
        self.X = self.vectorizer.fit_transform(self.df.iloc[:,0])
        self.y = np.round(np.array(self.df.iloc[:, 1]), )
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X,
                                                                                self.y,
                                                                                test_size=0.2,
                                                                                shuffle=True,
                                                                                # random_state=42
                                                                               )
    
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        return self.X[0], self.y[0]
    
    def transform(self, X):
        return self.vectorizer.transform(X)

    def get_train_data(self):
        return self.X_train, self.y_train
    
    def get_test_data(self):
        return self.X_test, self.y_test

    def preprocess(self, x):
        return ''.join([word for word in x.split(' ') if word not in stopwords.words()])

td = TextDataset('/kaggle/input/toxic-comment-detection-multilingual-extended/archive/russian/labeled.csv')
# td = TextDataset('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train-processed-seqlen128.csv')
print(td[0])

(<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6 stored elements and shape (1, 68423)>, 1.0)


In [2]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
X, y = td.get_train_data()
model.fit(X, y)

In [3]:
from sklearn.metrics import roc_auc_score

X, y = td.get_test_data()
y_pred = model.predict(X)
roc_auc_score(y, y_pred)

0.7400970142665536