In [1]:
import csv
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import re
from tqdm import tqdm

from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

!pip install scikit-multilearn

from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression

from skmultilearn.problem_transform import LabelPowerset

from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Colab-only: mount Google Drive so the CSV files read below from
# /content/drive/MyDrive are visible on the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the training CSV from the mounted Drive and print its (rows, columns) shape.
train_data_path = "/content/drive/MyDrive/train.csv"
train_data_raw = pd.read_csv(train_data_path)
print(train_data_raw.shape)

(159571, 8)


In [4]:
# Model input: the raw comment text, one string per row.
train_sentences = train_data_raw['comment_text'].values
# Targets: the six label columns, kept in the order used for the submission header.
cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_labels = train_data_raw[cols].values

In [5]:
# Sanity check: the text array and the label matrix must have the same number of rows.
train_sentences.shape, train_labels.shape

((159571,), (159571, 6))

In [6]:
# Read the test CSV from the mounted Drive and print its (rows, columns) shape.
test_data_path = "/content/drive/MyDrive/test.csv"
test_data_raw = pd.read_csv(test_data_path)
print(test_data_raw.shape)

(153164, 2)


In [7]:
# Test inputs plus the row ids that are written as the first column of every submission file.
test_sentences = test_data_raw['comment_text'].values
test_id = test_data_raw['id'].values

In [8]:
# Sanity check: one id per test comment.
test_sentences.shape, test_id.shape

((153164,), (153164,))

In [9]:
def data_preprocessing(sentences):
    """Clean, tokenize, stop-word-filter and lemmatize a collection of comments.

    Cleaning applied to each comment, in order:
      1. lower-case it;
      2. replace anything that looks like an HTML tag (``<...>``) with a space;
      3. delete the characters ``? | ! ' " #``;
      4. replace the characters ``. , ) ( /`` with a space;
      5. strip the ends and turn newlines into spaces;
      6. delete every character that is not an ASCII letter or a space;
      7. collapse runs of whitespace into a single space.

    The cleaned text is tokenized with ``nltk.word_tokenize``, English stop
    words are removed, and every remaining token is lemmatized with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    sentences : iterable
        Raw comments. Entries that are not strings (for example the NaN that
        pandas produces for an empty CSV cell) are treated as empty comments
        instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    list of str
        One string per input comment. Lemmas are separated by single spaces
        and every lemma, including the last, is followed by a space; a comment
        with no surviving tokens yields ``""``.
    """
    # Compile each pattern once rather than once per comment. Raw strings
    # avoid the invalid-escape warning that the plain '\s+' literal triggers.
    html_tag = re.compile(r'<.*?>')
    # Inside a character class '|' is a literal pipe, not alternation, so both
    # classes below also match '|' itself.
    deleted_chars = re.compile(r'[?|!|\'|"|#]')
    spaced_chars = re.compile(r'[.|,|)|(|\|/]')
    non_letters = re.compile(r'[^A-Za-z ]')
    whitespace_run = re.compile(r'\s+')

    tokenized_sentences = []
    for sentence in tqdm(sentences, desc = 'Tokenize'):
        if not isinstance(sentence, str):
            sentence = ''

        sentence = sentence.lower()
        sentence = html_tag.sub(' ', sentence)
        sentence = deleted_chars.sub('', sentence)
        sentence = spaced_chars.sub(' ', sentence)
        sentence = sentence.strip()
        sentence = sentence.replace("\n", " ")
        sentence = non_letters.sub('', sentence)
        sentence = whitespace_run.sub(' ', sentence)

        tokenized_sentences.append(nltk.word_tokenize(sentence))

    # A set gives constant-time membership tests in the filter below.
    stop_words = set(stopwords.words('english'))

    removed_stop_words = []
    for sentence in tqdm(tokenized_sentences, desc = 'Stop_word_removal'):
        removed_stop_words.append(
            [token for token in sentence if token not in stop_words]
        )

    lemmatizer = WordNetLemmatizer()

    lemmatized_sentence = []
    for sentence in tqdm(removed_stop_words, desc = 'Lemmatization'):
        # join() instead of repeated '+='; the trailing space after each lemma
        # is kept so the returned strings are unchanged.
        lemmatized_sentence.append(
            "".join(lemmatizer.lemmatize(token) + " " for token in sentence)
        )

    return lemmatized_sentence

In [10]:
# Replace the raw training comments with their cleaned, lemmatized strings.
# Note: this rebinds train_sentences, so the raw text is only recoverable from train_data_raw.
train_sentences = data_preprocessing(train_sentences)

Tokenize: 100%|██████████| 159571/159571 [01:24<00:00, 1888.52it/s]
Stop_word_removal: 100%|██████████| 159571/159571 [00:02<00:00, 63943.52it/s]
Lemmatization: 100%|██████████| 159571/159571 [00:32<00:00, 4938.59it/s]


In [11]:
# Apply the same cleaning to the test comments.
# Note: this rebinds test_sentences, so the raw text is only recoverable from test_data_raw.
test_sentences = data_preprocessing(test_sentences)

Tokenize: 100%|██████████| 153164/153164 [00:59<00:00, 2556.91it/s]
Stop_word_removal: 100%|██████████| 153164/153164 [00:07<00:00, 19876.56it/s]
Lemmatization: 100%|██████████| 153164/153164 [00:30<00:00, 5024.48it/s]


In [12]:
# TF-IDF over word 1- to 3-grams with L2-normalised rows; the vocabulary is capped at 10,000 terms.
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2', max_features=10000)
# Fit on the training corpus only. fit() relearns the vocabulary and IDF weights
# from scratch on every call, so the previous second call, fit(test_sentences),
# threw away everything learned from the training data and built the feature
# space from the test set alone.
vectorizer.fit(train_sentences)

In [13]:
# Project both corpora into the fitted TF-IDF feature space.
x_train = vectorizer.transform(train_sentences)
y_train = train_labels

x_test = vectorizer.transform(test_sentences)

In [14]:
# x_train = pd.DataFrame(x_train)
# x_test = pd.DataFrame(x_test)

In [15]:
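# Disabled experiment. If re-enabled, the test set must be projected with the
# PCA fitted on the training set (transform), not re-fitted with fit_transform.
# pca = PCA(n_components=10000)
# x_train = pca.fit_transform(x_train)
# x_test = pca.transform(x_test)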

In [16]:
# Sanity check: train and test must share the same number of feature columns.
x_train.shape, y_train.shape, x_test.shape

((159571, 10000), (159571, 6), (153164, 10000))

In [17]:
def save_csv(file_name, y_pred, test_id) :
    """Write predictions to a submission-style CSV file.

    The file starts with the header
    ``id, toxic, severe_toxic, obscene, threat, insult, identity_hate`` and
    then has one row per sample: the sample id followed by its first six
    prediction values.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to create (overwritten if it already exists).
    y_pred : 2-D array-like or scipy sparse matrix
        Row ``i`` holds the predictions for ``test_id[i]``. Sparse matrices
        (anything exposing ``toarray()``, e.g. what scikit-multilearn
        estimators return from ``predict``) are densified first, because
        ``len()`` and ``y_pred[i][j]`` are not usable on them.
    test_id : sequence
        Sample ids, same length as ``y_pred``.

    Raises
    ------
    ValueError
        If ``y_pred`` and ``test_id`` differ in length.
    IndexError
        If a prediction row has fewer than six values.
    """
    header = ['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    n_labels = len(header) - 1

    if hasattr(y_pred, 'toarray'):
        y_pred = y_pred.toarray()

    # Validate before opening the file so a mismatch cannot leave a truncated CSV behind.
    if len(y_pred) != len(test_id):
        raise ValueError(
            "y_pred has %d rows but test_id has %d entries" % (len(y_pred), len(test_id))
        )

    with open(file_name, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        for i in tqdm(range(len(y_pred))):
            row = [test_id[i]] + [y_pred[i][j] for j in range(n_labels)]
            writer.writerow(row)

In [18]:
# Baseline: an independent logistic-regression model per label, trained and
# applied one label column at a time.
LogReg_pipeline = Pipeline(steps=[
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
])

per_label_predictions = []
for ind in range(6):
    LogReg_pipeline.fit(x_train, y_train[:, ind])
    y_pred = LogReg_pipeline.predict(x_test)
    per_label_predictions.append(y_pred)

# One column per label, in label order. The float cast keeps the dtype that
# concatenating onto an empty float array used to produce.
y_pred_one_vs_rest = np.column_stack(per_label_predictions).astype(float)

save_csv("one_vs_rest.csv", y_pred_one_vs_rest, test_id)

100%|██████████| 153164/153164 [00:02<00:00, 61136.50it/s]


In [None]:
# Binary relevance: one independent Gaussian naive Bayes model per label.
# NOTE(review): GaussianNB needs dense input, so the TF-IDF matrices are
# presumably densified inside scikit-multilearn. Check available memory before running.
classifier = BinaryRelevance(GaussianNB())

classifier.fit(x_train, y_train)

# predict() returns a scipy sparse matrix, which supports neither len() nor the
# y_pred[i][j] indexing that save_csv relies on, so convert it to a dense array first.
y_pred_br = classifier.predict(x_test).toarray()

save_csv("BinaryRelevance.csv", y_pred_br, test_id)

In [None]:
# Classifier chain: one logistic regression per label, each one also seeing the
# labels earlier in the chain as extra features.
classifier = ClassifierChain(LogisticRegression())

classifier.fit(x_train, y_train)

# predict() returns a scipy sparse matrix, which supports neither len() nor the
# y_pred[i][j] indexing that save_csv relies on, so convert it to a dense array first.
y_pred_classifier_chain = classifier.predict(x_test).toarray()

save_csv("classifier_chain.csv", y_pred_classifier_chain, test_id)

In [None]:
# Label powerset: treat every distinct label combination as one class of a
# single multi-class logistic regression.
classifier = LabelPowerset(LogisticRegression())

classifier.fit(x_train, y_train)

# predict() returns a scipy sparse matrix, which supports neither len() nor the
# y_pred[i][j] indexing that save_csv relies on, so convert it to a dense array first.
y_pred_label_powerset = classifier.predict(x_test).toarray()

save_csv("LabelPowerset.csv", y_pred_label_powerset, test_id)

In [None]:
# Adapted algorithm: multi-label k-nearest-neighbours with k = 10.
classifier_new = MLkNN(k=10)

# Keep the data sparse. Calling .toarray() on the full TF-IDF matrices would
# allocate a dense (n_samples x n_features) float array for train and for test,
# and rebinding x_train / y_train / x_test would change what every other cell
# sees on a re-run. Local LIL copies avoid both problems.
x_train_lil = lil_matrix(x_train)
y_train_lil = lil_matrix(y_train)
x_test_lil = lil_matrix(x_test)

classifier_new.fit(x_train_lil, y_train_lil)

# predict() returns a scipy sparse matrix, which supports neither len() nor the
# y_pred[i][j] indexing that save_csv relies on, so convert it to a dense array first.
y_pred_adapted_algo = classifier_new.predict(x_test_lil).toarray()

save_csv("Adapted_Algo.csv", y_pred_adapted_algo, test_id)