In [None]:
import pandas as pd
from pathlib import Path

In [None]:
import re
import shutil
from collections import Counter

import numpy as np
import pandas as pd
from IPython.display import display
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import ClassifierChain
from tqdm import tqdm

# Directory the zipped competition CSVs are read from. Paths are built by plain
# string concatenation (DATA_DIR + filename), so the trailing slash is required.
DATA_DIR = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/"
# Directory the archives are extracted into and the CSVs are read back from;
# also concatenated with file names, so it must end with a slash as well.
OUTPUT_DIR = "/kaggle/working/"


In [None]:
def unpack_zipfile(filename):
    """Extract a zip archive located in DATA_DIR into OUTPUT_DIR.

    This is a best-effort helper: any exception raised while building the
    path or unpacking is printed instead of propagated, and a confirmation
    message is printed when extraction succeeds.

    Args:
        filename: Archive file name, appended to DATA_DIR by plain string
            concatenation.
    """
    try:
        # Built inside the ``try`` so a bad ``filename`` is reported, not raised.
        archive_path = DATA_DIR + filename
        shutil.unpack_archive(archive_path, OUTPUT_DIR, "zip")
    except Exception as error:
        print(error)
        return

    print(f"Archive file '{filename}' has been unpacked successfully.")


In [None]:
# Extract every competition archive; failures are printed by unpack_zipfile.
for archive_name in ("train.csv.zip", "test.csv.zip", "test_labels.csv.zip"):
    unpack_zipfile(filename=archive_name)


In [None]:
# Load the extracted CSVs from OUTPUT_DIR (train first, then test).
train_df, test_df = (
    pd.read_csv(OUTPUT_DIR + csv_name) for csv_name in ("train.csv", "test.csv")
)


In [None]:
# Preview the first rows of the training frame.
train_df.head()


In [None]:
# Preview the first rows of the test frame.
test_df.head()


In [None]:
# Seed the working column with a lower-cased copy of the raw comment text;
# every later cleaning step rewrites this column in place.
for frame in (train_df, test_df):
    frame["comment_text_preprocessed"] = frame["comment_text"].str.lower()


In [None]:
# Raw vs. preprocessed text: five random rows from each frame.
cols = ["comment_text", "comment_text_preprocessed"]
display(train_df[cols].sample(5), test_df[cols].sample(5))


In [None]:
# NLTK's English stopword list, extended with a few extra contractions.
eng_stopwords = set(nltk_stopwords.words('english')) | {"i'm", "that's", "can't"}
eng_stopwords


In [None]:
def clear_stopwords(comment_text, stopwords=eng_stopwords):
    """Drop every whitespace-separated token that is contained in ``stopwords``.

    The input is coerced with ``str()``, split on whitespace, compared
    token-by-token (exact match, no case folding) and the surviving tokens
    are re-joined with single spaces.
    """
    kept_tokens = filter(
        lambda token: token not in stopwords,
        str(comment_text).split(),
    )
    return " ".join(kept_tokens)


In [None]:
# Example of function usage: one training comment before/after stopword removal
train_text_2 = train_df["comment_text_preprocessed"].iloc[2]

print(f"Inp:\n\n{train_text_2}\n")
print(f"Out:\n\n{clear_stopwords(train_text_2)}")


In [None]:
# Strip stopwords from the working column of both frames (in place).
for frame in (train_df, test_df):
    frame["comment_text_preprocessed"] = (
        frame["comment_text_preprocessed"].apply(clear_stopwords)
    )


In [None]:
# Spot-check five random rows per frame after stopword removal.
display(train_df[cols].sample(5), test_df[cols].sample(5))


In [None]:
# Token frequencies over the preprocessed training comments.
word_counter = Counter()
for comment_text in train_df["comment_text_preprocessed"].values:
    word_counter.update(comment_text.split())

word_counter.most_common(10)


In [None]:
# The ten most frequent training tokens, kept as a set for membership tests.
freq_words = {word for word, _ in word_counter.most_common(10)}
freq_words


In [None]:
def clear_freqwords(comment_text, freqwords=freq_words):
    """Removes frequent words from the commentary text.

    Args:
        comment_text: Text to clean; coerced with ``str()`` and split on
            whitespace.
        freqwords: Collection of tokens to drop. Defaults to the ``freq_words``
            set (the top-10 most frequent training tokens), bound when this
            function is defined.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Filter against the ``freqwords`` argument. The previous body tested the
    # global ``freq_words`` instead, so a caller-supplied collection was
    # silently ignored.
    comment_text_cleared = [word for word in str(comment_text).split()
                            if word not in freqwords]

    return " ".join(comment_text_cleared)


In [None]:
# Drop the most frequent tokens from the working column of both frames.
for frame in (train_df, test_df):
    frame["comment_text_preprocessed"] = (
        frame["comment_text_preprocessed"].apply(clear_freqwords)
    )


In [None]:
# Spot-check five random rows per frame after frequent-word removal.
display(train_df[cols].sample(5), test_df[cols].sample(5))


In [None]:
# The `rare_words_num` least frequent tokens, taken from the tail of the
# frequency ranking held in `word_counter`.
rare_words_num = 10
ranked_words = word_counter.most_common()
rare_words = {word for word, _ in ranked_words[:-rare_words_num-1:-1]}
rare_words


In [None]:
def clear_rarewords(comment_text, rarewords=rare_words):
    """Removes rare words from the commentary text.

    Args:
        comment_text: Text to clean; coerced with ``str()`` and split on
            whitespace.
        rarewords: Collection of tokens to drop. Defaults to the ``rare_words``
            set (the top-10 rarest training tokens), bound when this function
            is defined.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Filter against the ``rarewords`` argument. The previous body tested the
    # global ``rare_words`` instead, so a caller-supplied collection was
    # silently ignored.
    comment_text_cleared = [word for word in str(comment_text).split()
                            if word not in rarewords]

    return " ".join(comment_text_cleared)


In [None]:
# Drop the rarest tokens from the working column of both frames.
for frame in (train_df, test_df):
    frame["comment_text_preprocessed"] = (
        frame["comment_text_preprocessed"].apply(clear_rarewords)
    )


In [None]:
# Spot-check five random rows per frame after rare-word removal.
display(train_df[cols].sample(5), test_df[cols].sample(5))


In [None]:
def clear_urls(comment_text):
    """Strip URLs from the comment text.

    Any run of non-whitespace characters that starts with ``http://``,
    ``https://`` or ``www.`` is deleted; the whitespace around it is left
    untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', "", comment_text)


In [None]:
# Example: the 900th comment from the end, before/after URL removal.
train_text_900 = train_df["comment_text_preprocessed"].iloc[-900]

print(f"Inp:\n\n{train_text_900}\n")
print(f"Out:\n\n{clear_urls(train_text_900)}")


In [None]:
# Remove URLs from the working column of both frames.
for frame in (train_df, test_df):
    frame["comment_text_preprocessed"] = (
        frame["comment_text_preprocessed"].apply(clear_urls)
    )


In [None]:
# Spot-check five random rows per frame after URL removal.
display(train_df[cols].sample(5), test_df[cols].sample(5))


In [None]:
# One or more consecutive ASCII letters.
regex = re.compile(r"[a-zA-Z]+")


def leave_words_only(comment_text, regex=regex):
    """Keep only the parts of the comment matched by ``regex``.

    All matches (by default, runs of ASCII letters) are collected in order
    and joined with single spaces, so digits, punctuation and any other
    characters act as separators and disappear from the result.
    """
    alphabetic_runs = regex.findall(comment_text)
    return " ".join(alphabetic_runs)


In [None]:
# Example: the last training comment, before/after keeping letters only.
train_text = train_df["comment_text_preprocessed"].iloc[-1]

print(f"Inp:\n\n{train_text}\n")
print(f"Out:\n\n{leave_words_only(train_text)}")


In [None]:
# Keep only alphabetic runs in the working column of both frames.
for frame in (train_df, test_df):
    frame["comment_text_preprocessed"] = (
        frame["comment_text_preprocessed"].apply(leave_words_only)
    )


In [None]:
# Spot-check five random rows per frame after non-letter removal.
display(train_df[cols].sample(5), test_df[cols].sample(5))


In [None]:
# Stopwords that contain an apostrophe.
other_eng_stopwords = list(filter(lambda word: "'" in word, eng_stopwords))
other_eng_stopwords


In [None]:
# Build apostrophe-free spellings of those stopwords (the apostrophe is simply
# deleted) so that stopwords typed without an apostrophe can be removed too.
other_eng_stopwords = [
    stopword.replace("'", "") for stopword in other_eng_stopwords
]
other_eng_stopwords


In [None]:
# Recount token frequencies on the cleaned training comments and inspect the
# 100 most common tokens.
word_counter = Counter()
for comment_text in train_df["comment_text_preprocessed"].values:
    word_counter.update(comment_text.split())

word_counter.most_common(100)


In [None]:
# Adding additional stopwords. The set is mutated in place (not rebound) so
# that clear_stopwords' default argument, which refers to this same set
# object, sees the additions as well.
extra_stopwords = [
    "utc", "eg", "jpg", "didnt", "th",
    "oh", "im", "cant", "wp", "hi",
]
eng_stopwords.update(extra_stopwords, other_eng_stopwords)
eng_stopwords


In [None]:
# Second stopword pass, now with the extended stopword set passed explicitly.
for frame in (train_df, test_df):
    frame["comment_text_preprocessed"] = (
        frame["comment_text_preprocessed"].apply(
            clear_stopwords, stopwords=eng_stopwords,
        )
    )


In [None]:
# Second frequent-word pass over the working column of both frames.
for frame in (train_df, test_df):
    frame["comment_text_preprocessed"] = (
        frame["comment_text_preprocessed"].apply(clear_freqwords)
    )


In [None]:
# Spot-check five random rows per frame after the final cleaning passes.
display(train_df[cols].sample(5), test_df[cols].sample(5))


In [None]:
# Label columns are selected by position: everything after the first two
# columns and before the last one.
# NOTE(review): assumes the first two columns are non-label fields and that
# "comment_text_preprocessed" is still the last column — confirm.
target_cols = train_df.columns[2:-1]
target_train = train_df[target_cols].to_numpy()
target_train[:5]


In [None]:
# Training documents as a NumPy array of unicode strings.
train_comments = train_df["comment_text_preprocessed"]
corpus_train = train_comments.values.astype("U")
corpus_train[:5]


In [None]:
# Test documents as a NumPy array of unicode strings.
test_comments = test_df["comment_text_preprocessed"]
corpus_test = test_comments.values.astype("U")
corpus_test[:5]


In [None]:
# TF-IDF settings: vocabulary capped at 1700 terms, document-frequency bounds
# given as proportions (min 0.0011, max 0.35), L2-normalised rows.
tfidf_params = {
    "max_features": 1700,
    "min_df": 0.0011,
    "max_df": 0.35,
    "norm": "l2",
}
vectorizer = TfidfVectorizer(**tfidf_params)


In [None]:
# Fit the vectorizer on the training corpus and vectorise it in one step.
features_train = vectorizer.fit_transform(corpus_train)
features_train.shape


In [None]:
# Vectorise the test corpus with the already-fitted vectorizer (no refit).
features_test = vectorizer.transform(corpus_test)
features_test.shape


In [None]:
# Base classifier shared by every chain: class-balanced, L2-penalised logistic
# regression with a small C and a generous iteration cap.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version before upgrading.
logreg_params = {
    "class_weight": "balanced",
    "max_iter": 10000,
    "multi_class": "multinomial",
    "C": 0.009,
    "penalty": "l2",
    "n_jobs": -1,
}
base_estimator = LogisticRegression(**logreg_params)


In [None]:
# Ten classifier chains over the same base estimator; each chain uses a random
# label order, seeded with its position in the list.
chains = []
for seed in range(10):
    chains.append(
        ClassifierChain(
            base_estimator=base_estimator,
            order="random",
            random_state=seed,
        )
    )

# Fit every chain on the full training set, with a progress bar.
for chain in tqdm(chains):
    chain.fit(features_train, target_train)
print()


In [None]:
# Stack each chain's test probabilities and average them across chains.
per_chain_proba = [chain.predict_proba(features_test) for chain in chains]
predictions = np.array(per_chain_proba)
proba_predictions_test = predictions.mean(axis=0)
proba_predictions_test


In [None]:
# Averaged probabilities indexed by the test ids, one column per label; the
# index is then turned back into a regular `id` column.
submission = pd.DataFrame(
    data=proba_predictions_test,
    index=test_df["id"],
    columns=target_cols,
)
submission = submission.reset_index()

submission.head()


In [None]:
# Summary of the submission frame's columns, dtypes and non-null counts.
submission.info()


In [None]:
# Saving the submission. The path is relative, so the file is written to the
# current working directory; the frame's index is not written.
submission.to_csv('submission.csv', index=False)

# Displaying the success message (only reached if to_csv did not raise)
print("The submission has been successfully saved.")
