In [1]:
import pandas as pd
from pathlib import Path

In [2]:
import re
import shutil
from collections import Counter

import numpy as np
import pandas as pd
from IPython.display import display
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import ClassifierChain
from tqdm import tqdm

DATA_DIR = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/"
OUTPUT_DIR = "/kaggle/working/"


In [3]:
def unpack_zipfile(filename):
    """Unpacks zip-file by name from DATA_DIR to OUTPUT_DIR."""
    try:
        shutil.unpack_archive(
            filename=DATA_DIR + filename,
            extract_dir=OUTPUT_DIR,
            format="zip",
        )
    except Exception as e:
        print(e)
    else:
        print(f"Archive file '{filename}' has been unpacked successfully.")


In [4]:
unpack_zipfile(filename="train.csv.zip")
unpack_zipfile(filename="test.csv.zip")
unpack_zipfile(filename="test_labels.csv.zip")


Archive file 'train.csv.zip' has been unpacked successfully.
Archive file 'test.csv.zip' has been unpacked successfully.
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip is not a zip file


In [5]:
train_df = pd.read_csv(OUTPUT_DIR + "train.csv")
test_df = pd.read_csv(OUTPUT_DIR + "test.csv")


In [6]:
train_df.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
test_df.head()


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [8]:
for df in (train_df, test_df):
    df["comment_text_preprocessed"] = df["comment_text"].str.lower()


In [9]:
cols = ["comment_text", "comment_text_preprocessed"]
display(train_df[cols].sample(5))
display(test_df[cols].sample(5))


Unnamed: 0,comment_text,comment_text_preprocessed
136433,REDIRECT Talk:Small Planet Airlines,redirect talk:small planet airlines
143743,"Sorry for the late reply, they are all off now.","sorry for the late reply, they are all off now."
136994,Decade montage images \n\nWHAT THE FUCK DID YO...,decade montage images \n\nwhat the fuck did yo...
89824,"Was your rebuttal completed? Then, let's have ...","was your rebuttal completed? then, let's have ..."
53817,"You added no source, and no value to the page,...","you added no source, and no value to the page,..."


Unnamed: 0,comment_text,comment_text_preprocessed
25869,Page was vandalized ! Freedom Front Plus was c...,page was vandalized ! freedom front plus was c...
135184,::::::::You are so wrong. I blame the aristoc...,::::::::you are so wrong. i blame the aristoc...
72853,==reassurence for those who think that all pos...,==reassurence for those who think that all pos...
89130,""" \n *:Ditto. It's the disenchantment of the W...",""" \n *:ditto. it's the disenchantment of the w..."
121936,"==Go Ahead== \n\n Ban me, you asshole. As if y...","==go ahead== \n\n ban me, you asshole. as if y..."


In [10]:
eng_stopwords = set(nltk_stopwords.words('english'))
# Adding new stopwords not initially included
eng_stopwords.update(["i'm", "that's", "can't"])
eng_stopwords


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 "can't",
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'o

In [11]:
def clear_stopwords(comment_text, stopwords=eng_stopwords):
    """Removes stopwords from the commentary text."""
    # Checking if a word is a stopword
    comment_text_cleared = [word for word in str(comment_text).split() 
                              if word not in stopwords]
    
    return " ".join(comment_text_cleared)


In [12]:
# Example of function usage
train_text_2 = train_df["comment_text_preprocessed"].iloc[2]

print("Inp:\n\n{}\n".format(train_text_2))
print("Out:\n\n{}".format(clear_stopwords(train_text_2)))


Inp:

hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.

Out:

hey man, really trying edit war. guy constantly removing relevant information talking edits instead talk page. seems care formatting actual info.


In [13]:
for df in (train_df, test_df):
    df["comment_text_preprocessed"] = df["comment_text_preprocessed"] \
    .apply(
        lambda comment_text: clear_stopwords(comment_text)
    )


In [14]:
display(train_df[cols].sample(5))
display(test_df[cols].sample(5))


Unnamed: 0,comment_text,comment_text_preprocessed
94318,A big thank you\nHello Jeandré. I just wanted ...,big thank hello jeandré. wanted say big thank ...
55229,"""\n{| style=""""background-color:#F5FFFA; paddin...",""" {| style=""""background-color:#f5fffa; padding..."
155108,firstable this is a myth!\nsecondable no sourc...,firstable myth! secondable sources almost year...
104186,""":Instead of asking me personal questions abou...",""":instead asking personal questions believe, q..."
78727,And why the fuck should i care? \n\n67.165.71.214,fuck care? 67.165.71.214


Unnamed: 0,comment_text,comment_text_preprocessed
30669,""": Got a Policy page link for that or is it ju...",""": got policy page link convention? even chang..."
65113,""" \n\n Please stop. If you continue to vandali...",""" please stop. continue vandalize pages, virgi..."
76263,:The simple jews are lice ridden scum slur tha...,:the simple jews lice ridden scum slur origina...
52987,""" \n\n :You're welcome, congratulations, and g...",""" :you're welcome, congratulations, good luck!..."
151753,Can we have a history section ? When was it bu...,history section ? built ? …


In [15]:
word_counter = Counter()
for comment_text in train_df["comment_text_preprocessed"].values:
    for word in comment_text.split():
        word_counter[word] += 1

word_counter.most_common(10)


[('"', 81755),
 ('article', 39018),
 ('would', 29058),
 ('page', 28731),
 ('please', 27513),
 ('like', 26338),
 ('one', 24636),
 ('-', 23412),
 ('talk', 22839),
 ('wikipedia', 22438)]

In [16]:
freq_words = set([word for (word, word_count) in word_counter.most_common(10)])
freq_words


{'"',
 '-',
 'article',
 'like',
 'one',
 'page',
 'please',
 'talk',
 'wikipedia',
 'would'}

In [17]:
def clear_freqwords(comment_text, freqwords=freq_words):
    """Removes top-10 frequent words."""
    
    comment_text_cleared = [word for word in str(comment_text).split() 
                              if word not in freq_words]
    
    return " ".join(comment_text_cleared)


In [18]:
for df in (train_df, test_df):
    df["comment_text_preprocessed"] = df["comment_text_preprocessed"] \
    .apply(
        lambda comment_text: clear_freqwords(comment_text)
    )


In [19]:
display(train_df[cols].sample(5))
display(test_df[cols].sample(5))


Unnamed: 0,comment_text,comment_text_preprocessed
114106,"""Actually, I became interested in editing this...","""actually, became interested editing short (am..."
129150,"""\n\nHow about you guys not use Greek authors ...","guys use greek authors sources, since use name..."
145597,"Ask, and ye shall .","ask, ye shall ."
58637,Edit count \n\nThere is a tool to do it for yo...,edit count tool here. 512 total. )
25154,why do you not reply \n\nhi i keep getting mes...,reply hi keep getting messages vandalism editi...


Unnamed: 0,comment_text,comment_text_preprocessed
36114,== fuck off == \n\n fuck off,== fuck == fuck
79038,== Note 2 == \n \n Stop harrasing me! you was ...,== note 2 == stop harrasing me! invated disucss.
147905,Haha! Nearly rolled off my chair seeing today'...,haha! nearly rolled chair seeing today's fa. h...
138352,""" \n \n * This is nonsense on a number of leve...",* nonsense number levels: 1) section added any...
105093,""" \n\n ==POV tone== \n Hi all, just wanted to ...","==pov tone== hi all, wanted say """"neutral unbi..."


In [20]:
rare_words_num = 10
rare_words = set([word for (word, word_count) 
                  in word_counter.most_common()[:-rare_words_num-1:-1]])
rare_words


{'""bragging""',
 '""luxury',
 '""luxury""',
 '""luxury"".',
 '(boastful',
 '(psc',
 'automakers',
 'ciu)',
 'superlatives)',
 'vehicle"".'}

In [21]:
def clear_rarewords(comment_text, rarewords=rare_words):
    """Removes top-10 rarest words."""
    
    comment_text_cleared = [word for word in str(comment_text).split() 
                              if word not in rare_words]
    
    return " ".join(comment_text_cleared)


In [22]:
for df in (train_df, test_df):
    df["comment_text_preprocessed"] = df["comment_text_preprocessed"] \
    .apply(
        lambda comment_text: clear_rarewords(comment_text)
    )


In [23]:
display(train_df[cols].sample(5))
display(test_df[cols].sample(5))


Unnamed: 0,comment_text,comment_text_preprocessed
63218,Around blacks \n\nNever relax! 86.181.0.14,around blacks never relax! 86.181.0.14
101815,dudes change it how it was before \nsome dude ...,"dudes change dude vandalized this, tried fix c..."
94150,Thanks for that; I was looking for a way of al...,thanks that; looking way alerting right people...
118121,"""::*Rockpocket, please don't lecture about hat...","""::*rockpocket, lecture hate websites. ran sou..."
67592,"""\n\nSeems like a given, but you never know wi...","seems given, never know wikipedia. /"


Unnamed: 0,comment_text,comment_text_preprocessed
85841,==Edits== \n The last extensive edit by an ano...,==edits== last extensive edit anon seems whack...
123248,so my BUTT smell like crap!!!!!!!!!!!!!!!!!!!!...,butt smell crap!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!...
48366,""" \n\n == personal attack (not by you) == \n\n...",== personal attack (not you) == suggest report...
10235,hey this is the baddest chick in the world!!!!...,hey baddest chick world!!!!!!!!!!!!!(wats up)
130524,I am Gary DeCarlo and I'm telling you and ever...,gary decarlo telling everyone else j. deleone ...


In [24]:
def clear_urls(comment_text):
    """Clears the comment text from URLs."""
    
    url_regex_pattern = re.compile(r'https?://\S+|www\.\S+')
    
    return url_regex_pattern.sub(r"", comment_text)


In [25]:
train_text_900 = train_df["comment_text_preprocessed"].iloc[-900]

print("Inp:\n\n{}\n".format(train_text_900))
print("Out:\n\n{}".format(clear_urls(train_text_900)))


Inp:

copyright problem removed prior content duplicated previously published sources. material copied from: http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2287120. copied closely paraphrased material rewritten removed must restored, unless duly released compatible license. (for information, see ""using copyrighted works others"" copyright holder material, ""donating copyrighted materials"" are.) legal reasons, cannot accept copyrighted text images borrowed web sites published material; additions deleted. contributors may use copyrighted publications source information, according fair use may copy sentences phrases, provided included quotation marks referenced properly. material may also rewritten, infringe copyright original plagiarize source. therefore paraphrased portions must provide source. see guideline non-free text properly implement limited quotations copyrighted text. takes copyright violations seriously, persistent violators blocked editing. appreciate contributions, mus

In [26]:
for df in (train_df, test_df):
    df["comment_text_preprocessed"] = df["comment_text_preprocessed"] \
    .apply(
        lambda comment_text: clear_urls(comment_text)
    )


In [27]:
display(train_df[cols].sample(5))
display(test_df[cols].sample(5))


Unnamed: 0,comment_text,comment_text_preprocessed
99344,Re: Creating a username \n\nThanks for your me...,"re: creating username thanks message, remain i..."
132642,"""\n\n Wikipedia:Requests for checkuser/Case/Na...",wikipedia:requests checkuser/case/najiimp doub...
114862,"""\n\n Redirect \n\nI dunno how to do dis but c...","redirect dunno dis someone redirect """"dino ext..."
24367,"""\n\n Your George Allen comments \n\nThe Wikim...",george allen comments wikimedia software sort ...
133834,"""\nRight, ok, I didn't understand about the UR...","right, ok, understand url. make worthwhile. su..."


Unnamed: 0,comment_text,comment_text_preprocessed
94599,"Answer: Social, Natural, Spiritual. It's not t...","answer: social, natural, spiritual. unclear."
59431,:Response given at Talk:Fuck for Forest#Notabi...,:response given talk:fuck forest#notability la...
126191,"As explained above, the Douglas Archives artic...","explained above, douglas archives also mine."
42322,"== M Special Unit == \n\n Hey, just letting yo...","== special unit == hey, letting know created s..."
107170,""" \n\n == 24 - OK, I'm getting bored now == \n...","== 24 ok, getting bored == push hairiest anal ..."


In [28]:
regex = re.compile(r"[a-zA-Z]+")

def leave_words_only(comment_text, regex=regex):
    """Removes non-word inclusions."""
    
    return " ".join(regex.findall(comment_text))


In [29]:
train_text = train_df["comment_text_preprocessed"].iloc[-1]

print("Inp:\n\n{}\n".format(train_text))
print("Out:\n\n{}".format(leave_words_only(train_text)))


Inp:

... really think understand. came idea bad right away. kind community goes ""you bad ideas"" go away, instead helping rewrite them.

Out:

really think understand came idea bad right away kind community goes you bad ideas go away instead helping rewrite them


In [30]:
for df in (train_df, test_df):
    df["comment_text_preprocessed"] = df["comment_text_preprocessed"] \
    .apply(
        lambda comment_text: leave_words_only(comment_text)
    )


In [31]:
display(train_df[cols].sample(5))
display(test_df[cols].sample(5))


Unnamed: 0,comment_text,comment_text_preprocessed
94814,Question for you.... \n\nIs this content verif...,question you content verifiable two seem know ...
137580,"I did, and I'll be sure to update it when IV c...",did sure update iv comes out miss turn
53371,"If you block me, I will just get another IP ad...",block me get another ip address ass
99288,"""\n\nA tag has been placed on Kevin wunderlich...",tag placed kevin wunderlich requesting speedil...
81240,"All right, I completely understand and won't t...",right completely understand try start creating...


Unnamed: 0,comment_text,comment_text_preprocessed
7737,:The page I found at SKOAR! was just a redirec...,the found skoar redirect digit magazine magazi...
131747,""" \n\n :To follow up what Beagel says above, a...",to follow beagel says above incorrect details ...
28587,"::::Twice as many as any other, is not primary...",twice many other primary use twice many views ...
98264,== The Dark Knight on imdb == \n\n The Dark Kn...,dark knight imdb dark knight imdb shawshank go...
1725,That's as maybe pal but it's still a massive s...,maybe pal still massive scam sunday sport lins...


In [32]:
other_eng_stopwords = [word for word in eng_stopwords if "'" in word]
other_eng_stopwords


["we've",
 "hadn't",
 "they'd",
 "it'd",
 "we'll",
 "shan't",
 "won't",
 "he'd",
 "i'll",
 "aren't",
 "i'm",
 "they've",
 "wouldn't",
 "didn't",
 "you've",
 "wasn't",
 "mightn't",
 "he's",
 "weren't",
 "you're",
 "she'd",
 "mustn't",
 "we'd",
 "it's",
 "they're",
 "doesn't",
 "they'll",
 "don't",
 "you'd",
 "it'll",
 "that's",
 "i'd",
 "shouldn't",
 "we're",
 "should've",
 "can't",
 "needn't",
 "haven't",
 "you'll",
 "isn't",
 "i've",
 "she'll",
 "hasn't",
 "that'll",
 "couldn't",
 "she's",
 "he'll"]

In [33]:
# Removing negation-stopwords initially written without apostrophe
other_eng_stopwords = [word.replace("'", "") for word in other_eng_stopwords]
other_eng_stopwords


['weve',
 'hadnt',
 'theyd',
 'itd',
 'well',
 'shant',
 'wont',
 'hed',
 'ill',
 'arent',
 'im',
 'theyve',
 'wouldnt',
 'didnt',
 'youve',
 'wasnt',
 'mightnt',
 'hes',
 'werent',
 'youre',
 'shed',
 'mustnt',
 'wed',
 'its',
 'theyre',
 'doesnt',
 'theyll',
 'dont',
 'youd',
 'itll',
 'thats',
 'id',
 'shouldnt',
 'were',
 'shouldve',
 'cant',
 'neednt',
 'havent',
 'youll',
 'isnt',
 'ive',
 'shell',
 'hasnt',
 'thatll',
 'couldnt',
 'shes',
 'hell']

In [34]:
word_counter = Counter()
for comment_text in train_df["comment_text_preprocessed"].values:
    for word in comment_text.split():
        word_counter[word] += 1

word_counter.most_common(100)


[('s', 48434),
 ('wikipedia', 23709),
 ('see', 21604),
 ('it', 21206),
 ('also', 20643),
 ('think', 20083),
 ('know', 19146),
 ('you', 19053),
 ('article', 18514),
 ('people', 18454),
 ('edit', 18284),
 ('page', 17689),
 ('use', 16701),
 ('articles', 16663),
 ('time', 15869),
 ('may', 15615),
 ('talk', 14531),
 ('user', 14176),
 ('thanks', 13912),
 ('even', 13467),
 ('get', 13416),
 ('make', 13005),
 ('good', 12822),
 ('well', 12439),
 ('information', 12170),
 ('could', 11969),
 ('want', 11614),
 ('deletion', 11494),
 ('sources', 11374),
 ('way', 11278),
 ('name', 11237),
 ('image', 11041),
 ('first', 10986),
 ('wp', 10922),
 ('help', 10737),
 ('pages', 10697),
 ('me', 10685),
 ('new', 10668),
 ('source', 10379),
 ('editing', 10364),
 ('go', 10325),
 ('need', 10181),
 ('section', 10167),
 ('say', 10131),
 ('here', 10096),
 ('fuck', 10085),
 ('edits', 9991),
 ('thank', 9906),
 ('made', 9682),
 ('many', 9589),
 ('much', 9484),
 ('i', 9481),
 ('this', 9379),
 ('used', 9212),
 ('really', 9

In [35]:
# Adding additional stopwords
eng_stopwords.update(
    [
        "utc", "eg", 
        "jpg", "didnt",
        "th", "oh", 
        "im", "cant", 
        "wp", "hi",
    ]
)
eng_stopwords.update(other_eng_stopwords)
eng_stopwords


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'arent',
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 "can't",
 'cant',
 'couldn',
 "couldn't",
 'couldnt',
 'd',
 'did',
 'didn',
 "didn't",
 'didnt',
 'do',
 'does',
 'doesn',
 "doesn't",
 'doesnt',
 'doing',
 'don',
 "don't",
 'dont',
 'down',
 'during',
 'each',
 'eg',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'hadnt',
 'has',
 'hasn',
 "hasn't",
 'hasnt',
 'have',
 'haven',
 "haven't",
 'havent',
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'hed',
 'hell',
 'her',
 'here',
 'hers',
 'herself',
 'hes',
 'hi',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'id',
 'if',
 'ill',
 'im',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'isnt',
 'it',
 "it'd",
 "it'll",
 "it's",
 'itd',
 'itll',
 'its',
 'itself',
 'ive',
 'jpg',
 'just'

In [36]:
for df in (train_df, test_df):
    df["comment_text_preprocessed"] = df["comment_text_preprocessed"] \
    .apply(
        lambda comment_text: clear_stopwords(
            comment_text, stopwords=eng_stopwords,
        )
    )


In [37]:
for df in (train_df, test_df):
    df["comment_text_preprocessed"] = df["comment_text_preprocessed"] \
    .apply(
        lambda comment_text: clear_freqwords(comment_text)
    )


In [38]:
display(train_df[cols].sample(5))
display(test_df[cols].sample(5))


Unnamed: 0,comment_text,comment_text_preprocessed
90062,further harassment from Jance/Jgwlaw\n\nJance ...,harassment jance jgwlaw jance reverting edits ...
80240,Edit reversion on 'Food for Fighters' \n\nIf y...,edit reversion food fighters notice keep rever...
143967,"""\n\n The attack? \n\nI did not attack what's ...",attack attack name synchrocat seems response e...
32969,"Yeah, the only more obvious case I can think o...",yeah obvious case think gay club
102937,"""\n\nOne the userbox for the University of Geo...",userbox university georgia way delete category...


Unnamed: 0,comment_text,comment_text_preprocessed
71192,""" \n :::::::::: Both the editor & the edits ar...",editor edits wrong spelled random sentence per...
66602,"REDIRECT Talk:Venice, Los Angeles",redirect venice los angeles
18504,== List of Croatian architects == \n\n There a...,list croatian architects tons names wonder get...
71630,"== Wikipedia:WikiCup/2011 signups == \n\n Hey,...",wikicup signups hey concerning change flag sco...
90686,"I'm glad to see some fixing up genre articles,...",glad see fixing genre articles terrible


In [39]:
target_cols = train_df.columns[2:-1]
target_train = train_df[target_cols].values
target_train[:5]


array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [40]:
corpus_train = train_df["comment_text_preprocessed"].values.astype("U")
corpus_train[:5]


array(['explanation edits made username hardcore metallica fan reverted vandalisms closure gas voted new york dolls fac remove template since retired',
       'aww matches background colour seemingly stuck thanks january',
       'hey man really trying edit war guy constantly removing relevant information talking edits instead seems care formatting actual info',
       'make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later else first preferences formatting style references want let know appears backlog articles review guess may delay reviewer turns listed relevant form good nominations transport',
       'sir hero chance remember'], dtype='<U5000')

In [41]:
corpus_test = test_df["comment_text_preprocessed"].values.astype("U")
corpus_test[:5]


array(['yo bitch ja rule succesful ever whats hating sad mofuckas bitch slap ur pethedic white faces get kiss ass guys sicken ja rule pride da music man diss shit nothin wrong bein tupac brother fuckin white boys get things right next time',
       'rfc title fine imo', 'sources zawe ashton lapland',
       'look back source information updated correct form guess source updated shall update information thank message',
       'anonymously edit articles'], dtype='<U5000')

In [42]:
vectorizer = TfidfVectorizer(
    max_features=1700,
    min_df=0.0011,
    max_df=0.35,
    norm="l2",
)


In [43]:
features_train = vectorizer.fit_transform(corpus_train)
features_train.shape


(159571, 1700)

In [44]:
features_test = vectorizer.transform(corpus_test)
features_test.shape


(153164, 1700)

In [45]:
base_estimator = LogisticRegression(
    class_weight="balanced",
    max_iter=10000,
    multi_class="multinomial",
    C=0.009,
    penalty="l2",
    n_jobs=-1,
)


In [46]:
chains = [
    ClassifierChain(
        base_estimator=base_estimator,
        order="random", 
        random_state=i,
    ) for i in range(10)
]

for i in tqdm(range(len(chains))):
    chains[i].fit(features_train, target_train)
print()


100%|██████████| 10/10 [01:10<00:00,  7.09s/it]







In [47]:
predictions = np.array([chain.predict_proba(features_test) 
                          for chain in chains])
proba_predictions_test = predictions.mean(axis=0)
proba_predictions_test


array([[0.99633577, 0.98063941, 0.99536222, 0.95426546, 0.98616716,
        0.96050172],
       [0.21292851, 0.04550916, 0.10967578, 0.15872348, 0.14370415,
        0.15423253],
       [0.16820313, 0.0329666 , 0.08597448, 0.05337676, 0.10972736,
        0.12418618],
       ...,
       [0.13814624, 0.03653743, 0.08363966, 0.07061622, 0.08957529,
        0.12313889],
       [0.524306  , 0.27139515, 0.32751866, 0.37145417, 0.43793077,
        0.35272503],
       [0.96207212, 0.75071451, 0.89108851, 0.7113649 , 0.86706363,
        0.6377128 ]])

In [48]:
submission = pd.DataFrame(
    proba_predictions_test, 
    columns=target_cols,
    index=test_df.id
).reset_index()

submission.head()


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.996336,0.980639,0.995362,0.954265,0.986167,0.960502
1,0000247867823ef7,0.212929,0.045509,0.109676,0.158723,0.143704,0.154233
2,00013b17ad220c46,0.168203,0.032967,0.085974,0.053377,0.109727,0.124186
3,00017563c3f7919a,0.15005,0.037584,0.088099,0.090896,0.095607,0.078186
4,00017695ad8997eb,0.200844,0.025725,0.08643,0.084045,0.097337,0.084841


In [49]:
submission.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             153164 non-null  object 
 1   toxic          153164 non-null  float64
 2   severe_toxic   153164 non-null  float64
 3   obscene        153164 non-null  float64
 4   threat         153164 non-null  float64
 5   insult         153164 non-null  float64
 6   identity_hate  153164 non-null  float64
dtypes: float64(6), object(1)
memory usage: 8.2+ MB


In [50]:
# Saving the submission
submission.to_csv('submission.csv', index=False)

# Displaying the success message
print("The submission has been successfully saved.")


The submission has been successfully saved.
