In [1]:
from google.colab import drive
import pandas as pd
import numpy as np

import nltk
import re
from collections import Counter
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
df = pd.read_csv("/content/gdrive/MyDrive/NLP/train.csv.zip")

In [5]:
df.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


In [6]:
df.shape

(1804874, 45)

In [7]:
#psych_df contains comments related to psychiatric illness or mental disabilities
temp = df.dropna(subset=['psychiatric_or_mental_illness'])
psych_df = temp[temp['psychiatric_or_mental_illness'] >= 0.5]

In [8]:
psych_df.shape

(4889, 45)

In [9]:
psych_df.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
68,239669,0.0,I think you left out one very important organi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,27081,approved,0,0,0,1,0,0.0,4,4
609,240813,0.6,This is what a serious mental illness looks li...,0.0,0.0,0.0,0.6,0.0,0.0,0.0,...,33626,approved,0,0,0,4,0,0.0,4,10
648,240878,0.545455,This is what a serious mental illness looks li...,0.090909,0.0,0.272727,0.545455,0.0,0.0,0.0,...,33626,approved,0,0,0,3,0,0.0,10,11
842,241121,0.6,Part 3\nThe owner of Remedy Wine Bar (which cl...,0.0,0.1,0.0,0.6,0.0,0.0,0.0,...,34877,approved,0,0,0,1,0,0.0,4,10
931,241239,0.2,"Ignore Glenn Beck he feeds off of attention, s...",0.0,0.0,0.0,0.2,0.0,0.0,0.0,...,32792,approved,0,0,0,0,0,0.0,10,5


In [10]:
# dividing between toxic and non-toxic comments
psych_toxic_df = psych_df[psych_df['target'] >= 0.5]
psych_nontoxic_df = psych_df[psych_df['target'] < 0.5]

psych_toxic_df.shape, psych_nontoxic_df.shape

((1030, 45), (3859, 45))

In [11]:
psych_toxic = psych_toxic_df['comment_text'].values
psych_nontoxic = psych_nontoxic_df['comment_text'].values

In [12]:
# upsampling the toxic comments by random repetition to have same number of comments as non-toxic set 
for _ in range(0, 2829):
    i = np.random.randint(low=0, high=1030)
    psych_toxic = np.concatenate((psych_toxic, [psych_toxic[i]]))

psych_toxic.shape, psych_nontoxic.shape

((3859,), (3859,))

In [13]:
# data-points which have no mention of any disability
temp = df.dropna(subset=['psychiatric_or_mental_illness','other_disability',
                         'intellectual_or_learning_disability','physical_disability'])

non_disab_df = temp[(temp['psychiatric_or_mental_illness'] < 0.5) & (temp['other_disability'] < 0.5) 
                        & (temp['physical_disability'] < 0.5) & (temp['intellectual_or_learning_disability'] < 0.5)
                   ]

In [14]:
# dividng between toxic and non-toxic (and choosing top 3859)
non_disab_toxic = non_disab_df[non_disab_df['target'] >= 0.5][:3859]['comment_text']
non_disab_non_toxic = non_disab_df[non_disab_df['target'] < 0.5][:3859]['comment_text']

In [15]:
non_disab_toxic.shape, non_disab_non_toxic.shape

((3859,), (3859,))

In [16]:
# concatenate toxic and non-toxic comments
toxic_comments = np.concatenate((psych_toxic, non_disab_toxic))
nontoxic_comments = np.concatenate((psych_nontoxic, non_disab_non_toxic))

In [17]:
# generating unigram and bigram for both - toxic and non-toxic comments
pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
res_toxic = [item for comment in toxic_comments for n in range(1, 3) for item in nltk.ngrams(comment.split(), n)]
res_nontoxic = [item for comment in nontoxic_comments for n in range(1, 3) for item in nltk.ngrams(comment.split(), n)]

In [18]:
# converting it to dictionary with counts
res_toxic = dict(Counter(res_toxic))
res_nontoxic = dict(Counter(res_nontoxic))

In [19]:
# calculating log-odds 
res = {}
sum_toxic = sum(res_toxic.values())
sum_nontoxic = sum(res_nontoxic.values())
for key, val in res_toxic.items():
    p1 = val/(sum_toxic-val)
    
    # in-case the same gram is present in non-toxic grams too. 
    if key in res_nontoxic.keys():
        p2 = res_nontoxic[key]/(sum_nontoxic-res_nontoxic[key])
        newVal = np.log(p1/p2)
    
    # else performing one correction to avoid division by zero
    else:
        p2 = 1/(sum_nontoxic+1)
        newVal = np.log(p1/p2)
    
    # only adding if log-odd ratio is greater than 1.96
    if newVal > 1.96:
        res[key] = newVal

In [20]:
# sorting the dict in reverse order and saving the results in a text file
lst = sorted(res, key=res.get, reverse=True)
file = open("/content/gdrive/MyDrive/NLP/results.txt","w")

for l in lst:
    file.write(str(l)+" : "+ str(res[l]))
    file.write("\n")

file.close()

In [23]:
# printing the results
for l in lst[:1000]:
    print(str(l)+" : "+str(res[l]))


('idiot',) : 4.968253671876022
('malignant', 'narcissist') : 4.444345342833287
('narcissist', 'and') : 4.340797314139767
('liar,',) : 4.071117822911834
('is', 'profoundly') : 4.071117822911834
('malignant',) : 4.022373329743431
('Liberalism',) : 4.016198740554705
('stupidity',) : 3.9970061759605477
('idiots.',) : 3.9710312198328275
('Liberalism', 'is') : 3.9031007692766297
('stupid,',) : 3.888789446816296
('Stupid',) : 3.7991736133605536
('liar', 'and') : 3.7991718581554927
('tweets',) : 3.734630887181387
('profoundly', 'mentally') : 3.7178584144785027
('The', 'malignant') : 3.7007298657947496
('Trump', 'did') : 3.7007298657947496
('sexual', 'predator') : 3.700728110589689
('narcissist',) : 3.6657024081794174
('stupid', 'to') : 3.6656373210689632
('pathological', 'liar') : 3.6656373210689632
('fool.',) : 3.6656373210689632
('buffoon.',) : 3.6292684519850726
('deranged', 'sociopath') : 3.6292684519850726
('secret.',) : 3.5915268990907103
('sociopath', 'has') : 3.5915268990907103
('an', 