# Misogynistic Aggression Detection

Detecting misogynistc aggression in an annotated corpus trained through a Support vector machine (SVM) model 

# Training Data

# "Developing a Multilingual Annotated Corpus of Misogyny andAggression" 

The Training dataset is an already labelled and previously developed corpus and it is cited as follows, A research 
paper titled
"Developing a Multilingual Annotated Corpus of Misogyny andAggression"
describes how it was extracted and labelled and what the criteria was for labelling.
There are five main labels/tags:

1.Overtly Aggresive(OAG)
2.Covertly Aggressive(CAG)
3.Non-Aggressive(NAG)
4.Gendered(GEN)
5.Non-Gendered(NGEN)



[Licensed under Creative Common Non-Commercial Share-Alike 4.0 licence CC-BY-NC-SA 4.0]

Citation:


@InProceedings{trac2-dataset,

author = {Bhattacharya, Shiladitya and Singh, Siddharth and Kumar, Ritesh and Bansal, Akanksha and Bhagat, Akash and Dawer, Yogesh and Lahiri, Bornini and Ojha, Atul Kr.},

title = {Developing a Multilingual Annotated Corpus of Misogyny and Aggression},

booktitle = {Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying},

month = {May},

year = {2020},

address = {Marseille, France},

publisher = {European Language Resources Association (ELRA)},

pages = {158--168},

url = {https://www.aclweb.org/anthology/2020.trac2-1.25}

}


# Importing All Relevant Libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
# Import train_test_split function
from sklearn.model_selection import train_test_split
#Import svm model
from sklearn import svm
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Loading The Dataset

In [2]:

# loading the training dataset
data = pd.read_csv("trac2_eng_train.csv", header=None)

In [3]:
data.head(10)


Unnamed: 0,0,1,2,3
0,ID,Text,Sub-task A,Sub-task B
1,C45.451,Next part,NAG,NGEN
2,C47.11,Iii8mllllllm\nMdxfvb8o90lplppi0005,NAG,NGEN
3,C33.79,🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...,NAG,NGEN
4,C4.1961,What the fuck was this? I respect shwetabh and...,NAG,NGEN
5,C10.153,Concerned authorities should bring arundathi R...,NAG,NGEN
6,C10.761,It seems like these people want to be famous n...,NAG,NGEN
7,C25.6,Best topic for Law Students !,NAG,NGEN
8,C7.1642,Even when kabir singh was unaware that Preeti ...,CAG,NGEN
9,C10.152.1,She is wrong .,NAG,NGEN


# Checking shape of dataset

In [4]:
data.shape


(4264, 4)

# Samples per class in data set (5 Classes)

In [5]:
def show_samples_per_class(data):
    
    #setting counters for each class to zero
    OAG_Samples = 0
    CAG_Samples = 0
    NAG_Samples = 0
    GEN_Samples = 0
    NGEN_Samples = 0
    
    #looping through the entire dataset to count samples per class
    for i in range(4264):
        # Sub-Task A
        if data.iloc[i,2] == 'OAG':
            OAG_Samples +=1
        if data.iloc[i,2] == 'CAG':
            CAG_Samples +=1
        if data.iloc[i,2] == 'NAG':
            NAG_Samples +=1
        
        # Sub-Task B
        if data.iloc[i,3] == 'NGEN':
            NGEN_Samples +=1
        if data.iloc[i,3] == 'GEN':
            GEN_Samples +=1
            
    # PRINT THE SAMPLE COUNTS       
    print('SUB-TASK A')        
    print('Number of Samples that are Overtly Agrresive  :', OAG_Samples)
    print('Number of Samples that are Covertly Agrresive :', CAG_Samples)
    print('Number of Samples that are Non Agrresive      :', NAG_Samples ,'\n')
    
    print('SUB-TASK B')
    print('Number of Samples that are GENDERED           :', GEN_Samples)
    print('Number of Samples that are NON GENDERED       :', NGEN_Samples)
 

In [6]:
show_samples_per_class(data)

SUB-TASK A
Number of Samples that are Overtly Agrresive  : 435
Number of Samples that are Covertly Agrresive : 453
Number of Samples that are Non Agrresive      : 3375 

SUB-TASK B
Number of Samples that are GENDERED           : 309
Number of Samples that are NON GENDERED       : 3954


# Average Number of words per sample in training set 

In [7]:
#Loading all the text to be analysed from corpus to list
data_text = []
for i in range(4264):
    data_text.append(data.iloc[i,1])

In [8]:
print(data_text)



In [9]:
def get_num_words_per_sample(sample_texts):
    """Returns the median number of words per sample given corpus.

    # Arguments
        sample_texts: list, sample texts.

    # Returns
        int, median number of words per sample.
    """
    num_words = [len(s.split()) for s in sample_texts]
    return np.median(num_words)


In [10]:
num_words_per_sample = get_num_words_per_sample(data_text)

In [11]:
print('Average number of words per Sample :',num_words_per_sample)

Average number of words per Sample : 8.0


# Text Preprocessing

In [12]:
#Renaming columns to make our life easier
data["Misogynistic_Aggression"] = ""
data.to_csv("sample.csv", index=False)
data.columns = ['ID','Text','subTask_a','subTask_b','Misogynistic_Aggression']
data = data.drop([0], axis=0)

In [13]:
data.head()

Unnamed: 0,ID,Text,subTask_a,subTask_b,Misogynistic_Aggression
1,C45.451,Next part,NAG,NGEN,
2,C47.11,Iii8mllllllm\nMdxfvb8o90lplppi0005,NAG,NGEN,
3,C33.79,🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...,NAG,NGEN,
4,C4.1961,What the fuck was this? I respect shwetabh and...,NAG,NGEN,
5,C10.153,Concerned authorities should bring arundathi R...,NAG,NGEN,


In [14]:
data.shape

(4263, 5)

In [15]:
#filling new column
for i in range(1,4264):
    data.Misogynistic_Aggression[i] = 0
    if data.subTask_a[i] == 'OAG' or data.subTask_a[i] == 'CAG':
        if data.subTask_b[i] == 'GEN':
            data.Misogynistic_Aggression[i] = 1

In [16]:
data.tail(20)

Unnamed: 0,ID,Text,subTask_a,subTask_b,Misogynistic_Aggression
4244,C10.459,"Absolutely, how the hell can she do this here ...",NAG,NGEN,0
4245,C4.2040,I totally agree with your thoughts..,NAG,NGEN,0
4246,C4.2229.1,Ghanta..ek no ek chutiya galti se iski video l...,OAG,NGEN,0
4247,C4.774.1,@Nil The Captain War was terrible generic garbage,CAG,NGEN,0
4248,C4.1633.2,@sachin sakri arjun Reddy is also a chutiya fi...,NAG,NGEN,0
4249,C4.1936,we need more men like you .,NAG,NGEN,0
4250,C59.1722,Nice Boos,NAG,NGEN,0
4251,C7.25,Pratik Bhai please watch aravinda sametha movi...,NAG,NGEN,0
4252,C4.313,"Sir I want , you make a video on Naruto or dea...",NAG,NGEN,0
4253,C59.40,nice boss,NAG,NGEN,0


In [17]:
#Now we can easily access a column like this
data.Text.head()


1                                            Next part
2                   Iii8mllllllm\nMdxfvb8o90lplppi0005
3    🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...
4    What the fuck was this? I respect shwetabh and...
5    Concerned authorities should bring arundathi R...
Name: Text, dtype: object

# Cleaning the text

In [18]:
def clean_text(data,length):
    for i in range(1,length):
        #Converting text to lowercase
        data.Text[i] =  data.Text[i].lower()
        #removing links
        re.sub(r'http\S+', '', data.Text[i].lower())
        #removing punctuation
        data.Text[i] = re.sub("[^-9A-Za-z ]", "" ,data.Text[i] )
  
    
    

In [19]:
clean_text(data,4264)

In [20]:
#After cleaning
data.Text.head()

1                                            next part
2                            iiimllllllmmdxfvbo9lplppi
3       osm vedio keep it upmake more vedios like this
4    what the fuck was this i respect shwetabh and ...
5    concerned authorities should bring arundathi r...
Name: Text, dtype: object

# Vectorising Text : TF-IDF

In [21]:

# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
vectors = vectorizer.fit_transform(data.Text)

In [22]:
print(vectorizer.get_feature_names())

['99', '999', '9a', 'able', 'about', 'above', 'absolute', 'absolutely', 'abuse', 'abusing', 'abusive', 'accent', 'accept', 'accepted', 'according', 'across', 'act', 'acting', 'action', 'actions', 'activities', 'actor', 'actors', 'actress', 'acts', 'actual', 'actually', 'add', 'addict', 'affect', 'after', 'again', 'against', 'age', 'agenda', 'ago', 'agree', 'agreed', 'akshay', 'alcohol', 'alcoholic', 'alert', 'alive', 'all', 'allow', 'allowed', 'allowing', 'almost', 'alone', 'already', 'also', 'always', 'am', 'amazing', 'amit', 'among', 'an', 'analysis', 'anchor', 'and', 'anger', 'angry', 'anime', 'another', 'answer', 'anti', 'any', 'anymore', 'anyone', 'anything', 'anyway', 'appreciate', 'apu', 'ar', 'are', 'area', 'arguments', 'arjun', 'arms', 'army', 'arnab', 'around', 'arrest', 'arrested', 'article', 'arundathi', 'arundati', 'arundhati', 'arundhuti', 'as', 'ashamed', 'ask', 'asked', 'asking', 'aspect', 'asshole', 'at', 'atleast', 'attitude', 'audience', 'aur', 'available', 'away', '

In [23]:
for i in range(1,4264):
    data.Text[i] = vectors[i-1]


# Creating a Linear SVM (Support Vector Machine) Model

# Sub Task A : Aggression

In [24]:
#SUB-TASK A

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(vectors, data.subTask_a, test_size=0.3,random_state=109) # 70% training and 30% test


In [25]:
print(X_train)

  (0, 1156)	0.8309622799757538
  (0, 993)	0.5563287600488552
  (1, 300)	0.3675136353542904
  (1, 958)	0.34781895545878316
  (1, 397)	0.31152446756555274
  (1, 1197)	0.2308576756350703
  (1, 102)	0.30440348862546596
  (1, 251)	0.2533001504501913
  (1, 32)	0.2598964464790244
  (1, 438)	0.1952843280300746
  (1, 873)	0.24109304034592433
  (1, 1199)	0.21382519934678001
  (1, 819)	0.24681412247601672
  (1, 1204)	0.22080236454090088
  (1, 1221)	0.13114491909124104
  (1, 1064)	0.2685760529578082
  (1, 697)	0.16810207834332702
  (2, 1335)	0.4395995232202946
  (2, 51)	0.32521607692531534
  (2, 1104)	0.37688009724138183
  (2, 504)	0.4272802989638965
  (2, 394)	0.37688009724138183
  (2, 481)	0.28819799632282866
  (2, 1275)	0.2543414780174179
  (2, 848)	0.1673474401302779
  :	:
  (2979, 1221)	0.28236731703242146
  (2979, 1219)	0.5013274987284275
  (2980, 648)	0.3727562759812046
  (2980, 1038)	0.38098922762116494
  (2980, 463)	0.3727562759812046
  (2980, 544)	0.40375045539832155
  (2980, 848)	0.1815

In [26]:


#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, list(y_train))

#Predict the response for test dataset
y_predict = clf.predict(X_test)

# Evaluating the Model

# Sub Task A: Aggression

In [27]:


print("FOR SUB TASK A : Aggression")


# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_predict,
                                          pos_label='NAG',
                                           average='weighted'))
# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_predict,
                                         pos_label='NAG',
                                           average='weighted'))

# Model F measure: what percentage of positive tuples are labelled as such?
print("F1 Score:",metrics.f1_score(y_test, y_predict,
                                         pos_label='NAG',
                                           average='weighted'))




FOR SUB TASK A : Aggression
Precision: 0.7075158410909658
Recall: 0.7928068803752932
F1 Score: 0.726869498373168




# Sub Task B : Gendered Language/Misogyny

In [28]:

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(vectors, data.subTask_b, test_size=0.3,random_state=109) # 70% training and 30% test


In [29]:
#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, list(y_train))

#Predict the response for test dataset
y_predict = clf.predict(X_test)

In [30]:
print("FOR SUB TASK B : Misogyny",'\n')


# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_predict,
                                          pos_label='NGEN',
                                           average='weighted'))
# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_predict,
                                         pos_label='NGEN',
                                           average='weighted'))

# Model F measure: what percentage of positive tuples are labelled as such?
print("F1 Score:",metrics.f1_score(y_test, y_predict,
                                         pos_label='NGEN',
                                           average='weighted'))


FOR SUB TASK B : Misogyny 

Precision: 0.9331368580666962
Recall: 0.9421422986708365
F1 Score: 0.92686138770114




# Misogynistic Aggression detection after evaluating sub tasks

Making a column for misogynistic aggression we mark samples that are either overtly or covertly aggressive and gendered as 1 and others 0, making misogynctic aggression a binary classification

In [31]:
data.head()

Unnamed: 0,ID,Text,subTask_a,subTask_b,Misogynistic_Aggression
1,C45.451,"(0, 884)\t0.68386663612072\n (0, 827)\t0.72...",NAG,NGEN,0
2,C47.11,,NAG,NGEN,0
3,C33.79,"(0, 1207)\t0.22587968929675173\n (0, 697)\t...",NAG,NGEN,0
4,C4.1961,"(0, 231)\t0.1324166523389718\n (0, 851)\t0....",NAG,NGEN,0
5,C10.153,"(0, 814)\t0.3183299169685401\n (0, 65)\t0.2...",NAG,NGEN,0


In [32]:
data.tail(20)

Unnamed: 0,ID,Text,subTask_a,subTask_b,Misogynistic_Aggression
4244,C10.459,"(0, 7)\t0.40816368045704904\n (0, 319)\t0.2...",NAG,NGEN,0
4245,C4.2040,"(0, 1359)\t0.3513870586886501\n (0, 1325)\t...",NAG,NGEN,0
4246,C4.2229.1,"(0, 1067)\t0.33358227136676527\n (0, 701)\t...",OAG,NGEN,0
4247,C4.774.1,"(0, 1292)\t0.6750193966084665\n (0, 451)\t0...",CAG,NGEN,0
4248,C4.1633.2,"(0, 977)\t0.45556830038755314\n (0, 77)\t0....",NAG,NGEN,0
4249,C4.1936,"(0, 760)\t0.4893940341119295\n (0, 1300)\t0...",NAG,NGEN,0
4250,C59.1722,"(0, 829)\t1.0",NAG,NGEN,0
4251,C7.25,"(0, 613)\t0.3656381553311836\n (0, 140)\t0....",NAG,NGEN,0
4252,C4.313,"(0, 278)\t0.3984259319736755\n (0, 733)\t0....",NAG,NGEN,0
4253,C59.40,"(0, 168)\t0.8162442200858951\n (0, 829)\t0....",NAG,NGEN,0


In [33]:
#using model on third label

In [34]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(vectors, data.Misogynistic_Aggression, test_size=0.3,random_state=109) # 70% training and 30% test


In [35]:
#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, list(y_train))

#Predict the response for test dataset
final_y_predict = clf.predict(X_test)

In [36]:
print("FOR MISOGYNISTIC AGGRESSION : Misogyny",'\n')


# Model Precision: what percentage of positive tuples are labeled as such?
# print("Precision:",metrics.precision_score(y_test, y_predict))
# Model Recall: what percentage of positive tuples are labelled as such?
#print("Recall:",metrics.recall_score(y_test, y_predict))

# Model F measure: what percentage of positive tuples are labelled as such?
#print("F1 Score:",metrics.f1_score(y_test, final_y_predict,
                                        # pos_label='Misogynistic_Aggression',
                                         #  average='binary'))

FOR MISOGYNISTIC AGGRESSION : Misogyny 

