In [2]:
import numpy as np
import pandas as pd
import csv

# Loading Dataset

In [3]:
# Run this cell when computing results for Validation Data

# Absolute path of the validation spreadsheet (presumably a Colab workspace — confirm).
file = '/content/Hostile_Validate.xlsx'
# Read the sheet into a DataFrame; get_data below reads its 'Unique ID' and 'Post' columns.
test_df = pd.read_excel(file)

# Data Preparation into Pandas Dataframe for Model Input

def get_data(a):
  """Build the two-column model-input frame from a loaded sheet.

  Parameters
  ----------
  a : mapping/DataFrame exposing 'Unique ID' and 'Post' columns.

  Returns
  -------
  pandas.DataFrame with columns 'UID' and 'sentence' (in that order) and a
  fresh 0..n-1 index; the input is not modified.
  """
  # Copy both columns into plain lists so the result does not share the
  # input's index.
  columns = {
      'UID': list(a['Unique ID']),
      'sentence': list(a['Post']),
  }
  return pd.DataFrame(columns, columns = ['UID','sentence'])

# Build the model-input frame (columns 'UID' and 'sentence') from the loaded sheet.
test_data  = get_data(test_df)

# Print the first three rows as a quick sanity check.
print(test_data[0:3])

   UID                                           sentence
0    2  भारतीय जनता पार्टी rss वाले इतने गिरे हुए हैं ...
1    6  हॉन्ग कॉन्ग में एक व्यक्ति में दोबारा कोरोना क...
2    8  अद्भुत   जो वामपंथी कहते है कि महाभारत का युद्...


In [25]:
# Run this cell when computing results for Test Data

# Absolute path of the test spreadsheet (presumably a Colab workspace — confirm).
file = '/content/Hostile_Hindi_Test.xlsx'
# names= supplies the column labels that get_data below looks up.
# NOTE(review): confirm the sheet has exactly these two columns, in this order.
test_df = pd.read_excel(file, names = ['Unique ID','Post'])

# Data Preparation into Pandas Dataframe for Model Input

def get_data(a):
  """Return a fresh DataFrame holding the post IDs and post texts of `a`.

  `a` must expose the columns 'Unique ID' and 'Post'. The result has exactly
  two columns, 'UID' and 'sentence', and a new 0..n-1 index.
  """
  # (source column, output column) pairs, in output order.
  renames = (('Unique ID', 'UID'), ('Post', 'sentence'))
  raw_data_train = {target: list(a[source]) for source, target in renames}
  return pd.DataFrame(raw_data_train, columns = [target for _, target in renames])

# Build the model-input frame (columns 'UID' and 'sentence') from the loaded sheet.
test_data  = get_data(test_df)

# Print the first three rows as a quick sanity check.
print(test_data[0:3])

   UID                                           sentence
0    1  कीस की को रोजगार चाहिए फिर नहीं कहना रोजगार नह...
1    3  कोई भी कांग्रेसी  ऊंची छत पर  रेलवे लाइन पर  ऊ...
2    4  अंडरवर्ल्ड डॉन छोटा राजन के भाई को बीजेपी द्वा...


# Hostile ID Collection 

In [26]:
# Collecting the post IDs of whichever split (Validation or Test) is currently
# loaded in `test_data`, and saving them for the label-merging cells.

data = test_data

# Take the whole 'UID' column at once instead of looping with data['UID'][i]:
# that lookup is label-based and only works on a default 0..n-1 index, and the
# loop variable used to overwrite the builtin `id` for the rest of the notebook.
# `int` replaces `np.int`, an alias of the builtin that was removed in
# NumPy 1.24 (the old spelling raises AttributeError there).
hos_ids = data['UID'].to_numpy(dtype=int)
np.save('Hostile_ID.npy',hos_ids)

# Predicted Labels Loading (Fake, Hate, Offensive, Defamation)

In [27]:
# Loading previously obtained Predicted Labels and Hostile IDs from Previous Cell

# Predictions saved by earlier model runs, one .npy file per task.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code from the file. Keep it only if these files are trusted
# and really were saved as object arrays — confirm.
bin_lab = np.load('/content/Test_Labels_Coarse.npy', allow_pickle=True)  # coarse labels; not referenced again in the cells shown here
d_lab = np.load('/content/Pred_Defamation_Label.npy', allow_pickle=True)  # defamation predictions
f_lab = np.load('/content/Pred_Fake_Label.npy', allow_pickle=True)  # fake predictions
h_lab = np.load('/content/Pred_Hate_Label.npy', allow_pickle=True)  # hate predictions
o_lab = np.load('/content/Pred_Offensive_Label.npy', allow_pickle=True)  # offensive predictions
# The ID-collection cell saves 'Hostile_ID.npy' with a relative path; this load
# assumes the working directory is /content — TODO confirm.
hos_ids = np.load('/content/Hostile_ID.npy', allow_pickle=True)

# Merging Labels to Assess Performance

In [12]:
# Run Cell when Evaluating Validation Data

# Merging Predicted labels into a single numpy array
# Reference: [non_hostile,defamation,fake,hate,offensive]

# Row k of d_lab / f_lab / h_lab / o_lab holds the predictions for the post
# whose ID is hos_ids[k]. Look that row up by ID instead of advancing a running
# counter: the counter only stays aligned while hos_ids is strictly ascending,
# free of duplicates and entirely inside the ID range below, and silently
# pairs labels with the wrong posts otherwise.
row_of_id = {int(uid): pos for pos, uid in enumerate(hos_ids)}

# Column order of the per-class arrays matches positions 1..4 of each output row.
fine_grained = (d_lab, f_lab, h_lab, o_lab)

predicted_labels = []

for i in range(1,812):         # Rectified Line (Error: ID Mismatch); validation IDs run 1..811
  pos = row_of_id.get(i)
  if pos is None:
    # ID absent from hos_ids: mark the post as non-hostile only.
    predicted_labels.append([1,0,0,0,0])
  else:
    # ID present in hos_ids: non-hostile flag stays 0, copy the four class flags.
    predicted_labels.append([0] + [1 if lab[pos]==1 else 0 for lab in fine_grained])

pred_lab = np.reshape(np.array(predicted_labels),(811,5)) # Final Predicted Labels
np.save('Pred_Validation_labels.npy',pred_lab)

In [28]:
# Run Cell when Evaluating Test Data

# Merging Predicted labels into a single numpy array
# Reference: [non_hostile,defamation,fake,hate,offensive]

# ERROR LINE (kept for the record) -- the loop header originally read
#     for i in range(1653):
# which walks IDs 0..1652 instead of 1..1653, so the mapping got mismatched
# (Wrong Test File Submitted in Competition).

# Row k of d_lab / f_lab / h_lab / o_lab holds the predictions for the post
# whose ID is hos_ids[k]. Look that row up by ID instead of advancing a running
# counter: the counter only stays aligned while hos_ids is strictly ascending,
# free of duplicates and entirely inside the ID range below, and silently
# pairs labels with the wrong posts otherwise.
row_of_id = {int(uid): pos for pos, uid in enumerate(hos_ids)}

# Column order of the per-class arrays matches positions 1..4 of each output row.
fine_grained = (d_lab, f_lab, h_lab, o_lab)

predicted_labels = []

for i in range(1,1654):         # Rectified Line (Error: ID Mismatch); test IDs run 1..1653
  pos = row_of_id.get(i)
  if pos is None:
    # ID absent from hos_ids: mark the post as non-hostile only.
    predicted_labels.append([1,0,0,0,0])
  else:
    # ID present in hos_ids: non-hostile flag stays 0, copy the four class flags.
    predicted_labels.append([0] + [1 if lab[pos]==1 else 0 for lab in fine_grained])

pred_lab = np.reshape(np.array(predicted_labels),(1653,5)) # Final Predicted Labels
np.save('Pred_Test_labels.npy',pred_lab)

In [14]:
# For Validation Data Results (Run this if Loading Validation Dataset in Data Loading Cell)
# NOTE(review): allow_pickle=True permits unpickling object arrays (arbitrary
# code execution on untrusted files); confirm it is actually required here.
y_true = np.load('/content/True_Validation_Labels.npy', allow_pickle=True)       # Released by Organizers
y_pred = np.load('/content/Pred_Validation_labels.npy', allow_pickle=True)       # Created from our Models (saved by the validation merging cell)

In [30]:
# For Test Data Results (Run this if Loading Test Dataset in Data Loading Cell)
# NOTE(review): allow_pickle=True permits unpickling object arrays (arbitrary
# code execution on untrusted files); confirm it is actually required here.
y_pred = np.load('/content/Pred_Test_labels.npy', allow_pickle=True)             # Created from our Models (saved by the test merging cell)

# Creating Final Submission File

In [None]:
# Reference: [non_hostile,defamation,fake,hate,offensive]
# Name of each column of y_pred, by position.
label_names = ('non-hostile', 'defamation', 'fake', 'hate', 'offensive')

labels = []

for i in range(y_pred.shape[0]):
  # Positions of the columns that are switched on for this post.
  hits = np.argwhere(y_pred[i]>0)
  hits = hits.reshape(len(hits),)
  if len(hits)==0:
    # No column set at all: fall back to a single 'defamation' label.
    lab_text = ['defamation']
  else:
    # Translate each active column (only positions 0..4 are named) to its label.
    lab_text = [label_names[j] for j in hits if 0 <= j <= 4]
  labels.append(lab_text)


def final_submission(label_list):
  """Write the submission file 'Results.csv' in the current directory.

  Parameters
  ----------
  label_list : sequence of sequences
      label_list[k] holds the label names predicted for the (k+1)-th post.

  The file starts with the header row 'Unique ID,Labels Set'. Every following
  row holds the 1-based position of the entry in `label_list` as the ID and
  the entry's labels joined by commas (an empty entry gives an empty field).
  Returns None.
  """
  data = [['Unique ID','Labels Set']]
  for i, post_labels in enumerate(label_list):
    # IDs are positional: entry k is written with ID k+1.
    data.append([i+1, ','.join(str(label) for label in post_labels)])

  file1 = "Results.csv"
  # newline='' is required by the csv module; without it every row ends in
  # '\r\r\n' on platforms that translate '\n' on write (e.g. Windows).
  with open(file1, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows(data)

# Write Results.csv from the per-post label lists built earlier in this cell.
final_submission(labels)

# Final Test Dataset Result (Official Script)


In [32]:
### Order of Labels --> [Hostile,defamation,fake,hate,offensive,non-hostile]
### An example      --> [1,0,1,1,0,0]


import numpy as np
import pandas as pd
from sklearn.metrics import f1_score



def preprocess(df):
    """Expand the comma-separated 'Labels Set' column into 0/1 indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Labels Set' column of strings. Rows with any missing
        value are dropped. The input frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, re-indexed 0..n-1, with the integer columns
        'Hostile', 'Defamation', 'Fake', 'Hate', 'Offensive' and 'Non-Hostile'
        inserted just before the original last column.

    Labels are matched after lower-casing and removing newlines, double quotes
    and spaces. Any of defamation / fake / hate / offensive sets its own column
    and 'Hostile'. 'non-hostile' sets 'Non-Hostile' only if no hostile label
    has been seen earlier in the same row.
    """
    # Re-index after dropping rows: the loop below addresses rows by position
    # 0..n-1, and with the gaps left by dropna() those positions no longer
    # match the index labels (KeyError on read, or rows appended by df.at).
    df = df.dropna().reset_index(drop=True)

    for column in ('Hostile', 'Defamation', 'Fake', 'Hate', 'Offensive', 'Non-Hostile'):
        # Always insert before the current last column, so the new columns end
        # up in the listed order.
        df.insert(len(df.columns)-1, column, np.zeros(len(df),dtype=int))

    # Fine-grained label text -> indicator column it switches on.
    fine_grained = {
        'defamation': 'Defamation',
        'fake': 'Fake',
        'hate': 'Hate',
        'offensive': 'Offensive',
    }

    for i, text in enumerate(df['Labels Set'].tolist()):
        text = text.lower()
        text = text.replace('\n',"")
        text = text.replace('"',"")
        text = text.replace(" ","")

        for word in text.split(','):
            if word in fine_grained:
                df.at[i,'Hostile'] = 1
                df.at[i,fine_grained[word]] = 1
            elif word == 'non-hostile' and df.at[i,'Hostile'] == 0:
                df.at[i,'Non-Hostile'] = 1

    return df
  
    



def get_scores(y_true, y_pred):
    """Compute the coarse-grained and fine-grained F1 scores.

    Both arguments are frames carrying the columns 'Hostile', 'Defamation',
    'Fake', 'Hate' and 'Offensive'.

    Returns the list
    [hostility_f1, defame_f1, fake_f1, hate_f1, offensive_f1, weighted_fine_f1]:
    a weighted F1 on 'Hostile' over all rows, then the per-class F1 and the
    weighted F1 of the four fine-grained columns, computed after removing the
    rows whose ground-truth 'Hostile' value is 0.
    """
    fine_columns = ['Defamation','Fake','Hate','Offensive']

    # Coarse-grained score uses every row.
    hostility_f1 = f1_score(y_true=y_true['Hostile'], y_pred=y_pred['Hostile'], average='weighted')

    # Remove the rows that are non-hostile in the ground truth from BOTH frames
    # (same index labels), then renumber so the two stay aligned.
    nh_indexes = y_true[y_true['Hostile']==0].index
    hostile_true = y_true.drop(nh_indexes).reset_index(drop=True)
    hostile_pred = y_pred.drop(nh_indexes).reset_index(drop=True)

    fine_true = hostile_true[fine_columns]
    fine_pred = hostile_pred[fine_columns]

    # average=None gives one score per column, in fine_columns order.
    per_class = f1_score(y_true=fine_true, y_pred=fine_pred, average=None)
    weighted_fine_f1 = f1_score(y_true=fine_true, y_pred=fine_pred, average='weighted')

    return [hostility_f1, per_class[0], per_class[1], per_class[2], per_class[3], weighted_fine_f1]




ground_truth_path      = "/content/Ground_Truth.csv"                            # Load Test or Validation Ground Truth
submission_file_path   = "/content/Results.csv"                                 # Load Test or Validation Submission.csv


try:
    y_true = pd.read_csv(ground_truth_path)
    y_pred = pd.read_csv(submission_file_path)

    y_true = preprocess(y_true)
    y_pred = preprocess(y_pred)

    team_score = get_scores(y_true,y_pred)

except Exception as exc:
    # Keep the best-effort fallback (all-zero scores), but say why it happened:
    # a bare `except:` made a missing file or malformed submission look like a
    # genuine score of 0 and also swallowed KeyboardInterrupt.
    print("Scoring failed, reporting zero scores:", repr(exc))
    team_score = [0,0,0,0,0,0]


print("Coarse Grained F1-score: ", team_score[0])
print("Defamation F1-score:     ", team_score[1])
print("Fake F1-score:           ", team_score[2])
print("Hate F1-score:           ", team_score[3])
print("Offensive F1-score:      ", team_score[4])
print("Fine Grained F1-score:   ", team_score[5])

Coarse Grained F1-score:  0.9059207129656424
Defamation F1-score:      0.3937007874015748
Fake F1-score:            0.6330935251798562
Hate F1-score:            0.4724061810154525
Offensive F1-score:       0.5555555555555556
Fine Grained F1-score:    0.5336803174740802


# Important Note

In the Test Submission Phase, we submitted the same file, but the IDs were mismatched because of the wrong line marked as ERROR LINE in the test-data label-merging cell above.
We rectified this in the post-test phase (see the line immediately after ERROR LINE).