In [1]:
import re

# Single-pass character mapping used by d_clean: punctuation becomes '_',
# '$' and '%' are spelled out, '|' and '*' become a space.
_D_CLEAN_TABLE = str.maketrans({
    **{c: '_' for c in '\\=@-,\'".!:;<>/{}[]()#^?'},
    '$': '_dollars',
    '%': '_percent',
    '|': ' ',
    '*': ' ',
})

# DOT reserves these words case-insensitively, so they cannot be bare IDs.
_DOT_KEYWORDS = ("graph", "node", "strict", "edge", "digraph", "subgraph")


def d_clean(string):
    """Sanitise ``string`` for use as a node name in generated DOT text.

    Args:
        string: the raw node name (a ``str``).

    Returns:
        The cleaned name:
        * a lone ``'#'`` becomes ``'_number'``;
        * punctuation is replaced by ``'_'``, ``'$'`` by ``'_dollars'``,
          ``'%'`` by ``'_percent'``, and ``'|'`` / ``'*'`` by a space;
        * names starting with a digit, or equal (ignoring case) to a DOT
          keyword, are prefixed with ``'X'``;
        * an empty result is returned as the string ``'None'``.

    NOTE(review): ``'|'`` and ``'*'`` are mapped to a space, and the callers
    in this file emit the result as an unquoted DOT ID, where a space is not
    valid. Left as-is because the labels may rely on it - confirm.
    """
    # Must be checked before the generic mapping, which would turn '#' into
    # '_' and make this special case unreachable.
    if string == '#':
        return '_number'

    s = string.translate(_D_CLEAN_TABLE)

    if re.match('^[0-9]', s) or s.lower() in _DOT_KEYWORDS:
        s = "X" + s

    if not s:
        return "None"
    return s

def _cluster_suffix(index):
    """Return a letter suffix for the ``index``-th cluster: 0 -> 'A', 25 -> 'Z', 26 -> 'AA'."""
    letters = []
    n = index + 1
    while n > 0:
        n, rem = divmod(n - 1, 26)
        letters.append(chr(ord('A') + rem))
    return ''.join(reversed(letters))


def to_dots(graphs, marked_nodes=set(), integ=False):
    """Render several graphs into one DOT digraph, one ``subgraph cluster_*`` each.

    Args:
        graphs: iterable of graph objects exposing ``nodes(data=True)`` and
            ``edges(data=True)`` (the calls made below).
        marked_nodes: collection of cleaned node names to highlight. It is
            only read (``in`` tests), never mutated, so the shared default
            is harmless.
        integ: when true, node names come from ``str(node)``; otherwise from
            the node attribute ``n_data["name"]``.

    Returns:
        The DOT source as a single newline-joined string. Node and edge
        statements are sorted within each cluster to keep output deterministic.

    Raises:
        KeyError: if ``integ`` is false and a node has no ``"name"`` attribute.
    """
    lines = [u'digraph finite_state_machine {', '\tdpi=70;']
    for i, graph in enumerate(graphs):
        # Emit the header before the (sorted) node statements. Sorting it
        # together with them put it after every tab-prefixed node line, so
        # the nodes were declared outside their cluster.
        lines.append("subgraph cluster_" + _cluster_suffix(i) + " {")

        node_lines = []
        node_to_name = {}
        for node, n_data in graph.nodes(data=True):
            printname = d_clean(str(node)) if integ else d_clean(n_data["name"])
            node_to_name[node] = printname

            expanded = n_data.get('expanded')
            marked = printname in marked_nodes
            # Same priority order as before: expanded > fourlang > substituted > marked.
            if expanded and marked:
                style = u', style=filled, fillcolor=purple'
            elif expanded:
                style = u', style="filled"'
            elif n_data.get('fourlang'):
                style = u', style="filled", fillcolor=red'
            elif n_data.get('substituted'):
                style = u', style="filled"'
            elif marked:
                style = u', style=filled, fillcolor=lightblue'
            else:
                style = u''

            # d_clean already maps '-' to '_', so no further replacement is needed.
            node_lines.append(
                u'\t{0} [shape = circle, label = "{0}"{1}];'.format(printname, style))
        lines += sorted(node_lines)

        # Only edges carrying a 'color' attribute are drawn; its value is the label.
        edge_lines = [
            u'\t{0} -> {1} [ label = "{2}" ];'.format(
                node_to_name[u], node_to_name[v], edata['color'])
            for u, v, edata in graph.edges(data=True)
            if 'color' in edata
        ]
        lines += sorted(edge_lines)
        lines.append('}')
    lines.append('}')
    return u'\n'.join(lines)

def to_dot(graph, marked_nodes=set(), integ=False):
    """Render a single graph as DOT source.

    Args:
        graph: object exposing ``nodes(data=True)`` and ``edges(data=True)``
            (the calls made below).
        marked_nodes: collection of cleaned node names to highlight. It is
            only read (``in`` tests), never mutated, so the shared default
            is harmless.
        integ: when true, node names come from ``str(node)``; otherwise from
            the node attribute ``n_data["name"]``.

    Returns:
        The DOT source as one newline-joined string. Node statements and edge
        statements are each sorted so the output is deterministic.

    Raises:
        KeyError: if ``integ`` is false and a node has no ``"name"`` attribute.
    """
    lines = [u'digraph finite_state_machine {', '\tdpi=70;']

    node_lines = []
    node_to_name = {}
    for node, n_data in graph.nodes(data=True):
        printname = d_clean(str(node)) if integ else d_clean(n_data["name"])
        node_to_name[node] = printname

        expanded = n_data.get('expanded')
        marked = printname in marked_nodes
        # Same priority order as before: expanded > fourlang > substituted > marked.
        if expanded and marked:
            style = u', style=filled, fillcolor=purple'
        elif expanded:
            style = u', style="filled"'
        elif n_data.get('fourlang'):
            style = u', style="filled", fillcolor=red'
        elif n_data.get('substituted'):
            style = u', style="filled"'
        elif marked:
            style = u', style=filled, fillcolor=lightblue'
        else:
            style = u''

        # One template instead of six copies. The old backslash continuations
        # inside the string literals leaked indentation spaces into the
        # output, and the trailing replace('-', '_') was a no-op because
        # d_clean already maps '-' to '_'.
        node_lines.append(
            u'\t{0} [shape = circle, label = "{0}"{1}];'.format(printname, style))
    lines += sorted(node_lines)

    # Only edges carrying a 'color' attribute are drawn; its value is the label.
    edge_lines = [
        u'\t{0} -> {1} [ label = "{2}" ];'.format(
            node_to_name[u], node_to_name[v], edata['color'])
        for u, v, edata in graph.edges(data=True)
        if 'color' in edata
    ]
    lines += sorted(edge_lines)
    lines.append('}')
    return u'\n'.join(lines)

In [2]:
from exprel.dataset.hasoc_dataset import HasocDataset
from exprel.models.utils import tree_to_code
from dotenv import load_dotenv 
import pandas as pd
load_dotenv()

True

In [12]:
import os

# Root of the project checkout. Defaults to the original absolute path so the
# notebook behaves as before; set EXPREL_PROJECT_ROOT (e.g. in the .env file
# read by load_dotenv() above) to run it on another machine.
PROJECT_ROOT = os.environ.get("EXPREL_PROJECT_ROOT", "/home/kovacs/projects/exp-relation-extraction")
DATA_DIR = os.path.join(PROJECT_ROOT, "data")

# The normalized HASOC 2021 files are tab-separated.
df_train = pd.read_csv(os.path.join(DATA_DIR, "hasoc_2021_train_normalized.csv"), delimiter="\t")
df_test = pd.read_csv(os.path.join(DATA_DIR, "hasoc_2021_test_normalized.csv"), delimiter="\t")
train_data = HasocDataset(df_train)
test_data = HasocDataset(df_test)

2021-08-27 09:46:29 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-08-27 09:46:29 INFO: Use device: cpu
2021-08-27 09:46:29 INFO: Loading: tokenize
2021-08-27 09:46:29 INFO: Loading: pos
2021-08-27 09:46:30 INFO: Loading: lemma
2021-08-27 09:46:30 INFO: Loading: depparse
2021-08-27 09:46:32 INFO: Loading: sentiment
2021-08-27 09:46:34 INFO: Loading: ner
2021-08-27 09:46:36 INFO: Done loading processors!
3843it [00:00, 4040.43it/s]
2021-08-27 09:46:47 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-08-27 09:46:47 INFO: Use device: cpu
2021-08-27 09:46:

In [13]:
import pandas as pd
pd.options.display.max_colwidth = 200

In [14]:
from exprel.feature_extractor.extract import FeatureExtractor
from exprel.models.model import GraphModel

extractor = FeatureExtractor(lang="en", cache_fn="en_nlp_cache")
model = GraphModel()
test_model = GraphModel()

In [15]:
import os

# Root of the project checkout. Defaults to the original absolute path so the
# notebook behaves as before; set EXPREL_PROJECT_ROOT to run it elsewhere.
PROJECT_ROOT = os.environ.get("EXPREL_PROJECT_ROOT", "/home/kovacs/projects/exp-relation-extraction")
GRAPH_DIR = os.path.join(PROJECT_ROOT, "notebooks", "graphs")

# Load previously parsed AMR graphs instead of re-parsing the corpus.
# NOTE(review): these are pickle files - only load them from a trusted source.
train_data.load_graphs(os.path.join(GRAPH_DIR, "hasoc2021_train_amr.pickle"))
test_data.load_graphs(os.path.join(GRAPH_DIR, "hasoc2021_test_amr.pickle"))
#graphs = data.parse_graphs(extractor, format="fourlang")

In [16]:
df_train = train_data.to_dataframe()
df_test = test_data.to_dataframe()

In [25]:
df_train

Unnamed: 0,hasoc_id,original_text,preprocessed_text,task1,task2,task1_id,task2_id,graph
0,0,"[USER] if you made it through this && were not only able to start making money for yourself but sustain living that way all from home, fuck these companies & corporate pigs. power to the people, a...","[USER] if you made it through this && were not only able to start making money for yourself but sustain living that way all from home, fuck these companies & corporate pigs. power to the people, a...",HOF,PRFN,0,3,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27)"
1,1,"Technically that's still turning back the clock, dick head [URL]","Technically that's still turning back the clock, dick head [URL]",HOF,OFFN,0,2,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)"
2,2,"[USER] [USER] [USER] [USER] [USER] [USER] [USER] [USER] And you're the govt?!?! Stop thinking about world media, liberal gangs or any optics whatsoever and ACT NOW already. If this is what a perso...","[USER] [USER] [USER] [USER] [USER] [USER] [USER] [USER] And you're the govt?!?! Stop thinking about world media, liberal gangs or any optics whatsoever and ACT NOW already. If this is what a perso...",NOT,NONE,1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28)"
3,3,[USER] Soldier of Japan Who has dick head,[USER] Soldier of Japan Who has dick head,HOF,OFFN,0,2,"(0, 1, 2, 3, 4, 5, 6, 7, 8)"
4,4,[USER] You'd be better off asking who DOESN'T think he's a sleazy shitbag lmao.,[USER] You'd be better off asking who DOESN'T think he's a sleazy shitbag lmao.,HOF,OFFN,0,2,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)"
...,...,...,...,...,...,...,...,...
3838,3838,[USER] Let the dog deal with the wanker once he's un armed ..,[USER] Let the dog deal with the wanker once he's un armed ..,HOF,PRFN,0,3,"(0, 1, 2, 3, 4, 5, 6, 7)"
3839,3839,India has suffered a lot. That Chinese bastard should pay the price. [USER] ChineseVirus,India has suffered a lot. That Chinese bastard should pay the price. [USER] ChineseVirus,HOF,HATE,0,0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)"
3840,3840,"People didn't give 300+ seats majority to BJP to see BengalBurning ... If they can't fix this, they hv no right to continue in office... Don't take our votes for granted [USER] ...","People didn't give 300+ seats majority to BJP to see BengalBurning ... If they can't fix this, they hv no right to continue in office... Don't take our votes for granted [USER] ...",HOF,HATE,0,0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)"
3841,3841,"[USER] This is such a vile, xenophobic and uneducated comment... Ia[CUR]™m struggling to believe someone thinks like this, let alone posted this?! Daylight Islamophobia and it should be stopped. E...","[USER] This is such a vile, xenophobic and uneducated comment... Ia[CUR]™m struggling to believe someone thinks like this, let alone posted this?! Daylight Islamophobia and it should be stopped. E...",HOF,PRFN,0,3,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24)"


In [18]:
import pandas as pd

ids = pd.to_numeric(df_train.index).tolist()
sentences = df_train.preprocessed_text.tolist()
labels = df_train.task2_id.tolist()
postprocessed_graphs = df_train.graph.tolist()

In [19]:
from tqdm import tqdm

# zip() has no length, so tqdm cannot show a bar or ETA unless total is given.
# NOTE(review): the meaning of the trailing argument 2 is not visible in this notebook.
for ind, graph, label in tqdm(zip(ids, postprocessed_graphs, labels), total=len(ids)):
    model.featurize_sen_graph(ind, graph, label, 2)

3843it [01:39, 38.45it/s]


In [20]:
import pandas as pd

test_ids = pd.to_numeric(df_test.index).tolist()
test_sentences = df_test.preprocessed_text.tolist()
test_labels = df_test.task2_id.tolist()
test_postprocessed_graphs = df_test.graph.tolist()

# zip() has no length, so tqdm cannot show a bar or ETA unless total is given.
# NOTE(review): the meaning of the trailing argument 2 is not visible in this notebook.
for ind, graph, label in tqdm(
        zip(test_ids, test_postprocessed_graphs, test_labels), total=len(test_ids)):
    test_model.featurize_sen_graph(ind, graph, label, 2)

1281it [00:34, 36.62it/s]


In [228]:
model.vocab_size

2500

In [21]:
feature_graphs = model.get_feature_graphs()
test_feature_graphs = test_model.get_feature_graphs()

In [22]:
model.select_n_best(2500)
test_model.select_n_best(2500)

In [23]:
# Class name -> integer id used when building the label vector with get_x_y
# and when naming classes in the reports.
# NOTE(review): the task2_id column in the df_train preview uses a different
# numbering (HATE=0, NONE=1, OFFN=2, PRFN=3), and that column is what the
# featurize loop receives as labels - confirm the two never need to agree.
label_vocab = {"NONE": 0, "PRFN": 1, "OFFN": 2, "HATE": 3}

In [26]:
X, Y = model.get_x_y(df_train.task2, label_vocab=label_vocab)

In [28]:
# Only the feature matrix is kept; the label vector returned with it is discarded.
# NOTE(review): {None: 0} looks like a placeholder vocabulary so get_x_y can run
# without real test labels - confirm against GraphModel.get_x_y.
test_X, _ = test_model.get_x_y(df_test.task2, label_vocab = {None: 0})

In [30]:
from sklearn.model_selection import train_test_split as split

tr_data,tst_data,tr_labels,tst_labels = split(X,Y, test_size=0.2, random_state=1234)

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
#clf = LogisticRegression(random_state=0).fit(tr_data, tr_labels)
clf = OneVsRestClassifier(RandomForestClassifier(random_state=0, class_weight="balanced_subsample")).fit(tr_data, tr_labels)

In [75]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
keys = ["NONE", "PRFN", "OFFN", "HATE"]
labels_to_result = {}
lr_pred = clf.predict(tst_data)
#prf = precision_recall_fscore_support(tst_labels, lr_pred, average=None)
print(classification_report(tst_labels, lr_pred, target_names=keys, output_dict=False))

              precision    recall  f1-score   support

        NONE       0.65      0.64      0.65       277
        PRFN       0.65      0.84      0.73       246
        OFFN       0.41      0.21      0.28       123
        HATE       0.44      0.42      0.43       123

    accuracy                           0.60       769
   macro avg       0.54      0.53      0.52       769
weighted avg       0.58      0.60      0.58       769



In [33]:
feature_graph_strings = model.get_feature_graph_strings()

In [86]:
import eli5

In [87]:
weights_df = eli5.explain_weights_df(clf)

In [88]:
weights_df

Unnamed: 0,target,feature,weight
0,0.0,x313,1.128793
1,0.0,<BIAS>,1.128223
2,0.0,x125,1.095476
3,0.0,x1933,1.055855
4,0.0,x110,1.035189
...,...,...,...
7999,3.0,x345,-0.778599
8000,3.0,x739,-0.884232
8001,3.0,x1697,-0.895375
8002,3.0,x56,-0.906720


In [43]:
list(keys)

['HOF', 'NOT']

In [89]:
from collections import defaultdict

# Invert the vocabulary once and use it for both the dict key and the stored
# label. The previous version indexed the global `keys`, which other cells
# rebind (a recorded output shows it holding ['HOF', 'NOT']), so the grouping
# depended on execution order.
id_to_label = {label_id: name for name, label_id in label_vocab.items()}
features = defaultdict(list)

for target in weights_df.target.unique():
    label_name = id_to_label[int(target)]
    targeted_df = weights_df[weights_df.target == target]
    # The first five rows per target; "<BIAS>" may be among them, in which
    # case fewer than five feature graphs are stored for that class.
    most_important_weights = targeted_df.iloc[:5].feature.str.strip("x").tolist()
    for i in most_important_weights:
        if i == "<BIAS>":
            continue
        g = feature_graph_strings[model.inverse_relabel[int(i)]]
        features[label_name].append(([g], [], label_name))

In [148]:
# RandomForest variant: collect the top-5 features of each per-class estimator.
# NOTE(review): this needs `clf` to expose `estimators_`; the recorded output
# below is an AttributeError from a run where `clf` was a LogisticRegression.
# NOTE(review): this cell resets `features`, discarding what the previous cell collected.
from collections import defaultdict
features = defaultdict(list)

for j, est in enumerate(clf.estimators_):
    weights_df = eli5.explain_weights_df(est)
    # Drop the leading "x" of eli5's feature names to recover the numeric feature id.
    most_important_weights = weights_df.iloc[:5].feature.str.strip("x").tolist()
    for i in most_important_weights:
        if i != "<BIAS>":
            g_nx = feature_graphs[model.inverse_relabel[int(i)]]
            #if len(g_nx.edges()):
            g = feature_graph_strings[model.inverse_relabel[int(i)]]
            # NOTE(review): the label name comes from model.label_vocab here, while the
            # cell above inverts the notebook-level label_vocab - confirm they agree.
            features[list(keys)[j]].append(([g], [], model.label_vocab.id_to_word[j]))

AttributeError: 'LogisticRegression' object has no attribute 'estimators_'

In [104]:
features

defaultdict(list,
            {'NONE': [(['(u_874 / fight)'], [], 'NONE'),
              (['(u_256 / PLUS)'], [], 'NONE'),
              (['(u_2246 / in)'], [], 'NONE'),
              (['(u_148 / date)'], [], 'NONE')],
             'PRFN': [(['(u_2 / fuck)'], [], 'PRFN'),
              (['(u_859 / motherfucker)'], [], 'PRFN'),
              (['(u_764 / whore)'], [], 'PRFN'),
              (['(u_58 / dick)'], [], 'PRFN')],
             'OFFN': [(['(u_218 / cunt)'], [], 'OFFN'),
              (['(u_1132 / fucking)'], [], 'OFFN'),
              (['(u_333 / hyperlink  :ARG1 (u_0 / multi))'], [], 'OFFN'),
              (['(u_1071 / woman)'], [], 'OFFN'),
              (['(u_1216 / idiot)'], [], 'OFFN')],
             'HATE': [(['(u_511 / hate  :ARG0 (u_25 / i))'], [], 'HATE'),
              (['(u_545 / racist)'], [], 'HATE'),
              (['(u_0 / multi  :snt1 (u_10 / possible))'], [], 'HATE'),
              (['(u_722 / whole)'], [], 'HATE'),
              (['(u_701 / islamophobia)'], [],

In [91]:
import json

with open("2021_train_features_task2.json", "w+") as f:
    json.dump(features, f)

In [37]:
# Split the dataframe with the same test_size and random_state as the split of X, Y above.
# NOTE(review): later cells pair rows of train/val with tr_labels/tst_labels from that
# other split. This only lines up if both calls shuffle identically, i.e. X/Y have
# one row per df_train row in the same order - confirm.
train, val = split(df_train, test_size=0.2, random_state=1234) 

In [38]:
train

Unnamed: 0,hasoc_id,original_text,preprocessed_text,task1,task2,task1_id,task2_id,graph
263,263,[USER] I got a sticker when I got my CovidVaccine I stuck it on my calendar to memorialise it. [USER] [USER] [URL],[USER] I got a sticker when I got my CovidVaccine I stuck it on my calendar to memorialise it. [USER] [USER] [URL],NOT,NONE,1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)"
988,988,"[USER] [USER] [USER] Those who invented the WuhanVirus ,helped it to spread across the world and killed millions of people shouldn't talk about it.We know your country also suffered a lot.The diff...","[USER] [USER] [USER] Those who invented the WuhanVirus ,helped it to spread across the world and killed millions of people shouldn't talk about it.We know your country also suffered a lot.The diff...",NOT,NONE,1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)"
641,641,ResignModi resinemodi Resign_PM_Modi Stop playing with our lives!! 🤧😷😷 [URL],ResignModi resinemodi Resign_PM_Modi Stop playing with our lives!! 🤧😷😷 [URL],HOF,HATE,0,0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)"
3637,3637,"BengalInFlames BengalBurning BengalViolence Bengalisburning Mamta Banerjee killing people who made a choice against her & raised Jai shree Ram slogan. Not just BJP, all her opponents are maimed. P...","BengalInFlames BengalBurning BengalViolence Bengalisburning Mamta Banerjee killing people who made a choice against her & raised Jai shree Ram slogan. Not just BJP, all her opponents are maimed. P...",HOF,OFFN,0,2,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)"
195,195,How do the citizens make Big Corporations and Governments Accountable for misusing their Powers in evil ways and yet they continue and get away with all they do ? Where does this damn thing end ? ...,How do the citizens make Big Corporations and Governments Accountable for misusing their Powers in evil ways and yet they continue and get away with all they do ? Where does this damn thing end ? ...,NOT,NONE,1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)"
...,...,...,...,...,...,...,...,...
3276,3276,Been single for 5 years now. I wanna date but this dating shit some huff,Been single for 5 years now. I wanna date but this dating shit some huff,NOT,NONE,1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)"
3125,3125,First it was ChinaVirus which made the world suffer terribly and now its ChinaRocket which can land anywhere any time. China is becoming a danger to human existence.,First it was ChinaVirus which made the world suffer terribly and now its ChinaRocket which can land anywhere any time. China is becoming a danger to human existence.,HOF,HATE,0,0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)"
1318,1318,"Poor administration by govt. lead us to this cruel situation right now. ""In india people are not die because of COVID19 they all are dieing because of not getting proper treatment at right time, t...","Poor administration by govt. lead us to this cruel situation right now. ""In india people are not die because of COVID19 they all are dieing because of not getting proper treatment at right time, t...",HOF,HATE,0,0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)"
723,723,"My son watching Donkey Hodie reminds me of my childhood hatred for this character. Just look at it. Someone designed this thing and said ""Yes, this is appropriate for small baby children to look a...","My son watching Donkey Hodie reminds me of my childhood hatred for this character. Just look at it. Someone designed this thing and said ""Yes, this is appropriate for small baby children to look a...",NOT,NONE,1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)"


In [39]:
train = train.rename(columns={'preprocessed_text': 'sentence', 'task2': 'label'})
val = val.rename(columns={'preprocessed_text': 'sentence', 'task2': 'label'})

In [276]:
train

Unnamed: 0,hasoc_id,original_text,sentence,task1,label,task1_id,task2_id,graph
263,263,[USER] I got a sticker when I got my CovidVaccine I stuck it on my calendar to memorialise it. [USER] [USER] [URL],[USER] I got a sticker when I got my CovidVaccine I stuck it on my calendar to memorialise it. [USER] [USER] [URL],NOT,NONE,1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)"
988,988,"[USER] [USER] [USER] Those who invented the WuhanVirus ,helped it to spread across the world and killed millions of people shouldn't talk about it.We know your country also suffered a lot.The diff...","[USER] [USER] [USER] Those who invented the WuhanVirus ,helped it to spread across the world and killed millions of people shouldn't talk about it.We know your country also suffered a lot.The diff...",NOT,NONE,1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)"
641,641,ResignModi resinemodi Resign_PM_Modi Stop playing with our lives!! 🤧😷😷 [URL],ResignModi resinemodi Resign_PM_Modi Stop playing with our lives!! 🤧😷😷 [URL],HOF,HATE,0,0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)"
3637,3637,"BengalInFlames BengalBurning BengalViolence Bengalisburning Mamta Banerjee killing people who made a choice against her & raised Jai shree Ram slogan. Not just BJP, all her opponents are maimed. P...","BengalInFlames BengalBurning BengalViolence Bengalisburning Mamta Banerjee killing people who made a choice against her & raised Jai shree Ram slogan. Not just BJP, all her opponents are maimed. P...",HOF,OFFN,0,2,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)"
195,195,How do the citizens make Big Corporations and Governments Accountable for misusing their Powers in evil ways and yet they continue and get away with all they do ? Where does this damn thing end ? ...,How do the citizens make Big Corporations and Governments Accountable for misusing their Powers in evil ways and yet they continue and get away with all they do ? Where does this damn thing end ? ...,NOT,NONE,1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)"
...,...,...,...,...,...,...,...,...
3276,3276,Been single for 5 years now. I wanna date but this dating shit some huff,Been single for 5 years now. I wanna date but this dating shit some huff,NOT,NONE,1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)"
3125,3125,First it was ChinaVirus which made the world suffer terribly and now its ChinaRocket which can land anywhere any time. China is becoming a danger to human existence.,First it was ChinaVirus which made the world suffer terribly and now its ChinaRocket which can land anywhere any time. China is becoming a danger to human existence.,HOF,HATE,0,0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)"
1318,1318,"Poor administration by govt. lead us to this cruel situation right now. ""In india people are not die because of COVID19 they all are dieing because of not getting proper treatment at right time, t...","Poor administration by govt. lead us to this cruel situation right now. ""In india people are not die because of COVID19 they all are dieing because of not getting proper treatment at right time, t...",HOF,HATE,0,0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)"
723,723,"My son watching Donkey Hodie reminds me of my childhood hatred for this character. Just look at it. Someone designed this thing and said ""Yes, this is appropriate for small baby children to look a...","My son watching Donkey Hodie reminds me of my childhood hatred for this character. Just look at it. Someone designed this thing and said ""Yes, this is appropriate for small baby children to look a...",NOT,NONE,1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)"


In [277]:
train.to_pickle("train_dataset")
val.to_pickle("val_dataset")

In [103]:
df

Unnamed: 0,hasoc_id,original_text,preprocessed_text,task1,task2,task1_id,task2_id,graph
0,0,"[USER] if you made it through this && were not only able to start making money for yourself but sustain living that way all from home, fuck these companies & corporate pigs. power to the people, a...","[USER] if you made it through this && were not only able to start making money for yourself but sustain living that way all from home, fuck these companies & corporate pigs. power to the people, a...",HOF,PRFN,0,3,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27)"
1,1,"Technically that's still turning back the clock, dick head [URL]","Technically that's still turning back the clock, dick head [URL]",HOF,OFFN,0,2,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)"
2,2,"[USER] [USER] [USER] [USER] [USER] [USER] [USER] [USER] And you're the govt?!?! Stop thinking about world media, liberal gangs or any optics whatsoever and ACT NOW already. If this is what a perso...","[USER] [USER] [USER] [USER] [USER] [USER] [USER] [USER] And you're the govt?!?! Stop thinking about world media, liberal gangs or any optics whatsoever and ACT NOW already. If this is what a perso...",NOT,NONE,1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28)"
3,3,[USER] Soldier of Japan Who has dick head,[USER] Soldier of Japan Who has dick head,HOF,OFFN,0,2,"(0, 1, 2, 3, 4, 5, 6, 7, 8)"
4,4,[USER] You'd be better off asking who DOESN'T think he's a sleazy shitbag lmao.,[USER] You'd be better off asking who DOESN'T think he's a sleazy shitbag lmao.,HOF,OFFN,0,2,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)"
...,...,...,...,...,...,...,...,...
3838,3838,[USER] Let the dog deal with the wanker once he's un armed ..,[USER] Let the dog deal with the wanker once he's un armed ..,HOF,PRFN,0,3,"(0, 1, 2, 3, 4, 5, 6, 7)"
3839,3839,India has suffered a lot. That Chinese bastard should pay the price. [USER] ChineseVirus,India has suffered a lot. That Chinese bastard should pay the price. [USER] ChineseVirus,HOF,HATE,0,0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)"
3840,3840,"People didn't give 300+ seats majority to BJP to see BengalBurning ... If they can't fix this, they hv no right to continue in office... Don't take our votes for granted [USER] ...","People didn't give 300+ seats majority to BJP to see BengalBurning ... If they can't fix this, they hv no right to continue in office... Don't take our votes for granted [USER] ...",HOF,HATE,0,0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)"
3841,3841,"[USER] This is such a vile, xenophobic and uneducated comment... Ia[CUR]™m struggling to believe someone thinks like this, let alone posted this?! Daylight Islamophobia and it should be stopped. E...","[USER] This is such a vile, xenophobic and uneducated comment... Ia[CUR]™m struggling to believe someone thinks like this, let alone posted this?! Daylight Islamophobia and it should be stopped. E...",HOF,PRFN,0,3,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24)"


## Simple Ngram model

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=2500, stop_words="english", lowercase=True, ngram_range=(1,3))

In [41]:
# fit() returns the vectorizer itself, so X is the fitted CountVectorizer, not a matrix.
# NOTE(review): this rebinds X, which earlier held the graph feature matrix from
# model.get_x_y; re-running the split(X, Y, ...) cell after this one would receive
# the vectorizer instead.
X = vectorizer.fit(train.sentence)

In [42]:
X_train = X.transform(train.sentence)
X_val = X.transform(val.sentence)

In [43]:
# Take the labels from the same frame the n-gram features were built from,
# instead of tr_labels, which comes from a separate split(X, Y, ...) call and
# only matches these rows when both splits shuffle identically.
# Assumes Y is df_train.task2 mapped through label_vocab in row order - TODO confirm.
train_label_ids = train.label.map(label_vocab)
clf2 = OneVsRestClassifier(RandomForestClassifier(random_state=0, class_weight="balanced_subsample")).fit(X_train, train_label_ids)

In [44]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
keys = ["NONE", "PRFN", "OFFN", "HATE"]
labels_to_result = {}
lr_pred2 = clf2.predict(X_val)
print(classification_report(tst_labels, lr_pred2, target_names=keys, output_dict=False))

              precision    recall  f1-score   support

        NONE       0.67      0.70      0.68       277
        PRFN       0.70      0.85      0.77       246
        OFFN       0.44      0.28      0.34       123
        HATE       0.48      0.41      0.44       123

    accuracy                           0.63       769
   macro avg       0.57      0.56      0.56       769
weighted avg       0.61      0.63      0.62       769



In [45]:
ngram = clf2.predict_proba(X_val)

In [46]:
amr = clf.predict_proba(tst_data)

In [47]:
soft_voted = ngram + amr

In [48]:
import numpy as np
preds = np.argmax(soft_voted, axis=1)

In [49]:
print(classification_report(tst_labels, preds, target_names=keys, output_dict=False))

              precision    recall  f1-score   support

        NONE       0.67      0.70      0.69       277
        PRFN       0.70      0.89      0.78       246
        OFFN       0.42      0.22      0.29       123
        HATE       0.50      0.41      0.45       123

    accuracy                           0.64       769
   macro avg       0.57      0.56      0.55       769
weighted avg       0.61      0.64      0.62       769



In [55]:
amr_predict = clf.predict(test_X)

In [60]:
ngram_predict = clf2.predict(X.transform(df_test.preprocessed_text))

In [61]:
soft_voted = clf2.predict_proba(X.transform(df_test.preprocessed_text)) + clf.predict_proba(test_X)

In [62]:
import numpy as np
test_preds = np.argmax(soft_voted, axis=1)

In [67]:
inverse_vocab = {v: k for k, v in label_vocab.items()}

In [70]:
# One row per test sentence, with the class name predicted by the graph model,
# the n-gram model and the soft vote; ids are mapped back through inverse_vocab.
test_predictions = pd.DataFrame({
    "sentence": df_test.preprocessed_text,
    **{
        column: [inverse_vocab[class_id] for class_id in class_ids]
        for column, class_ids in (
            ("graph_pred", amr_predict),
            ("ngram_pred", ngram_predict),
            ("soft_vote", test_preds),
        )
    },
})

In [72]:
test_predictions.to_csv("2021_hasoc_test_taskB.csv", sep='\t')

In [83]:
# Read one rule-based label per line, dropping the newline characters.
rule_labels = []
with open("2021_rule_labels") as handle:
    rule_labels.extend(raw.strip("\n") for raw in handle)

In [84]:
val_proba = clf2.predict_proba(X_val)

In [103]:
# Combine the n-gram probabilities with the rule labels: when the classifier's
# top class is id 0 (NONE in label_vocab) but the rule says "HOF", fall back
# to the second most probable class.
# NOTE(review): assumes rule_labels[i] belongs to the same example as val_proba[i].
rule_argmax = []
for i, proba in enumerate(val_proba):
    ranked = np.argsort(-proba)  # class ids, most probable first
    if ranked[0] == 0 and rule_labels[i] == "HOF":
        rule_argmax.append(ranked[1])
    else:
        rule_argmax.append(ranked[0])

In [104]:
print(classification_report(tst_labels, rule_argmax, target_names=keys, output_dict=False))

              precision    recall  f1-score   support

        NONE       0.67      0.70      0.68       277
        PRFN       0.70      0.85      0.77       246
        OFFN       0.42      0.27      0.33       123
        HATE       0.50      0.41      0.45       123

    accuracy                           0.63       769
   macro avg       0.57      0.56      0.56       769
weighted avg       0.61      0.63      0.62       769

