# Aggression Experiments

In [None]:
! pip install transformers -qqq

[K     |████████████████████████████████| 2.2MB 7.7MB/s 
[K     |████████████████████████████████| 870kB 41.3MB/s 
[K     |████████████████████████████████| 3.3MB 28.8MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
def create_aug_pipeline(model_name):
  '''
  ===============================================================
  create_aug_pipeline - builds a transformers pipeline for a model
  ===============================================================

  :model_name - model identifier handed to transformers.pipeline as `model`

  Returns whatever transformers.pipeline returns for that model.
  NOTE(review): the intended task (text augmentation?) is not stated anywhere
  in this notebook - confirm and pass `task=` explicitly if it is known.
  '''
  # Imported locally: the notebook never imports `pipeline`, so the previous
  # bare `pipeline()` call raised NameError and also ignored `model_name`.
  from transformers import pipeline

  return pipeline(model=model_name)

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics, ensemble, svm, feature_extraction, naive_bayes, neural_network
import xgboost

In [None]:
# Read the train and dev CSVs (trac2_eng_train / trac2_eng_dev) directly from GitHub.
# Both reads need network access every time this cell runs.
train = pd.read_csv('https://raw.githubusercontent.com/Dutta-SD/NLP/master/Aggression_Detection/trac2_eng_train.csv')
val = pd.read_csv('https://raw.githubusercontent.com/Dutta-SD/NLP/master/Aggression_Detection/trac2_eng_dev.csv')
# Last expression of the cell: show the first rows of the training frame.
train.head()

Unnamed: 0,ID,Text,Sub-task A,Sub-task B
0,C45.451,Next part,NAG,NGEN
1,C47.11,Iii8mllllllm\nMdxfvb8o90lplppi0005,NAG,NGEN
2,C33.79,🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...,NAG,NGEN
3,C4.1961,What the fuck was this? I respect shwetabh and...,NAG,NGEN
4,C10.153,Concerned authorities should bring arundathi R...,NAG,NGEN


In [None]:
def seed_all(seed=0):
  '''
  ===============================================================
  seed_all - seeds the random number generators for reproducibility
  ===============================================================

  :seed - integer seed; defaults to 0, the value that used to be hard-coded
  '''
  # Local import: the notebook's import cell does not bring in `random`.
  import random

  random.seed(seed)
  np.random.seed(seed)

seed_all()

In [None]:
def get_clean_dataset(
    df_raw,
    train = True,
    task_name='A',
    other_target = 'B',
    target_mapping = None,
    vectorizer=None,
    text_cleaner=None):
  '''
  ===============================================================
  get_clean_dataset - cleans the dataset, returns text and labels
  ===============================================================

  :df_raw - pandas dataframe holding a 'Text' column and a 'Sub-task <task_name>' column
  :train - if True the vectorizer is fitted on the text (fit_transform),
           otherwise the already fitted vectorizer is applied (transform)
  :task_name - the target to predict; the label column is 'Sub-task <task_name>'
  :other_target - other target column; accepted for backward compatibility, not used
  :target_mapping - compulsory mapping from raw label to encoded label
  :vectorizer - optional object with fit_transform / transform, vectorizes the text
  :text_cleaner - optional function applied to the array of texts (punctuation removal, etc.)

  Returns (text, targets): the (optionally cleaned / vectorized) text and the
  encoded labels as an array.

  Raises ValueError when a label of the target column is missing from target_mapping.
  '''
  # Compulsory
  assert target_mapping is not None, "NO TARGET MAPPING FOUND"

  col_str = f'Sub-task {task_name}'

  mapped = df_raw[col_str].map(target_mapping)

  # Series.map turns labels absent from the mapping into NaN, which silently
  # produces float targets; fail loudly here instead of later in bincount / fit.
  unmapped = mapped.isna().values
  if unmapped.any():
    unknown = sorted({str(label) for label in df_raw[col_str].values[unmapped]})
    raise ValueError(
        f"labels missing from target_mapping for '{col_str}': {unknown}")

  targets = mapped.values
  text = df_raw['Text'].values

  if text_cleaner:
    text = text_cleaner(text)

  if vectorizer:
    if train:
      text = vectorizer.fit_transform(text)
    else:
      text = vectorizer.transform(text)

  return text, targets

In [None]:
# Per-class training counts for Sub-task A, ordered NAG, CAG, OAG so that position i
# matches the integer codes of task_1_map (NAG=0, CAG=1, OAG=2).
# `train` is the raw DataFrame, so the previous `train[1]` lookup raised KeyError.
s = train['Sub-task A'].value_counts().reindex(['NAG', 'CAG', 'OAG'], fill_value=0).values

In [None]:
# Sub-task A (NAG / CAG / OAG): fit several classifiers on TF-IDF features
# and print a classification report on the dev split for each of them.
task_1_map ={
    'NAG' : 0,
    'CAG' : 1,
    'OAG' : 2
}
v1 = feature_extraction.text.CountVectorizer()
v2 = feature_extraction.text.TfidfVectorizer()

# The vectorizer is fitted on the training split only; the dev split reuses it.
train_clean = get_clean_dataset(train, True,'A','B', task_1_map, vectorizer=v2)
val_clean = get_clean_dataset(val, False, 'A', 'B', task_1_map, vectorizer=v2)

# Class counts come from the encoded training labels inside this cell (as the
# Sub-task B cell does) instead of relying on a variable left by another cell.
s = np.bincount(train_clean[1])

# Inverse-frequency class weights: rarer classes get larger weights.
weights = {i : s.sum() / s[i] for i in range(3)}

# XGBoost has no class_weight option, and scale_pos_weight is a single float for
# binary problems (it was wrongly given the dict above). For this 3-class task
# the class weights are expanded to one weight per training row instead.
sample_weights = np.array([weights[label] for label in train_clean[1]])

model1 = ensemble.RandomForestClassifier(class_weight=weights, random_state=0, criterion='entropy')
model2 = svm.LinearSVC(class_weight=weights, random_state=0)
model3 = xgboost.XGBClassifier(random_state=0)
model4 = neural_network.MLPClassifier(random_state=0, verbose=True, learning_rate='adaptive',max_iter=5 )

model_list = [model1, model2, model3, model4]

print(f"Vectorizer used : {type(v2).__name__}", end="\n\n")

for i, model in enumerate(model_list):
  print(f"model no {i}, training")
  print(f"model name {type(model).__name__}")
  # Only the XGBoost model receives per-row weights; the others are either
  # weighted through class_weight or (MLP) trained unweighted as before.
  fit_params = {'sample_weight': sample_weights} if isinstance(model, xgboost.XGBClassifier) else {}
  preds = model.fit(train_clean[0], train_clean[1], **fit_params).predict(val_clean[0])
  true_preds = val_clean[1]
  print(metrics.classification_report(true_preds, preds))
  print(f"\nDone with model {i}")

Vectorizer used : TfidfVectorizer

model no 0, training
model name RandomForestClassifier
              precision    recall  f1-score   support

           0       0.80      1.00      0.89       836
           1       1.00      0.01      0.02       117
           2       0.88      0.12      0.22       113

    accuracy                           0.80      1066
   macro avg       0.89      0.38      0.37      1066
weighted avg       0.83      0.80      0.72      1066


Done with model 0
model no 1, training
model name LinearSVC
              precision    recall  f1-score   support

           0       0.89      0.91      0.90       836
           1       0.40      0.37      0.38       117
           2       0.55      0.50      0.53       113

    accuracy                           0.80      1066
   macro avg       0.61      0.59      0.60      1066
weighted avg       0.80      0.80      0.80      1066


Done with model 1
model no 2, training
model name XGBClassifier
              precisio



In [None]:
# Sub-task B (NGEN / GEN): same experiment as above on the binary target.
task_2_map = {'NGEN' : 0, 'GEN' : 1}

v1 = feature_extraction.text.CountVectorizer()
v2 = feature_extraction.text.TfidfVectorizer()

# Training split fits the vectorizer, dev split only applies it.
train_clean = get_clean_dataset(
    train,
    train=True,
    task_name='B',
    other_target='A',
    target_mapping=task_2_map,
    vectorizer=v2,
)
val_clean = get_clean_dataset(
    val,
    train=False,
    task_name='B',
    other_target='A',
    target_mapping=task_2_map,
    vectorizer=v2,
)

# Number of training rows per encoded class.
s_2 = np.bincount(train_clean[1])

# Inverse-frequency weight per class, plus the negative/positive ratio for XGBoost.
weights_2 = {label : s_2.sum() / s_2[label] for label in range(2)}
w = s_2[0] / s_2[1]

model1 = ensemble.RandomForestClassifier(
    criterion='entropy',
    class_weight=weights_2,
    random_state=0,
)
model2 = svm.LinearSVC(random_state=0, class_weight=weights_2)
model3 = xgboost.XGBClassifier(random_state=0, scale_pos_weight=w)
model4 = neural_network.MLPClassifier(
    max_iter=5,
    learning_rate='adaptive',
    verbose=True,
    random_state=0,
)

model_list = [model1, model2, model3, model4]

print(f"Vectorizer used : {type(v2).__name__}", end="\n\n")

for i, model in enumerate(model_list):
  print(f"model no {i}, training")
  print(f"model name {type(model).__name__}")
  # Train on the training split, then score the dev split.
  model.fit(train_clean[0], train_clean[1])
  preds = model.predict(val_clean[0])
  true_preds = val_clean[1]
  print(metrics.classification_report(true_preds, preds))
  print(f"\nDone with model {i}")

Vectorizer used : TfidfVectorizer

model no 0, training
model name RandomForestClassifier
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       993
           1       0.20      0.01      0.03        73

    accuracy                           0.93      1066
   macro avg       0.57      0.50      0.49      1066
weighted avg       0.88      0.93      0.90      1066


Done with model 0
model no 1, training
model name LinearSVC
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       993
           1       0.52      0.44      0.47        73

    accuracy                           0.93      1066
   macro avg       0.74      0.70      0.72      1066
weighted avg       0.93      0.93      0.93      1066


Done with model 1
model no 2, training
model name XGBClassifier
              precision    recall  f1-score   support

           0       0.97      0.90      0.93       993
           1       0.

  _warn_prf(average, modifier, msg_start, len(result))
