<a href="https://colab.research.google.com/github/Dutta-SD/NLP/blob/master/Aggression_Detection/Experiments/Agg_and_Misgoyny_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aggression Experiments

Aggression_Detection/Experiments/

In [1]:
import pandas as pd
import numpy as np
from sklearn import (
    metrics, 
    ensemble, 
    feature_extraction, 
    pipeline,
    preprocessing,
)
import xgboost
import nltk
import string
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Global Variable Declaration

In [2]:
BASE_DIR = 'https://raw.githubusercontent.com/Dutta-SD/NLP/master'
TRAIN_URL_TASK_1 = f'{BASE_DIR}/Aggression_Detection/Aug_Data_Aggression/TASK_A_train_aug_english.csv'
TRAIN_URL_TASK_2 = f'{BASE_DIR}/Aggression_Detection/Aug_Data_Aggression/TASK_B_train_aug_english.csv'

VAL_URL = f'{BASE_DIR}/Aggression_Detection/trac2_eng_dev.csv'
task_1_map ={
    'NAG' : 0,
    'CAG' : 1,
    'OAG' : 2
}

task_2_map = {
    'NGEN' : 0,
    'GEN' : 1
}

# Utility functions

In [3]:
def seed_all(x):
  np.random.seed(x)
  
seed_all(0)

In [4]:
def clean_one_text(text):
    # Cleans one text and returns it    
    
    # remove punctuation
    filter_str = string.punctuation.replace("\'", "")

    new_string = text.translate(str.maketrans('', '', filter_str))
    tk = nltk.TweetTokenizer()

    s = set(nltk.corpus.stopwords.words('english'))
    # n't words
    rexp_1 = re.compile(r"n't")
    not_words = set(filter(rexp_1.findall, s))
    not_words.update(('against', 'no', 'nor', 'not'))

    s.difference_update(not_words)

    stmr = nltk.stem.porter.PorterStemmer()
    tokens = [token for token in tk.tokenize(new_string) if token.lower() not in s]
    clean_tokens = [stmr.stem(token) for token in tokens]
    text = ' '.join(clean_tokens)
    return text

In [5]:
def get_clean_dataset(
    df_raw,
    target_mapping,
    train = True,
    task_name='A', 
    string_cleaner=None,
    seed = 0):
  '''
  ===============================================================
  get_clean_dataset - cleans the dataset, returns text and labels
  ===============================================================

  :df_raw - pandas dataframe for cleaning
  :target_mapping - map for the targets
  :train - flag to see if training data sent or not
  :task_name - the target to predict
  :preprocessor - preprocesses the string
  :string_cleaner - useful for removing punctuation, etc(function)
  '''
  
  #Shuffle
  df_raw = df_raw.sample(frac=1).reset_index()

  col_str = f'Sub-task {task_name}'

  if 'ID' in df_raw.columns:
    df_raw = df_raw.drop(['ID'], axis = 1)

  targets = df_raw[col_str].map(target_mapping).values
  text = df_raw['Text'].values.astype('str')

  if string_cleaner is not None:
    v_cleaner = np.vectorize(string_cleaner)
    text = v_cleaner(text)

  return text, targets

In [6]:
def get_cf_pipe(c_weights, seed = 0):
    cf_model = xgboost.XGBClassifier(
        max_depth = 4, 
        gamma = 0.1 ,
        scale_pos_weight=c_weights, 
        random_state=seed,
        n_jobs = -1
    )
    cf_pipe = pipeline.Pipeline(
        [
            ('feature-extractor', feature_extraction.text.TfidfVectorizer()),
            ('norm', preprocessing.Normalizer()),
            ('pow-trans', preprocessing.QuantileTransformer(output_distribution='normal')),
            ('classifier', cf_model)     
        ],
        verbose = True
    )
    return cf_pipe

In [7]:
def get_data_and_train_model(
    task_name,    
    target_map,
    string_cleaner,
    seed = 0,
    verbose = True,
    ):
    '''
    Returns the trained model
    '''

    # train_data_url
    train_url = None
    if task_name == 'A':
        train_url = TRAIN_URL_TASK_1
    else:
        train_url = TRAIN_URL_TASK_2
    
    if verbose:
        print('URL SELECTED')    

    # Get data
    df = pd.read_csv(train_url)

    if verbose:
        print(f'FETCHED DATA FROM {train_url}')

    # Clean data
    X_train, y_train = get_clean_dataset(
        df,
        target_map,
        task_name=task_name,
        string_cleaner=string_cleaner,
        seed=seed
    )

    # Get class weights
    weights = np.bincount(y_train)
    weights = {i : weights.sum() / weights[i] for i in range(len(weights))}

    # Otherwise XGBOOST Throws error
    if len(weights) == 2:
        weights = weights[1] / weights[0]

    if verbose:
        print('CLEANING DONE')

    cf_pipe = get_cf_pipe(weights, seed=seed)

    # Fit the model
    if verbose:
        print('FITTING...')
    cf_pipe.fit(X_train, y_train)

    if verbose:
        print('TRAINING DONE')    

    return {
        'model' : cf_pipe,
    }

In [8]:
def validate(
    clf_pipe,
    task_name,
    target_map,
    string_cleaner,
    seed=0,
    verbose=True
    ):
    # Returns Score
    if verbose:
        print('Validating...')

    val_url = VAL_URL
    df = pd.read_csv(val_url)

    X_val, y_val = get_clean_dataset(
        df,
        target_map,
        train=False,
        task_name = task_name,
        string_cleaner=string_cleaner,
        seed=seed
    )

    y_preds = clf_pipe.predict(X_val)
    y_true = y_val

    # Return classification report
    return {
        'f1_weighted' : metrics.f1_score(y_true, y_preds, average='weighted')
    }

# Training

In [9]:
# Train for task 1
# Task - 1
# Train
clf_pipe_task_1 = get_data_and_train_model(
    'A',
    task_1_map,
    clean_one_text,
)['model']

# Validate
validate(
    clf_pipe_task_1,
    'A',
    task_1_map,
    clean_one_text,
)

URL SELECTED
FETCHED DATA FROM https://raw.githubusercontent.com/Dutta-SD/NLP/master/Aggression_Detection/Aug_Data_Aggression/TASK_A_train_aug_english.csv
CLEANING DONE
FITTING...
[Pipeline] . (step 1 of 4) Processing feature-extractor, total=   0.2s
[Pipeline] .............. (step 2 of 4) Processing norm, total=   0.0s
[Pipeline] ......... (step 3 of 4) Processing pow-trans, total=  10.3s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   6.8s
TRAINING DONE
Validating...


{'f1_weighted': 0.7650582678981483}

In [10]:
# Train for task 2
# Task - 2
# Train
clf_pipe_task_2 = get_data_and_train_model(
    'B',
    task_2_map,
    clean_one_text,
)['model']

# Validate
validate(
    clf_pipe_task_2,
    'B',
    task_2_map,
    clean_one_text,
)

URL SELECTED
FETCHED DATA FROM https://raw.githubusercontent.com/Dutta-SD/NLP/master/Aggression_Detection/Aug_Data_Aggression/TASK_B_train_aug_english.csv
CLEANING DONE
FITTING...
[Pipeline] . (step 1 of 4) Processing feature-extractor, total=   0.1s
[Pipeline] .............. (step 2 of 4) Processing norm, total=   0.0s
[Pipeline] ......... (step 3 of 4) Processing pow-trans, total=   9.1s
[Pipeline] ........ (step 4 of 4) Processing classifier, total=   1.8s
TRAINING DONE
Validating...


{'f1_weighted': 0.9132994360640823}