In [27]:
import pandas as pd
import os
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import os
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import roc_auc_score
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import string
from nltk.corpus import stopwords

from sklearn.feature_extraction import text

# Fetch the NLTK stop-word corpus and keep the English entries as a set;
# `stops` is what filter_stops() tests words against further down.
nltk.download('stopwords')
stops = set(stopwords.words('english'))

# DataFrame display: no column-width limit, and up to 500 rows.
for _option, _value in (('display.max_colwidth', None), ('display.max_rows', 500)):
    pd.set_option(_option, _value)


# Metadata

In [17]:
# Annotation metadata, indexed by file id so rows can be looked up with
# metadata.loc[file_id, ...].
metadata = pd.read_csv(
    'data/annotations_metadata.csv',
    index_col='file_id',
)

In [18]:
# File ids present in each data folder: the part of every file name that
# precedes its first '.'.
train_list_file_id, test_list_file_id, all_list_file_id = (
    [entry.split('.')[0] for entry in os.listdir(folder)]
    for folder in ('data/sampled_train/', 'data/sampled_test/', 'data/all_files/')
)

In [19]:
def apply_fn(x):
    """Return the split name for one metadata row.

    The row's ``name`` (its index label, i.e. the file id) is looked up in the
    sampled-train, sampled-test and all-files listings, in that order; the
    first listing that contains it decides the result. Rows found in none of
    the listings yield ``None``.
    """
    file_id = x.name
    split_by_listing = (
        (train_list_file_id, 'sample_train'),
        (test_list_file_id, 'sample_test'),
        (all_list_file_id, 'all'),
    )
    for listing, split_name in split_by_listing:
        if file_id in listing:
            return split_name
    return None
    
# Row-wise application: apply_fn receives each row and reads its index label.
metadata['split'] = metadata.apply(apply_fn, axis='columns')

In [20]:
def apply_fn(x):
    """Return the relative path of the ``.txt`` file behind one metadata row.

    The row's ``name`` (its index label, i.e. the file id) is searched in the
    sampled-train, sampled-test and all-files listings, in that order; the
    first match selects the folder. Rows found in none of them yield ``None``.
    """
    file_id = x.name
    if file_id in train_list_file_id:
        folder = './data/sampled_train/'
    elif file_id in test_list_file_id:
        folder = './data/sampled_test/'
    elif file_id in all_list_file_id:
        folder = './data/all_files/'
    else:
        return None
    return folder + file_id + '.txt'
    
# Row-wise application: apply_fn receives each row and reads its index label.
metadata['file_path'] = metadata.apply(apply_fn, axis='columns')

In [21]:
# One metadata subset per value of the 'split' column.
meta_train = metadata.loc[metadata['split'].eq('sample_train')]
meta_test = metadata.loc[metadata['split'].eq('sample_test')]
meta_all = metadata.loc[metadata['split'].eq('all')]

# Data

## Preprocessing

In [22]:
# Split name -> list of text-file paths, in metadata row order.
dirs = {
    split_name: list(split_meta.file_path)
    for split_name, split_meta in (('train', meta_train), ('test', meta_test))
}

In [23]:
# Raw text of the train/test files listed in `dirs`, loaded with the 'text' builder.
datasets = load_dataset('text', data_files=dirs)

# Tokenizer used by tokenize_function() further down.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')




In [24]:
def add_file_name(ex, idx, file_path):
    """Store the id of the file at position ``idx`` of ``file_path`` on ``ex``.

    The id is the last '/'-separated component of the path with its final
    four characters removed (the '.txt' suffix of the paths built in this
    notebook). The example is modified in place and returned.

    NOTE(review): this pairs example ``idx`` with path ``idx``, which presumes
    every file contributes exactly one example to the dataset — confirm.
    """
    base_name = file_path[idx].rsplit('/', 1)[-1]
    ex['file_id'] = base_name[:-4]
    return ex

# Tag every example with the id of the file it was read from.
train_ds = datasets['train'].map(
    add_file_name,
    with_indices=True,
    fn_kwargs={'file_path': dirs['train']},
)
test_ds = datasets['test'].map(
    add_file_name,
    with_indices=True,
    fn_kwargs={'file_path': dirs['test']},
)


def _label_from_metadata(example):
    """Look up the annotation label for the example's file id."""
    return {'label': metadata.loc[example['file_id'], 'label']}


# Attach the label stored in the metadata table.
train_ds = train_ds.map(_label_from_metadata)
test_ds = test_ds.map(_label_from_metadata)

In [None]:
def label_to_int(ex):
    """Encode the example's ``label`` as an integer class id.

    ``'noHate'`` becomes 0 and ``'hate'`` becomes 1. The function is
    idempotent: a label that is already 0 or 1 is left untouched, so mapping
    it over the same dataset twice (this notebook does so in two cells) no
    longer raises ``KeyError``.

    Raises:
        KeyError: if the label is neither a known name nor an encoded id.
    """
    label_map = {'noHate': 0, 'hate': 1}
    label = ex['label']
    # Already encoded by an earlier pass: nothing to do.
    if not isinstance(label, str) and label in label_map.values():
        return ex
    ex['label'] = label_map[label]
    return ex

def len_text(ex):
    """Store the length of ``ex['input_ids']`` minus two as ``ex['len_toks']``.

    The example is modified in place and returned. The two subtracted
    positions are presumably the special tokens added by the tokenizer —
    confirm.
    """
    id_count = len(ex['input_ids'])
    ex['len_toks'] = id_count - 2
    return ex

# Encode the string labels as integers for both splits.
#
# Token lengths are deliberately NOT computed here: len_text() reads
# ex['input_ids'], which only exists once tokenize_function() has been mapped
# over the datasets. That happens in the next cell, which then calls
# len_toks(); mapping len_text at this point raised KeyError on a fresh kernel.
train_ds = train_ds.map(label_to_int)
test_ds = test_ds.map(label_to_int)

In [10]:

def tokenize_function(ex):
    """Run the notebook-level ``tokenizer`` on ``ex['text']``.

    The tokenizer is called with padding enabled, truncation enabled and a
    maximum length of 512; its result is returned unchanged.
    """
    tokenizer_options = dict(padding=True, truncation=True, max_length=512)
    return tokenizer(ex['text'], **tokenizer_options)


def len_toks(ex):
    """Store the length of ``ex['input_ids']`` minus two as ``ex['len_toks']``.

    The example is modified in place and returned. The two subtracted
    positions are presumably the special tokens added by the tokenizer —
    confirm.
    """
    n_ids = len(ex['input_ids'])
    ex['len_toks'] = n_ids - 2
    return ex

# Same three steps for both splits: encode the label, tokenize one example at
# a time, then record the token count.
train_ds = (
    train_ds
    .map(label_to_int)
    .map(tokenize_function, batched=False)
    .map(len_toks)
)

test_ds = (
    test_ds
    .map(label_to_int)
    .map(tokenize_function, batched=False)
    .map(len_toks)
)

In [11]:

# Persist both preprocessed splits under data/.
for _dataset, _target_dir in ((train_ds, "data/train_ds"), (test_ds, "data/test_ds")):
    _dataset.save_to_disk(_target_dir)

Saving the dataset (1/1 shards): 100%|██████████| 1914/1914 [00:00<00:00, 39636.11 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 478/478 [00:00<00:00, 11064.08 examples/s]


# Exploration

## Length of docs

In [274]:
# Materialise both splits as DataFrames for the exploration below.
train_df, test_df = pd.DataFrame(train_ds), pd.DataFrame(test_ds)

In [275]:
# Histogram binning shared by the train and test traces.
bin_size = 3
_length_columns = (train_df['len_toks'], test_df['len_toks'])
# First bin starts at the smallest token count found in either split.
x_start = min(min(column) for column in _length_columns)
# Last bin ends one bin width past the largest token count found in either split.
x_end = max(max(column) for column in _length_columns) + bin_size

In [276]:
fig = go.Figure()

# One semi-transparent histogram per split, both with identical bins so the
# overlaid bars line up.
for _trace_name, _lengths in (('Train Set', train_df['len_toks']),
                              ('Test Set', test_df['len_toks'])):
    fig.add_trace(go.Histogram(
        x=_lengths,
        name=_trace_name,
        opacity=0.75,
        histnorm='percent',
        xbins=dict(start=x_start, end=x_end, size=bin_size),
    ))

fig.update_layout(
    title_text='Comparison Train/Test Distributions',  # title of plot
    barmode='overlay',  # draw the two histograms on top of each other
    xaxis_title_text='Tokens per document (len_toks)',  # x-axis label
    # histnorm='percent' makes the bar heights percentages, not raw counts.
    yaxis_title_text='Percent of documents',  # y-axis label
)
fig.show()

## Most common words

In [280]:
def filter_stops(x):
    """Lower-case every word of ``x`` and drop those present in ``stops``.

    Returns a new list of the lower-cased words that survive, in their
    original order.
    """
    kept = []
    for lowered in (word.lower() for word in x):
        if lowered in stops:
            continue
        kept.append(lowered)
    return kept

def filter_punct(x):
    """Drop every word that contains at least one ASCII punctuation character.

    This is the filter the original loop computed into ``processed`` but never
    returned. The previous return value instead used
    ``word not in string.punctuation``, a substring test against the
    punctuation string, so it only removed tokens that happen to be a
    contiguous slice of it and kept tokens such as ``'...'`` or ``'!!'``.

    Words with embedded or trailing punctuation (``"don't"``, ``'word,'``) are
    removed as a whole, not stripped.

    Args:
        x: iterable of word strings.

    Returns:
        A new list with the punctuation-free words, in their original order.
    """
    punctuation_chars = set(string.punctuation)
    return [word for word in x if punctuation_chars.isdisjoint(word)]

In [281]:

# Whitespace-split each text, then remove stop words and punctuation tokens.
for _frame in (train_df, test_df):
    _frame['filtered_words'] = _frame['text'].str.split().apply(
        lambda words: filter_punct(filter_stops(words))
    )

In [282]:
# Concatenate the per-document word lists into one flat list per split.
flattened_train = []
for _words in train_df['filtered_words']:
    flattened_train.extend(_words)

flattened_test = []
for _words in test_df['filtered_words']:
    flattened_test.extend(_words)

In [283]:
from collections import Counter

# Word frequencies per split.
train_counts = Counter(flattened_train)
test_counts = Counter(flattened_test)

# Keep the 50 most frequent words, highest count first: sort ascending by
# count, reverse, then truncate.
sorted_train = dict(sorted(train_counts.items(), key=lambda item: item[1])[::-1][:50])
sorted_test = dict(sorted(test_counts.items(), key=lambda item: item[1])[::-1][:50])

In [284]:
import plotly.express as px

# Bar chart of the most frequent words in the train split.
fig = px.bar(
    x=sorted_train.keys(),
    y=sorted_train.values(),
    labels=dict(x='vocabulary', y='count'),
    title='Train Vocabulary count',
)

# Render the figure.
fig.show()

In [285]:
import plotly.express as px

# Bar chart of the most frequent words in the test split. The title names the
# split so it can be told apart from the train chart.
fig = px.bar(
    x=sorted_test.keys(),
    y=sorted_test.values(),
    labels={'x': 'vocabulary', 'y': 'count'},
    title='Test Vocabulary count',
)

# Show the plot
fig.show()

## Words by category

In [289]:
# Flat word lists per class (label 1 and label 0), train documents first and
# test documents after them.
flattened_Hate, flattened_Nohate = [], []
for _target, _label in ((flattened_Hate, 1), (flattened_Nohate, 0)):
    for _frame in (train_df, test_df):
        for _words in list(_frame[_frame.label == _label].filtered_words):
            _target.extend(_words)

In [290]:
from collections import Counter

# Word frequencies per class.
Hate_counts = Counter(flattened_Hate)
Nohate_counts = Counter(flattened_Nohate)

# Keep the 50 most frequent words, highest count first: sort ascending by
# count, reverse, then truncate.
sorted_Hate = dict(sorted(Hate_counts.items(), key=lambda item: item[1])[::-1][:50])
sorted_Nohate = dict(sorted(Nohate_counts.items(), key=lambda item: item[1])[::-1][:50])

In [292]:
import plotly.express as px

# Bar chart of the most frequent words in documents labelled 1 (hate).
# This cell previously re-plotted sorted_train with the train title, so the
# sorted_Hate counts computed above were never shown.
fig = px.bar(
    x=sorted_Hate.keys(),
    y=sorted_Hate.values(),
    labels={'x': 'vocabulary', 'y': 'count'},
    title='Hate Vocabulary count',
)

# Show the plot
fig.show()

In [293]:
import plotly.express as px

# Bar chart of the most frequent words in documents labelled 0 (noHate).
# The title names the class so the figure can be read on its own.
fig = px.bar(
    x=sorted_Nohate.keys(),
    y=sorted_Nohate.values(),
    labels={'x': 'vocabulary', 'y': 'count'},
    title='NoHate Vocabulary count',
)

# Show the plot
fig.show()