In [3]:
import csv
import random

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
import torchtext
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
import spacy

In [10]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [26]:
FILE_PATH = '../data/dontpatronizeme_pcl.tsv'

# Train and Test data split data
TRAIN_DATASET_IDX_PATH = '../data/train_test_split/train_semeval_parids-labels.csv'
DEV_DATASET_IDX_PATH = '../data/train_test_split/dev_semeval_parids-labels.csv'

SEED = 234

BATCH_SIZE = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
rows = []

with open(FILE_PATH) as f:
    for line in f.readlines()[4:]:
        par_id=int(line.strip().split('\t')[0])
        art_id = line.strip().split('\t')[1]
        keyword= line.strip().split('\t')[2]
        country= line.strip().split('\t')[3]
        text = line.strip().split('\t')[4]
        label = int(line.strip().split('\t')[-1])
        if label == 0 or label == 1:
            lbin = 0
        else:
            lbin = 1
        rows.append(
            {'par_id':par_id,
            'art_id':art_id,
            'keyword':keyword,
            'country':country,
            'text':text, 
            'label':lbin, 
            'orig_label':label
            }
            )

df=pd.DataFrame(rows, columns=['par_id', 'art_id', 'keyword', 'country', 'text', 'label', 'orig_label']) 

In [6]:
# Read indices of train and dev data
train_idx_df = pd.read_csv(TRAIN_DATASET_IDX_PATH)
dev_idx_df = pd.read_csv(DEV_DATASET_IDX_PATH)
print("Train idx\n", train_idx_df.dtypes)
print("Dev idx\n", dev_idx_df.dtypes)
print("Dataframe dtypes\n", df.dtypes)

Train idx
 par_id     int64
label     object
dtype: object
Dev idx
 par_id     int64
label     object
dtype: object
Dataframe dtypes
 par_id         int64
art_id        object
keyword       object
country       object
text          object
label          int64
orig_label     int64
dtype: object


In [7]:
df.iloc[0]['text']

"We 're living in times of absolute insanity , as I 'm pretty sure most people are aware . For a while , waking up every day to check the news seemed to carry with it the same feeling of panic and dread that action heroes probably face when they 're trying to decide whether to cut the blue or green wire on a ticking bomb -- except the bomb 's instructions long ago burned in a fire and imminent catastrophe seems the likeliest outcome . It 's hard to stay that on-edge for that long , though , so it 's natural for people to become inured to this constant chaos , to slump into a malaise of hopelessness and pessimism ."

In [21]:
from sklearn.model_selection import train_test_split

train_df = df[df['par_id'].isin(train_idx_df['par_id'].values)]
dev_df = df[df['par_id'].isin(dev_idx_df['par_id'].values)]


def filter_features(df, features):
    return df[features]

# TODO: Add more features, like keyword, country, etc.
train_df = filter_features(train_df, ['text', 'label'])
dev_df = filter_features(dev_df, ['text', 'label'])

print(f"Train dataset size: {len(train_df)} | Train idxs: {len(train_idx_df)}")
print(f"Dev (Test) dataset size: {len(dev_df)} | Train idxs: {len(dev_idx_df)}")
print(f"Original Dataset Size: {len(df)}")
assert len(df) == len(train_df) + len(dev_df)

train_data, valid_data = train_test_split(train_df, test_size=0.2, random_state=SEED)
test_data = dev_df

# Save train, dev and test data
train_data.to_csv('../data/train.tsv', sep='\t', index=False, quoting=csv.QUOTE_NONE, escapechar='\\')
valid_data.to_csv('../data/valid.tsv', sep='\t', index=False, quoting=csv.QUOTE_NONE, escapechar='\\')
test_data.to_csv('../data/dev.tsv', sep='\t', index=False, quoting=csv.QUOTE_NONE, escapechar='\\')

Train dataset size: 8375 | Train idxs: 8375
Dev (Test) dataset size: 2094 | Train idxs: 2094
Original Dataset Size: 10469


In [22]:
# Parse dataframes to torchtext datasets
spacy_en = spacy.load('en_core_web_sm')
spacy_stop_words = spacy.lang.en.stop_words.STOP_WORDS
print(spacy_stop_words)

def tokenizer(text): # create a custom tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

# TODO: Add more features such as country, keyword, etc.
text_field = Field(tokenize=tokenizer, lower=True, stop_words=spacy_stop_words)
label_field = Field(sequential=False, use_vocab=False) # we set sequential to false as we don't tokenise our labels


# order should match the columns order in our csv/tsv file
# if no processing was required, we set None
data_fields = [('Text', text_field), ('Label', label_field)]

# We will load our csv files into Dataset objects 
train, val, test = TabularDataset.splits(
    path = '../data/',
    train = 'train.tsv',
    validation = 'valid.tsv',
    test = 'dev.tsv',
    format = 'tsv',
    fields = data_fields,
    skip_header = True
)

# possible dimensions for glove embeddings
EMBEDDING_DIM = [25, 50, 100, 200, 300]

text_field.build_vocab(train,max_size=25000, vectors=f"glove.6B.{EMBEDDING_DIM[1]}d")
label_field.build_vocab(train) 


{'him', 'somewhere', "'s", '’ll', 'ten', "'ve", 'less', '‘ll', 'either', 'no', 'regarding', 'therein', 'whither', 'whoever', 'nobody', 'all', 'they', 'why', 'becoming', 'if', 'only', "'re", 'as', 'any', 'can', 'hereby', 'their', 'wherein', '’d', 'whether', 'also', 'our', 'empty', 'one', 'seems', 'somehow', 'top', 'fifty', 'not', 'thence', 'third', 'both', 'nor', 'neither', 'above', 'almost', 'themselves', 'show', '‘re', 'wherever', 'part', 'upon', 'many', 'former', 'whereas', '‘m', 'about', 'more', 'already', 'during', 'whose', 'put', 'has', 'whence', 'together', 'towards', 'me', 'never', "n't", 'another', 'and', 'forty', 'you', 'he', 'or', 'anyone', 'for', 'side', 'amount', 'until', 'an', 'see', 'thru', 'eight', 'were', 'what', 'last', 'such', 'further', 'via', '’s', 'else', 'hence', 'every', 're', 'ourselves', 'around', 'i', 'elsewhere', 'was', 'through', 'really', 'been', 'hers', "'ll", 'first', 'hereafter', 'do', 'doing', 'two', 'twelve', 'is', 'although', 'which', 'three', 'beyond

In [28]:
train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test),
        batch_sizes= (BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),
        sort_key=lambda x: len(x.Text), device=device)