In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Setup (run once if the package is missing): %pip install transformers

In [3]:
# Setup (run once if the package is missing): %pip install ipywidgets

In [2]:
from transformers import BertTokenizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint; process_text
# (defined further down) calls tokenizer.tokenize on each text.
# NOTE(review): presumably fetches/caches the checkpoint files on first use —
# confirm behaviour when running offline.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Read the training data, keeping only the columns listed below.
df = pd.read_csv(
    "../data/training_data.csv",
    usecols=[
        'id',
        'text',
        'user',
        'user_verified',
        'user_followers_count',
        'user_friends_count',
        'retweet_count',
        'fav_count',
        'hashtags',
        'target',
    ],
)

In [5]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12756 entries, 0 to 12755
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    12756 non-null  int64 
 1   text                  12756 non-null  object
 2   user                  12756 non-null  object
 3   user_verified         12756 non-null  bool  
 4   user_followers_count  12756 non-null  int64 
 5   user_friends_count    12756 non-null  int64 
 6   retweet_count         12756 non-null  int64 
 7   fav_count             12756 non-null  int64 
 8   hashtags              12756 non-null  object
 9   target                12756 non-null  object
dtypes: bool(1), int64(5), object(4)
memory usage: 909.5+ KB


In [6]:
# Preview the first rows. In the output shown, `target` appears to hold a
# stringified pandas Series (e.g. "4 racism\nName: types, dtype: object"),
# which the following cells parse into a plain class name.
df.head()

Unnamed: 0,id,text,user,user_verified,user_followers_count,user_friends_count,retweet_count,fav_count,hashtags,target
0,572332655397629952,These girls are the equivalent of the irritati...,JL_Whitaker,False,2189,2186,0,2,"[{'text': 'MKR', 'indices': [95, 99]}]","4 racism\nName: types, dtype: object"
1,572341498827522049,Drasko they didn't cook half a bird you idiot ...,trish2295,False,14,62,0,4,"[{'text': 'mkr', 'indices': [46, 50]}]","607 neither\nName: Expert, dtype: object"
2,572340476503724032,Hopefully someone cooks Drasko in the next ep ...,foodbling,False,3432,2529,0,2,"[{'text': 'MKR', 'indices': [49, 53]}]","6035 neither\nName: Expert, dtype: object"
3,572334712804384768,of course you were born in serbia...you're as ...,lilbeastunleash,False,529,1848,0,0,"[{'text': 'MKR', 'indices': [71, 75]}]","3 racism\nName: types, dtype: object"
4,572342978255048705,So Drasko just said he was impressed the girls...,thefoxbandit,False,7,0,0,2,"[{'text': 'MKR', 'indices': [96, 100]}]","429 neither\nName: Expert, dtype: object"


In [7]:
# Class names that are kept; rows with any other parsed label are filtered out later.
targets = [
    'racism',
    'sexism',
    'neither',
]

In [8]:
# Take the second whitespace-separated token of each raw target string
# (e.g. "4 racism\nName: types, dtype: object" -> "racism").
df['processed_target'] = df['target'].map(lambda raw: raw.split()[1])

In [9]:
# Fold the 'none' label into 'neither'; every other value is left as-is.
df['processed_target'] = df['processed_target'].replace('none', 'neither')

In [10]:
# Keep only rows whose parsed label is one of `targets`; everything else is dropped.
df = df.loc[df['processed_target'].isin(targets)]

In [11]:
# Class distribution of the rows that survived the filter on `targets`.
df['processed_target'].value_counts()

neither    9444
sexism     3220
racism       67
Name: processed_target, dtype: int64

In [12]:
# Class name -> integer id, numbered in the order listed here.
label_map = {
    name: index
    for index, name in enumerate(("racism", "sexism", "neither"))
}

In [13]:
# Integer id -> class name (the inverse of the class-name -> id mapping).
reverse_label_map = dict(enumerate(("racism", "sexism", "neither")))

In [14]:
# Encode each class name as its integer id from label_map.
df['labels'] = df['processed_target'].map(lambda name: label_map[name])

In [15]:
# Display the integer-encoded labels.
df['labels']

0        0
1        2
2        2
3        0
4        2
        ..
12751    2
12752    2
12753    2
12754    2
12755    1
Name: labels, Length: 12731, dtype: int64

In [16]:
# Every distinct token seen by process_text so far.
vocab = set()


def process_text(text):
    """Lower-case ``text``, tokenize it, and record the tokens in ``vocab``.

    Args:
        text: The raw string to tokenize.

    Returns:
        The list of tokens produced by ``tokenizer.tokenize`` for the
        lower-cased text.

    Side effects:
        Adds every returned token to the module-level ``vocab`` set.
    """
    pieces = tokenizer.tokenize(text.lower())
    # set.update accepts any iterable and de-duplicates on insertion.
    vocab.update(pieces)
    return pieces

In [17]:
# Try the tokenizer on a toy sentence.
# Note: process_text also adds the resulting tokens to the global `vocab`.
process_text("this is a sample textss")

['this', 'is', 'a', 'sample', 'texts', '##s']

In [18]:
# Inspect the tokens collected in `vocab` so far.
vocab

{'##s', 'a', 'is', 'sample', 'texts', 'this'}

In [19]:
# Start from an empty vocabulary so that tokens left over from earlier ad-hoc
# calls to process_text (such as the toy sample sentence) do not leak into the
# dataset vocabulary, and so that re-running this cell always yields the same
# `vocab`. clear() empties the existing set in place.
vocab.clear()
# Tokenize every tweet; process_text fills `vocab` as a side effect.
df['processed_text'] = df['text'].apply(process_text)

In [20]:
# Display the tokenized texts.
df['processed_text']

0        [these, girls, are, the, equivalent, of, the, ...
1        [dr, ##ask, ##o, they, didn, ', t, cook, half,...
2        [hopefully, someone, cooks, dr, ##ask, ##o, in...
3        [of, course, you, were, born, in, serbia, ., ....
4        [so, dr, ##ask, ##o, just, said, he, was, impr...
                               ...                        
12751    [rt, @, quinn, ##ae, _, moon, :, that, ', s, n...
12752    [but, this, just, goes, to, prove, -, @, ty, #...
12753    [rt, @, olsen, ##31, ##shan, ##non, :, wow, i,...
12754    [rt, @, j, _, big, ##boot, ##e, :, @, free, ##...
12755    [via, @, weasel, ##zi, ##pper, ##s, :, fe, ##m...
Name: processed_text, Length: 12731, dtype: object

In [37]:
# Persist the raw (untokenized) `text` column as a pretty-printed JSON array.
with open('../data/text.json', mode='w') as f:
    f.write(json.dumps(df['text'].tolist(), indent=2))