<a href="https://colab.research.google.com/github/GYIKOO/UCIMLHackathon21/blob/main/Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [11]:
from transformers import BertTokenizer
import pandas as pd
from keras.preprocessing.sequence import pad_sequences

In [52]:
# Config
MAX_LEN = 32

# Import Data

In [6]:
df = pd.read_csv('/content/covid_lies.csv')
df.head()

Unnamed: 0,misconception_id,misconception,tweet,tweet_id,label
0,3,Coronavirus is genetically engineered.,How the COVID-19 outbreak is changing global p...,1233965490948591616,na
1,30,Blowing conch shells destroys coronavirus pote...,Getting coronavirus and then coughing on peopl...,1233907923765559296,na
2,57,Swans and dolphins swimming in Venice canals f...,Disturbing letter about life in COVID-19 ward ...,1233911842910720000,na
3,22,Cocaine cures coronavirus.,How to prevent corona virus?🤔 Use cowdung cake...,1233947734094290944,na
4,32,Observing janata curfew will result in the red...,This is concerning - They must self-insure for...,1233937085297332224,na


# Words Embedding

In [53]:
def vectorize(text,MAX_LEN=MAX_LEN):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    input_ids = []
    for t in text:
        # so basically encode tokenizing , mapping sentences to thier token ids after adding special tokens.
        encoded_sent = tokenizer.encode(
            t,  # Sentence which are encoding.
            add_special_tokens=True,  # Adding special tokens '[CLS]' and '[SEP]'
        )
        input_ids.append(encoded_sent)

    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN , truncating="post", padding="post")

    attention_masks = []
    for sent in input_ids:
        # Generating attention mask for sentences.
        #   - when there is 0 present as token id we are going to set mask as 0.
        #   - we are going to set mask 1 for all non-zero positive input id.
        att_mask = [int(token_id > 0) for token_id in sent]

        attention_masks.append(att_mask)

    return attention_masks, input_ids

### Misconception Text

In [65]:
miscon_df = df[['misconception_id','misconception']].drop_duplicates(subset='misconception_id').sort_values(by='misconception_id').reset_index(drop=True)
miscon_df

Unnamed: 0,misconception_id,misconception
0,1,A person can tell if they have coronavirus or ...
1,2,Drinking large amounts of water will protect a...
2,3,Coronavirus is genetically engineered.
3,4,Dean Koontz predicted the pandemic in his 1981...
4,5,The first person infected is a researcher name...
...,...,...
57,58,Water polution decreased in Venice canals foll...
58,59,A Malabar civet was spotted walking the street...
59,60,A pod of humpback whales returned to the Arabi...
60,61,Lions were freed to keep people off the street...


In [93]:
m_text = miscon_df.misconception.values
m_attention_mask, m_input_id = vectorize(m_text)

In [89]:
m_input_id.shape

(62, 32)

In [94]:
m_input_id

array([[  101,  1037,  2711, ...,     0,     0,     0],
       [  101,  5948,  2312, ...,     0,     0,     0],
       [  101, 21887, 23350, ...,     0,     0,     0],
       ...,
       [  101,  1037, 17491, ...,     0,     0,     0],
       [  101,  7212,  2020, ...,     0,     0,     0],
       [  101,  2924,  1997, ...,     0,     0,     0]], dtype=int32)

## Tweets Text

In [69]:
tweets_df = df[['tweet_id','tweet']].drop_duplicates(subset='tweet').sort_values(by='tweet_id')
tweets_df

Unnamed: 0,tweet_id,tweet
3212,1230000000000000000,How long does the new coronavirus remain activ...
3827,1230000000000000000,Millimetre wave technology will kill an indivi...
1146,1230000000000000000,How do we strengthen our immune systems to fig...
5497,1233904797775990784,"When it comes to coronavirus, who is the most ..."
1624,1233904858601840640,#Coronavirus and China's Tax Response\n\n@USER...
...,...,...
2262,1233997748292091904,Forget the mask. Here's how to protect yoursel...
3303,1233997751354101760,"Man Who Returned From Malaysia, Dies In Kerala..."
4058,1233997773416083456,.@miafarrow Dug up this report on exquisite ki...
4385,1233997817087062016,Washington state health officials are investig...


In [110]:
t_text = tweets_df.tweet.values
t_attention_mask, t_input_id = vectorize(t_text,64)

In [103]:
t_input_id.shape

(4346, 64)

In [111]:
t_input_id

array([[  101,  2129,  2146, ...,     0,     0,     0],
       [  101,  4971, 14428, ...,  1012,  2522,  1013],
       [  101,  2129,  2079, ...,     0,     0,     0],
       ...,
       [  101,  1012,  1030, ...,  1012,  2659,  3891],
       [  101,  2899,  2110, ...,     0,     0,     0],
       [  101, 23848,  2050, ...,     0,     0,     0]], dtype=int32)