In [2]:
import numpy as np
import pandas as pd
import transformers
from datasets import Dataset,load_dataset, load_from_disk, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import random as rn
import datasets

### Load Dataset

In [3]:
# Load Twitter Dataset
print("Load Twitter Data:")
twitter_df_train = pd.read_csv('data/twitter_train.csv', on_bad_lines='skip')
twitter_df_train = twitter_df_train.drop(columns=['language'])
twitter_df_train = twitter_df_train.rename(columns={'text': 'document', 'label': 'labels'})
print("Dataset size:", len(twitter_df_train), '\n')
twitter_df_train.info()
print("=="*50)

# Load Reddit Dataset
print("Load Reddit Data:")
reddit_df_train = pd.read_csv('data/annotated_question_intimacy_data/final_train.txt',
                              sep='\t', header=None, names=['document', 'labels'])
print("Dataset size:", len(reddit_df_train), '\n')
reddit_df_train.info()
print("=="*50)

# Combine Data
print("Combine Twitter&Reddit Data:'")
combined_df = pd.concat([twitter_df_train,reddit_df_train])
print("Dataset size:", len(combined_df), '\n')
combined_df.info()
print("=="*50)

Load Twitter Data:
Dataset size: 9491 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9491 entries, 0 to 9490
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   document  9491 non-null   object 
 1   labels    9491 non-null   float64
dtypes: float64(1), object(1)
memory usage: 148.4+ KB
Load Reddit Data:
Dataset size: 1797 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1797 entries, 0 to 1796
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   document  1797 non-null   object 
 1   labels    1797 non-null   float64
dtypes: float64(1), object(1)
memory usage: 28.2+ KB
Combine Twitter&Reddit Data:'
Dataset size: 11288 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11288 entries, 0 to 1796
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   document  11288 non-null  object 
 1   labels   

In [4]:
# Linear Transformation for Reddit
A, B, C, D = -1, 1, 1, 5
scale = (D-C)/(B-A)
offset = -A*(D-C)/(B-A) + C

for index, row in reddit_df_train.iterrows():
  iScore = row['labels']

  # If the cell is re-run without clearing local variables, we'll
  # double convert the values between the 1-5 range resulting in values between
  # 5-10. This condition makes sure original scores from Reddit are not already
  #  greater than 1.
  if iScore > 1:
    break

  q = iScore * scale + offset
  reddit_df_train.at[index, 'labels'] = round(q, 1)

reddit_df_train.head()

Unnamed: 0,document,labels
0,What are the most mediocre animals in the anim...,2.3
1,What's the difference between an allergic reac...,3.1
2,What is your favorite subreddit that not every...,3.1
3,What's the most disgusting meal you've ever ea...,3.5
4,Whats one question you hate being asked?,4.0


### Process to Dataset for Huggingface

In [5]:
twitter_dataset = Dataset.from_pandas(twitter_df_train)
twitter_dataset = twitter_dataset.train_test_split(test_size=0.2)
twitter_dataset

reddit_dataset = Dataset.from_pandas(reddit_df_train)
reddit_dataset = reddit_dataset.train_test_split(test_size=0.2)
reddit_dataset

combined_dataset = Dataset.from_pandas(combined_df)
combined_dataset = combined_dataset.train_test_split(test_size=0.2)
combined_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'labels', '__index_level_0__'],
        num_rows: 9030
    })
    test: Dataset({
        features: ['document', 'labels', '__index_level_0__'],
        num_rows: 2258
    })
})

In [6]:
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base", cache="/Users/qinyuanzheng/huggingface_cache/models")

In [8]:
sentence = "If you trust them they will always be here for us too 💕💕💕"

In [10]:
tokens = tokenizer.tokenize(sentence)
print(tokens)

['▁If', '▁you', '▁trust', '▁them', '▁they', '▁will', '▁always', '▁be', '▁here', '▁for', '▁us', '▁too', '▁', '💕', '💕', '💕']


In [11]:
token_id = tokenizer.convert_tokens_to_ids(tokens)
print(token_id)

[4263, 398, 63207, 2856, 1836, 1221, 11343, 186, 3688, 100, 1821, 5792, 6, 178556, 178556, 178556]


In [19]:
def tokenize_function(examples):
    return tokenizer(examples["document"], padding="max_length", truncation=True)

twitter_tokenized_datasets = twitter_dataset.map(tokenize_function, batched=True)
twitter_tokenized_datasets
reddit_tokenized_datasets = reddit_dataset.map(tokenize_function, batched=True)
reddit_tokenized_datasets

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1437
    })
    test: Dataset({
        features: ['document', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 360
    })
})

In [20]:
# Selecting a small sample of rows from training and testing datasets will help the
# model train quickly.
twitter_tokenized_datasets_train = twitter_tokenized_datasets["train"].shuffle(seed=42)
twitter_tokenized_datasets_test = twitter_tokenized_datasets["test"].shuffle(seed=42)

In [21]:
print(len(twitter_tokenized_datasets_test))
print(len(twitter_tokenized_datasets_train))

1899
7592


In [None]:
lang_dict = {
    "Spanish": "es",
    "English": "en",
    "Chinese": "zh",
    "French": "fr",
    "Italian": "it",
    "Portuguese": "pt",
    "Korean": "ko",
    "Dutch": "nl",
    "Hindi": "hi",
    "Arabic": "ar"
}