In [1]:
import pandas as pd
import os
import numpy as np
import html
import re
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from torch import cuda

device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


# Settings

In [2]:
MAX_LEN = 512
BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased', do_lower_case=True) # Need to retrain with do_lower_case=False
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=1).to(device)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifi

# Preprocessing

In [3]:
DATA_DIR = '../data/raw/'
filenames = os.listdir(DATA_DIR)
dfs = [pd.read_csv(DATA_DIR + name, index_col='Unnamed: 0') for name in filenames]

In [4]:
df = pd.concat(dfs)
df = df.sample(frac=1)
df = df[['reply', 'comment_score', 'reply_score']]
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,reply,comment_score,reply_score
0,I love this interpretation,23,2
1,Thanks for the insight. I have some questions ...,22,1
2,"Definitely, all characters should be flawed ju...",4,3
3,Anywhere really.\n\nOne of my personal values ...,178,10
4,"I know there is his book *On Writing*, which i...",1,2


In [5]:
def regex_text(text):
    text = html.unescape(text)
    text = re.sub(r"\\'", r"'", text)
    text = re.sub(r"\s+$", '', text)    
    return text

def clean_dataframe(df):
    df['reply'] = df['reply'].astype(str)
    df['reply'] = df['reply'].apply(regex_text)
    df = df[df['reply'].str.len() != 0]
    return df

df = clean_dataframe(df)
df.head()

Unnamed: 0,reply,comment_score,reply_score
0,I love this interpretation,23,2
1,Thanks for the insight. I have some questions ...,22,1
2,"Definitely, all characters should be flawed ju...",4,3
3,Anywhere really.\n\nOne of my personal values ...,178,10
4,"I know there is his book *On Writing*, which i...",1,2


In [6]:
def minmax_scale(X, X_min, X_max):
    X_scaled = (X - X_min) / (X_max - X_min)
    return X_scaled

#print(df['comment_score'].)

# Scaling is [MinMax -> np.exp -> MinMax] such that the comments/replies with a higher count have more influence
def scale(df, cols=['comment_score', 'reply_score']):
    min_score = df[cols].min().min()
    print("Min score: ", min_score)
    max_score = df[cols].max().max()
    print("Max score: ", max_score)
    for col in cols:
        df[col+"_scaled"] = df[col]
        col = col+"_scaled"
        df[col] = df[col].apply(minmax_scale, args=(min_score, max_score))
        df[col] = df[col].apply(np.exp)

    cols = ['comment_score_scaled', 'reply_score_scaled']
    min_score = df[cols].min().min()
    print("Min score: ", min_score)
    max_score = df[cols].max().max()
    print("Max score: ", max_score)
    for col in cols:
        df[col] = df[col].apply(minmax_scale, args=(min_score, max_score))
    return df

df = scale(df)
df['reply_score_minmax'] = df['reply_score'].apply(minmax_scale, args=(df['reply_score'].min(), df['reply_score'].max()))
df['score_ratio'] = df['reply_score_scaled']/df['comment_score_scaled']

Min score:  -119
Max score:  8401
Min score:  1.0
Max score:  2.718281828459045


In [7]:
len_df = len(df)
split_ratio = 0.9
train = df.iloc[:int(split_ratio*len_df)][['reply', 'reply_score_minmax']].rename(columns={'reply_score_minmax': 'label'})
validation = df.iloc[int(split_ratio*len_df):][['reply', 'reply_score_minmax']].rename(columns={'reply_score_minmax': 'label'})

dataset = dict()
dataset['validation'] = Dataset.from_pandas(validation, preserve_index=False)
dataset['train'] = Dataset.from_pandas(train, preserve_index=False)
datasets = DatasetDict(dataset)

In [8]:
def tokenize_function(examples):
    return tokenizer(examples["reply"], truncation=True, padding=True, max_length=MAX_LEN)

tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["reply"],
    )

                                                                                   

# Training

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="../output/bert_predictor",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    save_steps=10000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

trainer.train()

  0%|          | 0/31215 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  2%|▏         | 500/31215 [03:45<3:31:12,  2.42it/s]

{'loss': 0.0013, 'learning_rate': 1.967964119814192e-05, 'epoch': 0.02}


  3%|▎         | 1000/31215 [07:12<3:27:58,  2.42it/s]

{'loss': 0.0001, 'learning_rate': 1.935928239628384e-05, 'epoch': 0.03}


  5%|▍         | 1500/31215 [10:38<3:24:52,  2.42it/s]

{'loss': 0.0001, 'learning_rate': 1.903892359442576e-05, 'epoch': 0.05}


  6%|▋         | 2000/31215 [14:05<3:20:55,  2.42it/s]

{'loss': 0.0001, 'learning_rate': 1.8718564792567677e-05, 'epoch': 0.06}


  8%|▊         | 2500/31215 [17:31<3:17:42,  2.42it/s]

{'loss': 0.0001, 'learning_rate': 1.8398205990709596e-05, 'epoch': 0.08}


 10%|▉         | 3000/31215 [20:57<3:14:28,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.8077847188851515e-05, 'epoch': 0.1}


 11%|█         | 3500/31215 [24:24<3:10:53,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.7757488386993434e-05, 'epoch': 0.11}


 13%|█▎        | 4000/31215 [27:50<3:07:22,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.7437129585135353e-05, 'epoch': 0.13}


 14%|█▍        | 4500/31215 [31:17<3:03:39,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.7116770783277272e-05, 'epoch': 0.14}


 16%|█▌        | 5000/31215 [34:43<3:00:13,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.679641198141919e-05, 'epoch': 0.16}


 18%|█▊        | 5500/31215 [38:10<2:57:11,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.647605317956111e-05, 'epoch': 0.18}


 19%|█▉        | 6000/31215 [41:36<2:53:24,  2.42it/s]

{'loss': 0.0005, 'learning_rate': 1.615569437770303e-05, 'epoch': 0.19}


 21%|██        | 6500/31215 [45:03<2:50:01,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.5835335575844947e-05, 'epoch': 0.21}


 22%|██▏       | 7000/31215 [48:29<2:46:33,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.5514976773986866e-05, 'epoch': 0.22}


 24%|██▍       | 7500/31215 [51:55<2:42:58,  2.43it/s]

{'loss': 0.0, 'learning_rate': 1.5194617972128785e-05, 'epoch': 0.24}


 26%|██▌       | 8000/31215 [55:22<2:39:48,  2.42it/s]

{'loss': 0.0001, 'learning_rate': 1.4874259170270704e-05, 'epoch': 0.26}


 27%|██▋       | 8500/31215 [58:48<2:36:06,  2.43it/s]

{'loss': 0.0002, 'learning_rate': 1.4553900368412623e-05, 'epoch': 0.27}


 29%|██▉       | 9000/31215 [1:02:14<2:32:39,  2.43it/s]

{'loss': 0.0, 'learning_rate': 1.4233541566554544e-05, 'epoch': 0.29}


 30%|███       | 9500/31215 [1:05:41<2:29:24,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.391318276469646e-05, 'epoch': 0.3}


 32%|███▏      | 10000/31215 [1:09:07<2:25:44,  2.43it/s]

{'loss': 0.0, 'learning_rate': 1.359282396283838e-05, 'epoch': 0.32}


 34%|███▎      | 10500/31215 [1:12:35<2:22:38,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.3272465160980299e-05, 'epoch': 0.34}


 35%|███▌      | 11000/31215 [1:16:01<2:19:05,  2.42it/s]

{'loss': 0.0001, 'learning_rate': 1.295210635912222e-05, 'epoch': 0.35}


 37%|███▋      | 11500/31215 [1:19:27<2:16:02,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.2631747557264138e-05, 'epoch': 0.37}


 38%|███▊      | 12000/31215 [1:22:54<2:12:15,  2.42it/s]

{'loss': 0.0001, 'learning_rate': 1.2311388755406055e-05, 'epoch': 0.38}


 40%|████      | 12500/31215 [1:26:20<2:08:41,  2.42it/s]

{'loss': 0.0001, 'learning_rate': 1.1991029953547974e-05, 'epoch': 0.4}


 42%|████▏     | 13000/31215 [1:29:47<2:05:17,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.1670671151689895e-05, 'epoch': 0.42}


 43%|████▎     | 13500/31215 [1:33:13<2:02:10,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.1350312349831814e-05, 'epoch': 0.43}


 45%|████▍     | 14000/31215 [1:36:40<1:58:38,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.1029953547973731e-05, 'epoch': 0.45}


 46%|████▋     | 14500/31215 [1:40:07<1:55:14,  2.42it/s]

{'loss': 0.0001, 'learning_rate': 1.070959474611565e-05, 'epoch': 0.46}


 48%|████▊     | 15000/31215 [1:43:36<1:51:37,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.038923594425757e-05, 'epoch': 0.48}


 50%|████▉     | 15500/31215 [1:47:02<1:48:10,  2.42it/s]

{'loss': 0.0, 'learning_rate': 1.006887714239949e-05, 'epoch': 0.5}


 51%|█████▏    | 16000/31215 [1:50:29<1:44:37,  2.42it/s]

{'loss': 0.0, 'learning_rate': 9.748518340541408e-06, 'epoch': 0.51}


 53%|█████▎    | 16500/31215 [1:53:55<1:41:21,  2.42it/s]

{'loss': 0.0, 'learning_rate': 9.428159538683325e-06, 'epoch': 0.53}


 54%|█████▍    | 17000/31215 [1:57:22<1:37:51,  2.42it/s]

{'loss': 0.0, 'learning_rate': 9.107800736825246e-06, 'epoch': 0.54}


 56%|█████▌    | 17500/31215 [2:00:49<1:34:25,  2.42it/s]

{'loss': 0.0, 'learning_rate': 8.787441934967163e-06, 'epoch': 0.56}


 58%|█████▊    | 18000/31215 [2:04:15<1:31:11,  2.42it/s]

{'loss': 0.0, 'learning_rate': 8.467083133109084e-06, 'epoch': 0.58}


 59%|█████▉    | 18500/31215 [2:07:42<1:27:53,  2.41it/s]

{'loss': 0.0, 'learning_rate': 8.146724331251001e-06, 'epoch': 0.59}


 61%|██████    | 19000/31215 [2:11:09<1:24:27,  2.41it/s]

{'loss': 0.0, 'learning_rate': 7.826365529392922e-06, 'epoch': 0.61}


 62%|██████▏   | 19500/31215 [2:14:36<1:20:49,  2.42it/s]

{'loss': 0.0001, 'learning_rate': 7.506006727534839e-06, 'epoch': 0.62}


 64%|██████▍   | 20000/31215 [2:18:03<1:17:10,  2.42it/s]

{'loss': 0.0, 'learning_rate': 7.1856479256767585e-06, 'epoch': 0.64}


 66%|██████▌   | 20500/31215 [2:21:30<1:13:50,  2.42it/s]

{'loss': 0.0004, 'learning_rate': 6.865289123818677e-06, 'epoch': 0.66}


 67%|██████▋   | 21000/31215 [2:24:57<1:10:21,  2.42it/s]

{'loss': 0.0, 'learning_rate': 6.544930321960596e-06, 'epoch': 0.67}


 69%|██████▉   | 21500/31215 [2:28:24<1:06:54,  2.42it/s]

{'loss': 0.0, 'learning_rate': 6.224571520102516e-06, 'epoch': 0.69}


 70%|███████   | 22000/31215 [2:31:51<1:03:53,  2.40it/s]

{'loss': 0.0, 'learning_rate': 5.904212718244434e-06, 'epoch': 0.7}


 72%|███████▏  | 22500/31215 [2:35:18<1:00:07,  2.42it/s]

{'loss': 0.0, 'learning_rate': 5.583853916386354e-06, 'epoch': 0.72}


 74%|███████▎  | 23000/31215 [2:38:45<56:45,  2.41it/s]  

{'loss': 0.0001, 'learning_rate': 5.263495114528272e-06, 'epoch': 0.74}


 75%|███████▌  | 23500/31215 [2:42:12<53:07,  2.42it/s]

{'loss': 0.0001, 'learning_rate': 4.943136312670191e-06, 'epoch': 0.75}


 77%|███████▋  | 24000/31215 [2:45:39<49:39,  2.42it/s]

{'loss': 0.0, 'learning_rate': 4.62277751081211e-06, 'epoch': 0.77}


 78%|███████▊  | 24500/31215 [2:49:06<46:14,  2.42it/s]

{'loss': 0.0, 'learning_rate': 4.302418708954029e-06, 'epoch': 0.78}


 80%|████████  | 25000/31215 [2:52:32<42:42,  2.43it/s]

{'loss': 0.0, 'learning_rate': 3.9820599070959475e-06, 'epoch': 0.8}


 82%|████████▏ | 25500/31215 [2:55:59<39:27,  2.41it/s]

{'loss': 0.0, 'learning_rate': 3.6617011052378664e-06, 'epoch': 0.82}


 83%|████████▎ | 26000/31215 [2:59:26<35:56,  2.42it/s]

{'loss': 0.0, 'learning_rate': 3.3413423033797854e-06, 'epoch': 0.83}


 85%|████████▍ | 26500/31215 [3:03:12<43:17,  1.82it/s]

{'loss': 0.0, 'learning_rate': 3.0209835015217047e-06, 'epoch': 0.85}


 86%|████████▋ | 27000/31215 [3:07:51<39:38,  1.77it/s]

{'loss': 0.0002, 'learning_rate': 2.7006246996636236e-06, 'epoch': 0.86}


 88%|████████▊ | 27500/31215 [3:12:56<38:21,  1.61it/s]  

{'loss': 0.0, 'learning_rate': 2.3802658978055425e-06, 'epoch': 0.88}


 90%|████████▉ | 28000/31215 [3:18:44<28:23,  1.89it/s]  

{'loss': 0.0, 'learning_rate': 2.0599070959474614e-06, 'epoch': 0.9}


 91%|█████████▏| 28500/31215 [3:22:54<17:30,  2.58it/s]

{'loss': 0.0001, 'learning_rate': 1.73954829408938e-06, 'epoch': 0.91}


 93%|█████████▎| 29000/31215 [3:26:14<15:03,  2.45it/s]

{'loss': 0.0, 'learning_rate': 1.419189492231299e-06, 'epoch': 0.93}


 95%|█████████▍| 29500/31215 [3:29:38<11:41,  2.44it/s]

{'loss': 0.0, 'learning_rate': 1.098830690373218e-06, 'epoch': 0.95}


 96%|█████████▌| 30000/31215 [3:33:01<08:08,  2.49it/s]

{'loss': 0.0007, 'learning_rate': 7.78471888515137e-07, 'epoch': 0.96}


 98%|█████████▊| 30500/31215 [3:36:23<04:43,  2.52it/s]

{'loss': 0.0, 'learning_rate': 4.5811308665705594e-07, 'epoch': 0.98}


 99%|█████████▉| 31000/31215 [3:39:44<01:25,  2.51it/s]

{'loss': 0.0, 'learning_rate': 1.3775428479897487e-07, 'epoch': 0.99}


                                                       
100%|██████████| 31215/31215 [3:49:34<00:00,  2.27it/s]

{'eval_loss': 6.665878026979044e-05, 'eval_runtime': 504.9992, 'eval_samples_per_second': 27.473, 'eval_steps_per_second': 6.869, 'epoch': 1.0}
{'train_runtime': 13774.7543, 'train_samples_per_second': 9.064, 'train_steps_per_second': 2.266, 'train_loss': 9.235609688274127e-05, 'epoch': 1.0}





TrainOutput(global_step=31215, training_loss=9.235609688274127e-05, metrics={'train_runtime': 13774.7543, 'train_samples_per_second': 9.064, 'train_steps_per_second': 2.266, 'train_loss': 9.235609688274127e-05, 'epoch': 1.0})

In [10]:
trainer.save_model('../output/bert_predictor/final')

In [24]:
import torch
test_input = tokenizer(validation['reply'].iloc[151], return_tensors='pt').to(device)
with torch.no_grad():
    output = model(**test_input)

output.logits[0]

tensor([0.0377], device='cuda:0')