In [None]:
# Mount Google Drive inside the Colab VM. The last cell of this notebook saves
# the fine-tuned model under /content/drive, so the mount must succeed first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Imports and TPU setting

In [None]:
 # Install the extra packages this Colab session needs (quiet mode).
 # `kaggle` provides the CLI used below to download the dataset.
 # NOTE(review): no versions are pinned, so a later re-run may pull different
 # releases than the ones that produced the recorded outputs.
 ! pip install --upgrade kaggle -q
 ! pip install transformers -q
 ! pip install emoji -q
 ! pip install googletrans -q

In [None]:
import os
import re
import time
import numpy as np
import pandas as pd
import transformers
from tqdm import tqdm
import tensorflow as tf
from google.colab import files
import tensorflow_datasets as tfds
from transformers import BertTokenizer
from tensorflow.keras.models import Model
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from transformers import TFBertForSequenceClassification, BertConfig

from text import clean_text
from text_models import BertInputs

import matplotlib.pyplot as plt
%matplotlib inline

# Raise TensorFlow's logger threshold so only ERROR-level (and above) records are shown.
tf.get_logger().setLevel('ERROR')

#### Load the data

In [None]:
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d mswarbrickjones/reddit-selfposts

Saving kaggle.json to kaggle.json
Downloading reddit-selfposts.zip to /content
 98% 344M/352M [00:02<00:00, 144MB/s]
100% 352M/352M [00:02<00:00, 174MB/s]


In [None]:
!unzip '/content/reddit-selfposts.zip'

Archive:  /content/reddit-selfposts.zip
  inflating: rspct.tsv               
  inflating: subreddit_info.csv      


In [None]:
# Per-subreddit metadata (category columns, inclusion flag) ...
subreddit_info_df = pd.read_csv('/content/subreddit_info.csv')
# ... and the self-posts themselves, stored tab-separated (id, subreddit, title, selftext).
subreddit_df = pd.read_csv('/content/rspct.tsv', sep='\t')

In [None]:
# Preview the first rows of the raw posts table.
subreddit_df.head()

Unnamed: 0,id,subreddit,title,selftext
0,6d8knd,talesfromtechsupport,Remember your command line switches...,"Hi there, <lb>The usual. Long time lerker, fi..."
1,58mbft,teenmom,"So what was Matt ""addicted"" to?",Did he ever say what his addiction was or is h...
2,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...
3,6ti6re,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...
4,77sxto,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,..."


In [None]:
# Preview the first rows of the subreddit metadata table.
subreddit_info_df.head()

Unnamed: 0,subreddit,category_1,category_2,category_3,in_data,reason_for_exclusion
0,whatsthatbook,advice/question,book,,True,
1,CasualConversation,advice/question,broad,,False,too_broad
2,Clairvoyantreadings,advice/question,broad,,False,too_broad
3,DecidingToBeBetter,advice/question,broad,,False,too_broad
4,HelpMeFind,advice/question,broad,,False,too_broad


In [None]:
mapping = dict(subreddit_info_df[['subreddit', 'category_1']].values)
%time subreddit_df['Topic'] = subreddit_df.subreddit.map(mapping)
%time subreddit_df['TopicNum'] = subreddit_df.Topic.map(dict(zip(subreddit_info_df.category_1.unique(), range(subreddit_info_df.category_1.nunique()))))

%time subreddit_df['Text'] = subreddit_df.title + '. ' + subreddit_df.selftext

CPU times: user 133 ms, sys: 2.02 ms, total: 135 ms
Wall time: 135 ms
CPU times: user 76.6 ms, sys: 0 ns, total: 76.6 ms
Wall time: 76.2 ms
CPU times: user 730 ms, sys: 477 ms, total: 1.21 s
Wall time: 1.2 s


#### Preprocess

In [None]:
# Normalise every post with the project helper `clean_text` (imported from `text`),
# keeping digits (remove_numbers=False). The recorded run took about six minutes.
# NOTE(review): judging by the preview below, clean_text lower-cases and strips
# punctuation/markup — confirm against text.py.
%time subreddit_df['clean_text'] = subreddit_df.Text.apply(lambda x: clean_text(x, remove_numbers=False))

CPU times: user 5min 49s, sys: 733 ms, total: 5min 49s
Wall time: 5min 49s


In [None]:
# Drop the raw text columns now that `clean_text` has been derived from them.
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
subreddit_df = subreddit_df.drop(columns=['title', 'selftext', 'Text'], errors='ignore')

In [None]:
# Preview the frame after cleaning: raw text columns removed, labels and clean_text present.
subreddit_df.head()

Unnamed: 0,id,subreddit,Topic,TopicNum,clean_text
0,6d8knd,talesfromtechsupport,writing/stories,45,remember your command line switches hi there ...
1,58mbft,teenmom,tv_show,43,so what was matt addicted to did he ever say w...
2,8f73s7,Harley,autos,5,no club colors funny story i went to college i...
3,6ti6re,ringdoorbell,hardware/tools,20,not door bell but floodlight mount height i kn...
4,77sxto,intel,electronics,16,worried about my 8700k small fftdata stress re...


In [None]:
# Keep only what the model consumes (cleaned text + integer label) and split by
# row position: the first 800,000 rows train the model, the rest validate it.
model_columns = ['clean_text', 'TopicNum']
train = subreddit_df[model_columns].iloc[:800000]
validation = subreddit_df[model_columns].iloc[800000:]

In [None]:
# Report the split sizes.
print(f"train shape: {train.shape} \nvalidation shape: {validation.shape}")

# Words per post, splitting on a single space (consecutive spaces therefore
# contribute empty tokens that are counted too).
words_per_post = train.clean_text.map(lambda text: len(text.split(" ")))
mean_word_len = words_per_post.mean()

# Report the sample count, mean length and label distribution of the training split.
print(f"Dataset with shape of {train.shape[0]} samples. \nMean number of words is: {mean_word_len}. \nDistribution of lables is: \n{train.TopicNum.value_counts()}")

train shape: (800000, 2) 
validation shape: (213000, 2)
Dataset with shape of 800000 samples. 
Mean number of words is: 148.31615375. 
Distribution of lables is: 
44    78977
43    53682
21    45795
31    44217
38    41035
16    40253
25    33961
39    31572
36    24500
22    23697
12    22913
19    22897
11    22099
27    21337
2     20547
14    18176
45    17357
4     16598
32    16573
5     15811
0     14195
1     13449
15    13423
30    12651
37    12636
18    11862
10    11854
40    11059
20    11045
28    10274
34    10263
7      9471
3      8684
17     7894
23     7118
6      7085
35     5550
24     5537
42     3953
Name: TopicNum, dtype: int64


#### Modelling

###### Build model inputs

In [None]:
# Input-pipeline configuration, passed to BertInputs when the model inputs are built.
MAX_LEN = 200     # forwarded as max_length
BATCH_SIZE = 16   # forwarded as batch_size

In [None]:
bert_inputs_train = BertInputs(texts=train.clean_text.astype(str), lables=train.TopicNum.values, max_length=MAX_LEN, batch_size=BATCH_SIZE, bert_model_name='bert-base-multilingual-uncased')
%time train_inputs = bert_inputs_train.process_examples(train=True)

bert_inputs_validation = BertInputs(texts=validation.clean_text.astype(str), lables=validation.TopicNum.values, max_length=MAX_LEN, batch_size=BATCH_SIZE, bert_model_name='bert-base-multilingual-uncased')
%time validation_inputs = bert_inputs_validation.process_examples(train=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…

0it [00:00, ?it/s]




800000it [36:15, 367.77it/s]


CPU times: user 1h 2min 14s, sys: 30.2 s, total: 1h 2min 45s
Wall time: 1h 2min 41s


213000it [09:39, 367.38it/s]


CPU times: user 16min 30s, sys: 5.11 s, total: 16min 35s
Wall time: 16min 31s


###### Build model

In [None]:
# Training hyper-parameters.
LR = 2e-5
EPOCHS = 2

# Pretrained multilingual BERT with a classification head of 46 outputs
# (the integer labels seen in the data run up to 45).
pretrained_name = 'bert-base-multilingual-uncased'
bert_config = BertConfig.from_pretrained(pretrained_name, num_labels=46)
model = TFBertForSequenceClassification.from_pretrained(pretrained_name, config=bert_config)

optimizer = tf.keras.optimizers.Adam(learning_rate=LR, epsilon=1e-08)

# Labels are integer ids rather than one-hot vectors, so use the sparse
# categorical cross-entropy loss and accuracy; the loss is configured for logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# Stop as soon as val_loss fails to improve for one epoch and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=1,
    verbose=0,
    mode='min',
    baseline=None,
    restore_best_weights=True,
)
my_callbacks = [early_stopping]

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=999358484.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Print the layer list and parameter counts of the compiled classifier.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  167356416 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  35374     
Total params: 167,391,790
Trainable params: 167,391,790
Non-trainable params: 0
_________________________________________________________________


###### Training

In [None]:
# Fine-tune for EPOCHS epochs, validating on the held-out split; my_callbacks may
# end training early. The returned History object is kept in bert_history.
bert_history = model.fit(
    train_inputs,
    validation_data=validation_inputs,
    epochs=EPOCHS,
    callbacks=my_callbacks,
)

Epoch 1/2
Epoch 2/2


In [None]:
# Persist the fine-tuned model to the mounted Google Drive so it survives the Colab session.
# NOTE(review): only the model is saved here; no tokenizer is written alongside it.
model.save_pretrained('/content/drive/My Drive/projects/The reddit self-post classification task/bert_model')