In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import names
import csv
import random
import pickle

In [2]:
import transformers
from transformers import AutoModel, BertTokenizerFast

In [3]:
import torch
import torch.nn as nn

In [9]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [4]:
def nltk2wn_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The decision is made on the first character of the tag only:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag yields None.
    """
    # Map the leading letter to the attribute name on `wordnet`; the attribute
    # is only looked up when a prefix actually matches.
    wordnet_attr_by_prefix = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = wordnet_attr_by_prefix.get(nltk_tag[:1])
    if attr_name is None:
        return None
    return getattr(wordnet, attr_name)
def lemmatize_sentence(sentence):
    """Return `sentence` with each token replaced by its WordNet lemma.

    The sentence is tokenized and POS-tagged with NLTK. Tokens whose tag maps
    to a WordNet POS (via `nltk2wn_tag`) are lemmatized with that POS; all
    other tokens are kept as they are. The tokens are joined back with single
    spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(sentence)

    lemmas = []
    for token, treebank_tag in nltk.pos_tag(tokens):
        wn_pos = nltk2wn_tag(treebank_tag)
        if wn_pos is not None:
            token = lemmatizer.lemmatize(token, wn_pos)
        lemmas.append(token)
    return " ".join(lemmas)

def preprocess_tweet(tweet):
    """Normalize a raw tweet into a space-separated string of content words.

    Steps, in order: lowercase, strip URLs, replace every character that is
    not a-z with a space, lemmatize (see `lemmatize_sentence`), tokenize, and
    drop English stopwords, entries of the NLTK names corpus, and the
    topic-revealing words 'religion' / 'religious'.

    Args:
        tweet (str): The raw tweet text.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    tweet = tweet.lower()
    tweet = re.sub(r'http\S+', '', tweet)        # remove urls
    tweet = re.sub('[^a-z]', ' ', tweet)         # keep lowercase letters only
    tweet = lemmatize_sentence(tweet)

    # Build the exclusion set once per call. Previously the stopword and names
    # lists were re-read and converted to sets for every single token.
    excluded = set(stopwords.words('english'))
    excluded.update(names.words())
    excluded.update(('religion', 'religious'))
    # NOTE(review): tokens are all lowercase at this point, while the NLTK
    # names corpus appears to store capitalized entries, so the names filter
    # probably never matches. Behavior is kept as-is; confirm whether names
    # were meant to be lowercased before comparison.

    return " ".join(word for word in word_tokenize(tweet) if word not in excluded)

In [40]:
# Load the labelled tweets; the cells below rely on its `content` and `topic` columns.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
dataset = pd.read_pickle('final_data_manual.pickle')
# Show a few random rows as a quick sanity check of the load.
dataset.sample(3)

Unnamed: 0,content,topic
9789,Sneak peek! https://t.co/rwIwMQZ8Ac,celebrity
20152,Another great zSpace customer getting started....,education
18771,@wildboydayo with verifiable facts &amp; not o...,education


In [41]:
# Draw a balanced subset: the same number of tweets from every topic.
SAMPLES_PER_TOPIC = 3000
SAMPLING_SEED = 1  # fixed so the same rows are drawn on every Restart & Run All


def _sample_topic(topic):
    """Return SAMPLES_PER_TOPIC randomly drawn rows of `dataset` whose topic equals `topic`."""
    return dataset[dataset.topic == topic].sample(SAMPLES_PER_TOPIC, random_state=SAMPLING_SEED)


cel_df = _sample_topic('celebrity')
health_df = _sample_topic('health')
sport_df = _sample_topic('sports')
pol_df = _sample_topic('politics')
rel_df = _sample_topic('religion')
edu_df = _sample_topic('education')

In [42]:
# Stack the per-topic samples into one frame with a fresh 0..n-1 index.
# pd.concat keeps the column labels, so `content` and `topic` are selected by
# name instead of being assigned by position after a round-trip through NumPy.
dataset = pd.concat(
    [cel_df, health_df, sport_df, pol_df, rel_df, edu_df],
    ignore_index=True,
)[['content', 'topic']]

In [43]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split so every topic keeps the same share in both parts.
# random_state pins the shuffle; without it the split changed on every run.
x_train, x_test, y_train, y_test = train_test_split(
    dataset.content,
    dataset.topic,
    test_size=0.2,
    stratify=dataset.topic,
    random_state=1,
)

In [44]:
# Re-assemble the training split as a two-column frame with a fresh 0..n-1 index.
train_df = pd.DataFrame({'content': x_train.to_numpy(), 'topic': y_train.to_numpy()})

In [45]:
# Re-assemble the held-out split as a two-column frame with a fresh 0..n-1 index.
test_df = pd.DataFrame({'content': x_test.to_numpy(), 'topic': y_test.to_numpy()})

In [46]:
# Count training rows per topic to confirm the stratified split kept the classes balanced.
train_df.topic.value_counts()

health       2400
religion     2400
education    2400
celebrity    2400
politics     2400
sports       2400
Name: topic, dtype: int64

In [54]:
def set_seed(seed: int):
    """
    Seed every random number generator this notebook may touch.

    Always seeds Python's ``random`` and NumPy. PyTorch (CPU and all CUDA
    devices) and TensorFlow are seeded only when transformers reports them as
    installed.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    if is_torch_available():
        torch.manual_seed(seed)
        # Safe to call even when no CUDA device is present.
        torch.cuda.manual_seed_all(seed)

    if is_tf_available():
        # Imported lazily so TensorFlow is only needed when it is installed.
        import tensorflow

        tensorflow.random.set_seed(seed)

# Seed all RNGs with a fixed value before model creation and training.
# NOTE(review): this call sits below the topic sampling and train/test split cells,
# so on a top-to-bottom run seeding here has no effect on those earlier steps.
set_seed(1)

In [56]:
# Checkpoint to fine-tune: uncased base BERT.
# Other text-classification checkpoints: https://huggingface.co/models?filter=text-classification
model_name = "bert-base-uncased"
# Upper bound on tokens per sample; longer inputs are truncated to this length.
max_length = 512

# Tokenizer that matches the checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

# Encode both splits with identical settings. `padding=True` pads each call's
# sequences to the longest one in that call (not to `max_length`), so the two
# splits may end up with different padded lengths.
encode_options = dict(truncation=True, padding=True, max_length=max_length)
train_encodings = tokenizer(train_df["content"].tolist(), **encode_options)
valid_encodings = tokenizer(test_df["content"].tolist(), **encode_options)

In [57]:
from sklearn.preprocessing import LabelEncoder

# Use ONE encoder for both splits. Fitting a second encoder on the validation
# labels could assign different integer ids to the same topic whenever the two
# splits do not contain exactly the same set of classes.
lb_train = LabelEncoder()
lb_val = lb_train  # alias kept so code that refers to `lb_val` keeps working

train_labels = lb_train.fit_transform(train_df.topic)
# `transform` raises on a topic never seen in training instead of silently
# inventing a new mapping.
val_labels = lb_train.transform(test_df.topic)

In [82]:
# Pair every training row's encoded label id (`encode`) with its original topic string (`true`).
label_names = pd.DataFrame({'encode':train_labels,'true':train_df.topic})

In [84]:
# Collapse the per-row (id, topic) pairs into a single id -> topic lookup.
target_names = dict(zip(label_names.encode, label_names.true))

In [85]:
# Display the id -> topic lookup that get_prediction uses to decode class ids.
target_names

{0: 'celebrity',
 1: 'education',
 2: 'health',
 3: 'politics',
 4: 'religion',
 5: 'sports'}

In [58]:
class NewsGroupsDataset(torch.utils.data.Dataset):
    """Torch dataset that pairs tokenizer encodings with integer labels.

    `encodings` is a mapping from field name to a per-sample sequence, and
    `labels` is an indexable sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, one per encoding field plus "labels"."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is wrapped in a list, so it becomes a tensor of shape (1,).
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

# Wrap each tokenized split together with its integer labels as a torch Dataset.
train_dataset = NewsGroupsDataset(train_encodings, train_labels)
valid_dataset = NewsGroupsDataset(valid_encodings, val_labels)

In [None]:
# Load the pretrained encoder with a classification head sized to our label set.
# Use the GPU when one is available and fall back to CPU otherwise; the previous
# hard-coded "cuda" made this cell fail on machines without a CUDA device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(target_names),
).to(device)

In [65]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  """Return {'accuracy': ...} for one evaluation pass.

  `pred.predictions` holds per-class scores; the predicted class is the index
  of the largest score along the last axis. It is compared against
  `pred.label_ids` with sklearn's `accuracy_score`.
  """
  predicted_classes = pred.predictions.argmax(-1)
  return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [66]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Optimization schedule.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=20,
    warmup_steps=3000,   # learning-rate warmup steps
    weight_decay=0.01,
    # Log and checkpoint every 400 steps, with evaluation on a step cadence as well.
    evaluation_strategy="steps",
    logging_steps=400,
    save_steps=400,
    # Reload the best checkpoint once training ends. The default criterion is
    # the evaluation loss; set `metric_for_best_model` to use accuracy or
    # another metric instead.
    load_best_model_at_end=True,
)

In [67]:
# Wire the model, the training arguments, both datasets and the metric
# callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [68]:
# Fine-tune the model using the schedule, logging and checkpoint settings in `training_args`.
trainer.train()

***** Running training *****
  Num examples = 14400
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5400


Step,Training Loss,Validation Loss,Accuracy
400,1.4582,0.55427,0.945556
800,0.1717,0.065807,0.985833
1200,0.0646,0.060735,0.988611
1600,0.0928,0.071266,0.986111
2000,0.062,0.077076,0.985278
2400,0.0696,0.064503,0.989722
2800,0.0852,0.065763,0.989167
3200,0.0842,0.169483,0.969167
3600,0.0873,0.079144,0.986667
4000,0.0452,0.074785,0.988889


***** Running Evaluation *****
  Num examples = 3600
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-400
Configuration saved in ./results/checkpoint-400/config.json
Model weights saved in ./results/checkpoint-400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3600
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-800
Configuration saved in ./results/checkpoint-800/config.json
Model weights saved in ./results/checkpoint-800/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3600
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1200
Configuration saved in ./results/checkpoint-1200/config.json
Model weights saved in ./results/checkpoint-1200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3600
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1600
Configuration saved in ./results/checkpoint-1600/config.json
Model weights saved in ./results/checkpoint-1600/pytorch_model.bi

Step,Training Loss,Validation Loss,Accuracy
400,1.4582,0.55427,0.945556
800,0.1717,0.065807,0.985833
1200,0.0646,0.060735,0.988611
1600,0.0928,0.071266,0.986111
2000,0.062,0.077076,0.985278
2400,0.0696,0.064503,0.989722
2800,0.0852,0.065763,0.989167
3200,0.0842,0.169483,0.969167
3600,0.0873,0.079144,0.986667
4000,0.0452,0.074785,0.988889




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-5200 (score: 0.042330753058195114).


TrainOutput(global_step=5400, training_loss=0.17044621310852193, metrics={'train_runtime': 3729.3194, 'train_samples_per_second': 11.584, 'train_steps_per_second': 1.448, 'total_flos': 7725875823129600.0, 'train_loss': 0.17044621310852193, 'epoch': 3.0})

In [69]:
# Score the model currently held by the trainer on its eval_dataset (the held-out split).
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 3600
  Batch size = 20


{'epoch': 3.0,
 'eval_accuracy': 0.9925,
 'eval_loss': 0.042330753058195114,
 'eval_runtime': 73.0285,
 'eval_samples_per_second': 49.296,
 'eval_steps_per_second': 2.465}

In [None]:
# Persist the fine-tuned weights and the tokenizer files into the same directory.
# NOTE(review): the loading cell further down reads from 'bert_model/', not from
# this directory - confirm which location is intended.
model_path = "20newsgroups-bert-base-uncased"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)

In [114]:
def get_prediction(text):
    """Classify one piece of text and return its topic name.

    Uses the notebook-level `tokenizer`, `model`, `max_length` and
    `target_names`.

    Args:
        text (str): The text to classify.

    Returns:
        The `target_names` entry for the class with the highest probability.
    """
    # Follow the model's own device instead of assuming a GPU is present.
    device = next(model.parameters()).device
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)
    # Inference only: switch off dropout and skip gradient bookkeeping.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    # get output probabilities by doing softmax over the class dimension
    probs = outputs[0].softmax(1)
    # the index of the largest probability is the predicted class id
    return target_names[probs.argmax().item()]

In [100]:
# Example #1: quick check of get_prediction on a raw sentence (no preprocess_tweet applied)
text = """ johnny depp and his wife amber herd at the court """
print(get_prediction(text))

celebrity


# Inference

In [13]:
def get_prediction(text):
    """Classify one piece of text and return its topic name.

    Uses the notebook-level `tokenizer`, `model` and `target_names`.

    Args:
        text (str): The text to classify.

    Returns:
        The `target_names` entry for the class with the highest probability.
    """
    max_length = 512
    # Put the inputs on the same device as the model (CPU or GPU).
    device = next(model.parameters()).device
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)
    # Inference only: switch off dropout and skip gradient bookkeeping.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    # get output probabilities by doing softmax over the class dimension
    probs = outputs[0].softmax(1)
    # the index of the largest probability is the predicted class id
    return target_names[probs.argmax().item()]

In [17]:
# Restore the id -> topic lookup that get_prediction uses to decode class ids.
# NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
# NOTE(review): the saving cell in this notebook writes 'target_names.pickle' without
# the 'bert_model/' prefix - confirm the file was placed in this directory.
with open('bert_model/target_names.pickle','rb') as f:
    target_names = pickle.load(f)

## Load model from disk

In [11]:
# Restore the fine-tuned tokenizer and classifier strictly from local files.
tokenizer = BertTokenizerFast.from_pretrained(
    'bert_model/',
    local_files_only=True,
)
model = BertForSequenceClassification.from_pretrained(
    'bert_model/pytorch_model.bin',
    config='bert_model/config.json',
    local_files_only=True,
)

In [52]:
# Examples #1-#6: one sample tweet per topic, each paired with the separator
# text that is printed after its prediction.
examples = [
    (""" Johnny Depp victorious in defamation case against ex-wife Amber Heard """, '\n------------'),
    (""" schools are places for mental thinking """, '\n------------'),
    (""" disases caused too many people to die """, '\n------------'),
    (""" Putin announced a "special military operation" to "demilitarise and denazify" Ukraine """, '\n----------'),
    (""" Religious belief usually relates to faith in divine involvement in the universe and human life """, '\n------------'),
    (""" The top-2 teams take on each other for the UAAP Season 84 Women's Volleyball title... fitting """, '\n'),
]

# Preprocess each tweet, classify it, and print the original tweet next to the predicted topic.
for tweet, trailer in examples:
    text = preprocess_tweet(tweet)
    print(f'{tweet} :  {get_prediction(text)}{trailer}')

 Johnny Depp victorious in defamation case against ex-wife Amber Heard  :  celebrity
------------
 schools are places for mental thinking  :  education
------------
 disases caused too many people to die  :  health
------------
 Putin announced a "special military operation" to "demilitarise and denazify" Ukraine  :  politics
----------
 Religious belief usually relates to faith in divine involvement in the universe and human life  :  religion
------------
 The top-2 teams take on each other for the UAAP Season 84 Women's Volleyball title... fitting  :  sports



In [164]:
import pickle

In [170]:
# Persist the id -> topic lookup so it can be reloaded for inference later.
# NOTE(review): the inference section reads 'bert_model/target_names.pickle', while this
# writes to the working directory - confirm the intended location.
with open('target_names.pickle','wb') as f:
  pickle.dump(target_names,f,protocol=pickle.HIGHEST_PROTOCOL)

In [172]:
# Read the file back into `p` (presumably a round-trip check; `p` is not used in the visible notebook).
# NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
with open('target_names.pickle','rb') as f:
  p = pickle.load(f)