# Things used
#### https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python

In [3]:
import json
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import os
import requests
import operator
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [4]:
# Json path
json_path = '../../data/bad_user_tweets.json'

In [5]:
# Open json file
bad_user_tweets_file = open(json_path, 'r')
# Load json file
bad_user_tweets = json.load(bad_user_tweets_file)
bad_user_tweets = bad_user_tweets['IWTSeller']

In [6]:
# print user tweets
for tweet in bad_user_tweets:
	print(tweet['tweet'])

Mammoth Ivory Carvings Figurine of Japanese Samurai

Look at this mighty yet robust creation of Japanese Samurai Figurines on a Wooden platform precisely sculpted with original mammoth ivory.

https://t.co/kbjbZ65Mrf https://t.co/xRZQLBdzDZ
Mammoth Ivory Netsuke – 12 zodiac animals set – Round Stand

This is a very high quality mammoth ivory carving netsukes of 12 Chinese Zodiac animals Set!

Check the link: https://t.co/sjc9zvWslb https://t.co/ZhIyx7QkOm
Mammoth Ivory Masterpiece Netsuke – Father &amp; Son with a Fish
The relationship between father and son is a classic and timeless relationship worthy of artistic interpretation.

https://t.co/6V9gHq54kk

#masterpiece #sculpture #art https://t.co/NiJygsdnir
Mammoth Ivory Figurine -The New archery samurai kneeling

Genuine MAMMOTH IVORY FIGURINE portraying a Samurai kneeling down, about to shoot an arrow.

#artcollector #ivory #sculptures

https://t.co/796CozjBdi https://t.co/DHjrgjuRC3
What A beautiful year it has been! We can only sa

In [7]:
#remove punctuation
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

table = str.maketrans('', '', string.punctuation)

# Remove stop words
from nltk.corpus import stopwords

stop = stopwords.words('english')

# Lemmatize all words
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()	

[nltk_data] Downloading package stopwords to /home/gabe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gabe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/gabe/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
tweet_text_list = []
for tweet in bad_user_tweets:
    tweet_text = tweet['tweet']
    tweet_text = ' '.join([word for word in tweet_text.lower().split() if word not in stop]) #remove stopwords
    tweet_text = tweet_text.translate(table) #remove punc.
    tweet_text = ''.join(c for c in tweet_text if not c.isdigit()) #remove numbers
    tweet_text = tweet_text.replace('  ', ' ') #remove double spaces
    tweet_text = ' '.join([lemmatizer.lemmatize(word) for word in tweet_text.split()]) #lemminize
    tweet_text_list.append(tweet_text)

In [9]:
print(tweet_text_list)

['mammoth ivory carving figurine japanese samurai look mighty yet robust creation japanese samurai figurine wooden platform precisely sculpted original mammoth ivory httpstcokbjbzmrf httpstcoxrzqlbdzdz', 'mammoth ivory netsuke – zodiac animal set – round stand high quality mammoth ivory carving netsukes chinese zodiac animal set check link httpstcosjczvwslb httpstcozhiyxqkom', 'mammoth ivory masterpiece netsuke – father amp son fish relationship father son classic timeless relationship worthy artistic interpretation httpstcovghqkk masterpiece sculpture art httpstconijygsdnir', 'mammoth ivory figurine the new archery samurai kneeling genuine mammoth ivory figurine portraying samurai kneeling down shoot arrow artcollector ivory sculpture httpstcocozjbdi httpstcodhjrgjurc', 'beautiful year been say thanks customer wishing great evening family friend may bring good health joy ivoryandart ivorycarving mammothivory legalivory carveivory netsuke mammothivorytusk httpstcoxiqtixkre', 'really am

In [10]:
model_name = "bert-base-uncased"
# max sequence length for each document/sentence sample
max_length = 512
# load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [11]:
train_encodings = tokenizer(tweet_text_list, truncation=True, padding=True, max_length=max_length)

In [12]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Downloading: 100%|██████████| 420M/420M [00:16<00:00, 27.1MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenc

In [18]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

In [15]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=400,               # log & save weights each logging_steps
    save_steps=400,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

In [None]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [13]:
# import gensim.downloader as api
# wv = api.load('word2vec-google-news-300')

In [14]:
# tweet_text_vectors = []
# for tweet in tweet_text_list:
# 	for word in tweet.split():
# 			try:
# 				tweet_text_vectors.append(np.asarray(wv[word]).astype('float32'))
# 			except:
# 				print(f'Word {word} not in vocabulary')