# Things used
#### https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python
#### https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

In [1]:
import json
import torch
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [2]:
# Get device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Json path
json_path = '../../data/bad_user_tweets.json'
json_path_good = '../../data/random_users_timeline.json'

In [4]:
# Open json file
bad_user_tweets_file = open(json_path, 'r')
# Load json file
bad_user_tweets = json.load(bad_user_tweets_file)
bad_user_tweets = bad_user_tweets['IWTSeller']

# Open json file
good_user_tweets_file = open(json_path_good, 'r')
# Load json file
good_user_tweets = json.load(good_user_tweets_file)
# good_user_tweets = good_user_tweets['notIWTSeller']

In [6]:
# print user tweets
for tweet in bad_user_tweets:
	print(tweet['tweet'])

Mammoth Ivory Carvings Figurine of Japanese Samurai

Look at this mighty yet robust creation of Japanese Samurai Figurines on a Wooden platform precisely sculpted with original mammoth ivory.

https://t.co/kbjbZ65Mrf https://t.co/xRZQLBdzDZ
Mammoth Ivory Netsuke – 12 zodiac animals set – Round Stand

This is a very high quality mammoth ivory carving netsukes of 12 Chinese Zodiac animals Set!

Check the link: https://t.co/sjc9zvWslb https://t.co/ZhIyx7QkOm
Mammoth Ivory Masterpiece Netsuke – Father &amp; Son with a Fish
The relationship between father and son is a classic and timeless relationship worthy of artistic interpretation.

https://t.co/6V9gHq54kk

#masterpiece #sculpture #art https://t.co/NiJygsdnir
Mammoth Ivory Figurine -The New archery samurai kneeling

Genuine MAMMOTH IVORY FIGURINE portraying a Samurai kneeling down, about to shoot an arrow.

#artcollector #ivory #sculptures

https://t.co/796CozjBdi https://t.co/DHjrgjuRC3
What A beautiful year it has been! We can only sa

In [7]:
#remove punctuation
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

table = str.maketrans('', '', string.punctuation)

# Remove stop words
from nltk.corpus import stopwords

stop = stopwords.words('english')

# Lemmatize all words
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()	

[nltk_data] Downloading package stopwords to /home/gabe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gabe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/gabe/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
bad_tweet_text_list = []
for tweet in bad_user_tweets:
    tweet_text = tweet['tweet']
    tweet_text = ' '.join([word for word in tweet_text.lower().split() if word not in stop]) #remove stopwords
    tweet_text = tweet_text.translate(table) #remove punc.
    tweet_text = ''.join(c for c in tweet_text if not c.isdigit()) #remove numbers
    tweet_text = tweet_text.replace('  ', ' ') #remove double spaces
    tweet_text = ' '.join([lemmatizer.lemmatize(word) for word in tweet_text.split()]) #lemminize
    bad_tweet_text_list.append(tweet_text)

In [15]:
good_tweet_text_list = []
for user in good_user_tweets:
	try:
		for tweet in good_user_tweets[user]['data']:
			tweet_text = tweet['text']
			tweet_text = ' '.join([word for word in tweet_text.lower().split() if word not in stop]) #remove stopwords
			tweet_text = tweet_text.translate(table) #remove punc.
			tweet_text = ''.join(c for c in tweet_text if not c.isdigit()) #remove numbers
			tweet_text = tweet_text.replace('  ', ' ') #remove double spaces
			tweet_text = ' '.join([lemmatizer.lemmatize(word) for word in tweet_text.split()]) #lemminize
			good_tweet_text_list.append(tweet_text)
	except:
		print("Error")

Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error


In [17]:
print(len(good_tweet_text_list))
print(good_tweet_text_list[0])

4896
tide pool excretion schoolnewsletter


In [16]:
# Combine bad and good tweets
tweet_combined_ground_truth = bad_tweet_text_list + good_tweet_text_list
print(len(tweet_combined_ground_truth))
print(tweet_combined_ground_truth[0])


4960
mammoth ivory carving figurine japanese samurai look mighty yet robust creation japanese samurai figurine wooden platform precisely sculpted original mammoth ivory httpstcokbjbzmrf httpstcoxrzqlbdzdz


In [10]:
# Create the ground truth labels
bad_labels = [1] * len(bad_user_tweets)
good_labels = [0] * len(good_user_tweets)
# Combine the labels
labels = bad_labels + good_labels

In [11]:
print(len(labels))

123


In [12]:
model_name = "bert-base-uncased"
# max sequence length for each document/sentence sample
max_length = 512
# load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [13]:
from sklearn.model_selection import train_test_split
x_train, y_train, x_test, y_test = train_test_split(tweet_combined_ground_truth, labels, test_size=0.2)

In [14]:
print(len(x_train))
print(len(y_train))
print(len(x_test))
print(len(y_test))

98
25
98
25


In [15]:
train_encodings = tokenizer(x_train, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(y_train, truncation=True, padding=True, max_length=max_length)

In [16]:
# print(train_encodings)
print(type(train_encodings))
print(len(y_train))

<class 'transformers.tokenization_utils_base.BatchEncoding'>
25


In [17]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # print(item)
        # print("Number of encodings: " + str(len(self.encodings)))
        # print(str(idx) + ' ' + str(len(self.labels)))
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [18]:
train_dataset = Dataset(train_encodings, x_test)
test_dataset = Dataset(test_encodings, y_test)

In [19]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [21]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=400,               # log & save weights each logging_steps
    save_steps=400,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

In [22]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,        # training dataset
    eval_dataset=test_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,  # the callback that computes metrics of interest
)


In [23]:
trainer.train()

***** Running training *****
  Num examples = 98
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 39
100%|██████████| 39/39 [02:00<00:00,  2.65s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 39/39 [02:00<00:00,  3.08s/it]

{'train_runtime': 120.2251, 'train_samples_per_second': 2.445, 'train_steps_per_second': 0.324, 'train_loss': 0.6571747217422876, 'epoch': 3.0}





TrainOutput(global_step=39, training_loss=0.6571747217422876, metrics={'train_runtime': 120.2251, 'train_samples_per_second': 2.445, 'train_steps_per_second': 0.324, 'train_loss': 0.6571747217422876, 'epoch': 3.0})

In [24]:
# import gensim.downloader as api
# wv = api.load('word2vec-google-news-300')

In [25]:
# tweet_text_vectors = []
# for tweet in tweet_text_list:
# 	for word in tweet.split():
# 			try:
# 				tweet_text_vectors.append(np.asarray(wv[word]).astype('float32'))
# 			except:
# 				print(f'Word {word} not in vocabulary')