# Things used
#### https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python
#### https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

In [1]:
import json
import torch
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [2]:
# Json path
json_path = '../../data/bad_user_tweets.json'
json_path_good = '../../data/good_user_tweets.json'

In [3]:
# Open json file
bad_user_tweets_file = open(json_path, 'r')
# Load json file
bad_user_tweets = json.load(bad_user_tweets_file)
bad_user_tweets = bad_user_tweets['IWTSeller']

# Open json file
good_user_tweets_file = open(json_path_good, 'r')
# Load json file
good_user_tweets = json.load(good_user_tweets_file)
good_user_tweets = good_user_tweets['notIWTSeller']

In [4]:
for tweet in good_user_tweets:
	print(tweet['tweet'])

RT @NFTGUYY: HUGE BORED APE GIVEAWAY 🚀🚨

I AM GIVING AWAY 3 MUTANT @BoredApeYC NFTS! #NFTGiveaway
 
💰Floor price = 6 ETH (each)💰
($28.300)…
RT @donkey_dao: 3 x Donkey NFT Giveaway

HEE-HAAW! We are celebrating the opening of our Discord! 

Presale Spots are now available. (350 P…
RT @cake__work: GM! To celebrate the upcoming presale for MIMO, I am giving away a sockpass that will get you access to the round 2 presale…
@NielsKeira Hampool Enterprise  Co., Ltd,China. 
We produce Heat Shrink Tubing&amp; Heat Shrink solder sleeve &amp; Heat shrink Terminal &amp; Pre-insulated connectors &amp; Cable ties &amp; non-shrink tubing &amp; PVC Tape.
Contact :+8618017673991（WhtasApp, Wechat, Viber)
sales9@hampool.com
HP-CGT
Corrugated Tubing
Material : PP
Temperature Range : -40 C to 135 C
Enormous cold impact strength
Feel free to contact me if you need it
Contact : +86-18017673991(WhatsApp, Wechat, Viber)
Email : sales9@hampool.com
#CorrugatedTubing
#shrinksleeve
#shrinkterminal https://t.co/Sth

In [5]:
# print user tweets
for tweet in bad_user_tweets:
	print(tweet['tweet'])

Mammoth Ivory Carvings Figurine of Japanese Samurai

Look at this mighty yet robust creation of Japanese Samurai Figurines on a Wooden platform precisely sculpted with original mammoth ivory.

https://t.co/kbjbZ65Mrf https://t.co/xRZQLBdzDZ
Mammoth Ivory Netsuke – 12 zodiac animals set – Round Stand

This is a very high quality mammoth ivory carving netsukes of 12 Chinese Zodiac animals Set!

Check the link: https://t.co/sjc9zvWslb https://t.co/ZhIyx7QkOm
Mammoth Ivory Masterpiece Netsuke – Father &amp; Son with a Fish
The relationship between father and son is a classic and timeless relationship worthy of artistic interpretation.

https://t.co/6V9gHq54kk

#masterpiece #sculpture #art https://t.co/NiJygsdnir
Mammoth Ivory Figurine -The New archery samurai kneeling

Genuine MAMMOTH IVORY FIGURINE portraying a Samurai kneeling down, about to shoot an arrow.

#artcollector #ivory #sculptures

https://t.co/796CozjBdi https://t.co/DHjrgjuRC3
What A beautiful year it has been! We can only sa

In [6]:
#remove punctuation
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

table = str.maketrans('', '', string.punctuation)

# Remove stop words
from nltk.corpus import stopwords

stop = stopwords.words('english')

# Lemmatize all words
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()	

[nltk_data] Downloading package stopwords to /home/gabe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gabe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/gabe/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
bad_tweet_text_list = []
for tweet in bad_user_tweets:
    tweet_text = tweet['tweet']
    tweet_text = ' '.join([word for word in tweet_text.lower().split() if word not in stop]) #remove stopwords
    tweet_text = tweet_text.translate(table) #remove punc.
    tweet_text = ''.join(c for c in tweet_text if not c.isdigit()) #remove numbers
    tweet_text = tweet_text.replace('  ', ' ') #remove double spaces
    tweet_text = ' '.join([lemmatizer.lemmatize(word) for word in tweet_text.split()]) #lemminize
    bad_tweet_text_list.append(tweet_text)

In [8]:
good_tweet_text_list = []
for tweet in good_user_tweets:
	tweet_text = tweet['tweet']
	tweet_text = ' '.join([word for word in tweet_text.lower().split() if word not in stop]) #remove stopwords
	tweet_text = tweet_text.translate(table) #remove punc.
	tweet_text = ''.join(c for c in tweet_text if not c.isdigit()) #remove numbers
	tweet_text = tweet_text.replace('  ', ' ') #remove double spaces
	tweet_text = ' '.join([lemmatizer.lemmatize(word) for word in tweet_text.split()]) #lemminize
	good_tweet_text_list.append(tweet_text)

In [9]:
# Combine bad and good tweets
tweet_combined_ground_truth = list(bad_tweet_text_list + good_tweet_text_list)
print(len(tweet_combined_ground_truth))
print(tweet_combined_ground_truth[0])


123
mammoth ivory carving figurine japanese samurai look mighty yet robust creation japanese samurai figurine wooden platform precisely sculpted original mammoth ivory httpstcokbjbzmrf httpstcoxrzqlbdzdz


In [10]:
# Create the ground truth labels
bad_labels = [1] * len(bad_user_tweets)
good_labels = [0] * len(good_user_tweets)
# Combine the labels
labels = bad_labels + good_labels
# Convert to numpy array
labels = np.array(labels)

In [11]:
model_name = "bert-base-uncased"
# max sequence length for each document/sentence sample
max_length = 512
# load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [12]:
train_encodings = []
for tweet in tweet_combined_ground_truth:
	train_encodings.append(tokenizer(tweet, truncation=True, padding=True, max_length=max_length))

In [13]:
print(len(train_encodings))
print(len(labels))

123
123


In [14]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [15]:
from sklearn.model_selection import train_test_split
x_train, y_train, x_test, y_test = train_test_split(train_encodings, labels, test_size=0.2)

In [16]:
train_dataset = Dataset(x_train, y_train)
test_dataset = Dataset(x_test, y_test)

In [17]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [18]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

In [19]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=400,               # log & save weights each logging_steps
    save_steps=400,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

In [20]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,        # training dataset
    eval_dataset=test_dataset,          # evaluation dataset
    # the callback that computes metrics of interest
    compute_metrics=compute_metrics,
)


In [21]:
trainer.train()

TypeError: list indices must be integers or slices, not str

In [None]:
# import gensim.downloader as api
# wv = api.load('word2vec-google-news-300')

In [None]:
# tweet_text_vectors = []
# for tweet in tweet_text_list:
# 	for word in tweet.split():
# 			try:
# 				tweet_text_vectors.append(np.asarray(wv[word]).astype('float32'))
# 			except:
# 				print(f'Word {word} not in vocabulary')