In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
import torchvision
import transformers
from transformers import DistilBertModel, DistilBertTokenizer, logging

# Load the pretrained DistilBERT tokenizer and encoder from a single checkpoint name.
logging.set_verbosity_error()  # Only errors are logged, which hides the warning shown on model loading.
BERT_CHECKPOINT = 'distilbert-base-uncased'
bert_tokenizer = DistilBertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = DistilBertModel.from_pretrained(BERT_CHECKPOINT)

# This project compares three variants: BERT alone, BERT + LogisticRegression and BERT + RandomForest.
# The best one is then used to score the test data.

# BERT is a strong model for this kind of task. I expect a naive Bayes classifier, or classic models on
# TF-IDF / Bag-of-Words features, to perform noticeably worse, so only the three BERT variants
# (alone, with LR, with RF) are compared.

In [None]:
# Load both CSV files and prepare the training texts and integer labels used later.

train_spam = pd.read_csv(r'train_spam.csv')
test_spam = pd.read_csv(r'test_spam.csv')

# Replace the string labels in 'text_type' with integer codes. The fitted encoder is
# kept so the codes can be mapped back to the original labels.
# Intended mapping: 1 will be spam and 0 will be ham (not spam).
label_encoder = LabelEncoder()
label_encoder.fit(train_spam['text_type'])
train_spam['text_type'] = label_encoder.transform(train_spam['text_type'])

train_labels = train_spam['text_type'].to_numpy()
train_texts = train_spam['text'].tolist()

In [None]:
# Basic analytics: schema, sample rows, class balance, text lengths and missing values.

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would additionally print "None".
train_spam.info()
test_spam.info()

# Look at the first 5 rows
print(train_spam.head())
print(test_spam.head())

# Find out the number of messages with and without spam in the train
print(train_spam['text_type'].value_counts())

# Consider the average length of the text for spam and non-spam messages.
# 'text_type' has already been replaced by the integer codes produced by
# label_encoder, so comparing it with the string 'spam' never matches.
# Look up the integer code of 'spam' and compare against that instead.
spam_code = label_encoder.transform(['spam'])[0]
is_spam = train_spam['text_type'] == spam_code
text_lengths = train_spam['text'].str.len()
spam_text_len = text_lengths[is_spam].mean()
not_spam_text_len = text_lengths[~is_spam].mean()
print(f"the average length for spam: {spam_text_len:.2f}")
print(f"the average length for non-spam: {not_spam_text_len:.2f}")

# Getting rows with missing values in any column (if any)
print(train_spam[train_spam.isnull().any(axis=1)])

In [None]:
# BERT: encode every training text and keep the hidden state at position 0
# (the [CLS] token for BERT-style tokenizers) as its feature vector.

tokenized_texts = bert_tokenizer(
    train_texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=500,
)

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
bert_model.to(device)

batch_size = 32
input_ids = tokenized_texts["input_ids"]
attention_mask = tokenized_texts["attention_mask"]

feature_chunks = []
with torch.no_grad():  # inference only, no gradients are needed
    for start in range(0, len(train_texts), batch_size):
        stop = start + batch_size
        output = bert_model(input_ids[start:stop].to(device), attention_mask[start:stop].to(device))
        # Move the per-batch features back to the CPU as a NumPy array.
        feature_chunks.append(output.last_hidden_state[:, 0, :].cpu().numpy())

# One row per training text; this matrix is the input of the models fitted later.
features = np.concatenate(feature_chunks, axis=0)

In [None]:
# Fitting of models

import warnings

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled data for model comparison. The labels are re-read
# from the dataframe so that re-running this cell always starts from the full set.
train_labels = train_spam['text_type'].values
train_features, test_features, train_labels, test_labels = train_test_split(
    features, train_labels, test_size=0.2, random_state=42
)

# Just BERT:
# there is no need to add anything

# Warnings are ignored only while fitting. A global warnings.simplefilter('ignore')
# would silence every warning for the rest of the kernel session as well.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # BERT + LogisticRegression
    lr_clf = LogisticRegression().fit(train_features, train_labels)

    # BERT + RandomForestClassifier
    rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(train_features, train_labels)

In [None]:
# Comparison of models on the held-out split.
# ROC AUC is computed from continuous scores for every variant so the numbers are comparable;
# feeding hard 0/1 predictions into roc_auc_score collapses the ROC curve to a single point.

from sklearn.metrics import roc_auc_score
import torch.nn.functional as F


def _cosine_similarity(vectors, reference):
    """Return the cosine similarity between each row of `vectors` and the 1-D `reference`."""
    return (vectors @ reference) / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(reference))


# just BERT:
# DistilBertModel has no classification head, so a softmax over its embedding dimensions
# is not a class probability. As a baseline without a fitted classifier, score each
# held-out embedding by how much closer it is (cosine similarity) to the mean spam
# embedding than to the mean non-spam embedding of the training split.
# Class 1 is the positive class, matching predict_proba(...)[:, 1] below.
spam_centroid = train_features[train_labels == 1].mean(axis=0)
ham_centroid = train_features[train_labels == 0].mean(axis=0)
bert_scores = _cosine_similarity(test_features, spam_centroid) - _cosine_similarity(test_features, ham_centroid)
predicted_values = (bert_scores > 0).astype(np.int64)  # hard labels, kept for inspection only
auc = roc_auc_score(test_labels, bert_scores)
print("Just BERT: ", auc)

# BERT + LogisticRegression
proba_lr = lr_clf.predict_proba(test_features)[:, 1]
auc_lr = roc_auc_score(test_labels, proba_lr)
print("BERT + LogisticRegression: ", auc_lr)

# BERT + RandomForestClassifier
# Score with the predicted probability of class 1, as for LogisticRegression.
label_pred_rf = rf_clf.predict(test_features)  # hard labels, kept for inspection only
proba_rf = rf_clf.predict_proba(test_features)[:, 1]
auc_rf = roc_auc_score(test_labels, proba_rf)
print("BERT + RandomForestClassifier: ", auc_rf)

In [None]:
# BERT + LogisticRegression achieved the highest score on the held-out 20% of the training data: roc_auc_score = 0.9869210901625936

# Let's use it to score test_spam


# Known issue: I started this task too late and did not manage to fix the problem with the "Just BERT" variant.
# An AUC of 0.5 is chance level, which is definitely not a normal result for this model.

In [None]:
# Scoring of test data

# Build features for the test texts with the same steps used for the training texts:
# tokenize, run the encoder in batches, keep the hidden state at position 0.
test_texts = test_spam['text'].tolist()
tokenized_test_texts = bert_tokenizer(
    test_texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=500,
)
batch_size = 32
test_input_ids = tokenized_test_texts["input_ids"]
test_attention_mask = tokenized_test_texts["attention_mask"]

test_feature_chunks = []
with torch.no_grad():  # inference only, no gradients are needed
    for start in range(0, len(test_texts), batch_size):
        stop = start + batch_size
        output_test = bert_model(test_input_ids[start:stop].to(device), test_attention_mask[start:stop].to(device))
        test_feature_chunks.append(output_test.last_hidden_state[:, 0, :].cpu().numpy())

features_test = np.concatenate(test_feature_chunks, axis=0)

# Probability of class 1 from the logistic regression, turned into a text label
# with a fixed decision threshold.
proba_lr_test = lr_clf.predict_proba(features_test)[:, 1]
threshold = 0.5
labels = ['spam' if proba >= threshold else 'not spam' for proba in proba_lr_test]

# write the answer to a file: the original text next to its predicted label
answer_df = pd.DataFrame(test_spam['text']).assign(label=labels)
answer_df.to_csv('scoring_of_test_spam.csv', index=False)