In [1]:
#install required packages
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.2 MB/s[0m eta [36m0:00:0

In [2]:
#test GPU device
import tensorflow as tf
tf.test.gpu_device_name()
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 3474403975831066044
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 14328594432
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 17271341544209284054
 physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
 xla_global_id: 416903419]

In [3]:
import torch
import torch.nn as nn
# Define a custom neural network class
class ConvertModel(nn.Module):
    def __init__(self, input_dim, output_dim,N):
        super(ConvertModel, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.N = N

    def forward(self, x):
        N = self.N
        x = x.view(N, -1)  # Reshape the input tensor
        x = self.linear(x)
        return x

# Create an instance of the custom class
def tensor_3d_to_2d(tensor):
  N = tensor.shape[0]
  M = tensor.shape[1]
  input_dim = M * 768
  output_dim = 768
  print(N,M)
  tensor_model = ConvertModel(input_dim, output_dim, N)
  converted_tensor = tensor_model(tensor)

  return converted_tensor




In [None]:
#testing convert 3d to 2d NN function
# tensor_NxMx768 = torch.randn(10, 5, 768)
# tensor_3d_to_2d(tensor_NxMx768).shape


10 5


torch.Size([10, 768])

In [4]:
#add bert model
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.nn.functional import softmax
import torch
#import packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score


In [5]:
def proprocess_tweets(tweets):
  processed_tweets = [preprocessTweet1(tweet) for tweet in tweets]
  return processed_tweets
#add preprocessing step to clean the tweets
import re
def preprocessTweet1(tweet):
    # process the tweets

    # Remove 'RT' from tweet
    tweet = re.sub(r'RT[\s]+', '', tweet)
    # Remove &amp; (ampersand)
    tweet = re.sub(r'&amp;', 'and', tweet)
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet)
    #Convert @username to @USER
    tweet = re.sub('@[^\s]+','',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Remove all emojis.
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

    tweet = emoji_pattern.sub(r'', tweet)

    # Remove redundant spaces
    tweet = re.sub(r'\s+', ' ', tweet).strip()

    #remove unknown characters
    tweet = ''.join([c for c in tweet if ord(c) < 128])

    #trim
    tweet = tweet.strip('\'"')
    return tweet



In [6]:
#define the tokenize, padding, masking procedure from text to tensor

def text_to_tensor(data):
  #tokenize
  tokenized = data['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
  #padding
  max_len = 0
  for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)
  padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
  print("max len for padding: ",max_len, np.array(padded).shape)
  #Masking the paddding
  attention_mask = np.where(padded != 0, 1, 0)
  # print("attention mask shape:",attention_mask.shape)
  #Use pretrained Bert model to word embedding all the sentences
  input_ids = torch.tensor(padded)
  attention_mask = torch.tensor(attention_mask)

  with torch.no_grad():
      last_hidden_states = model(input_ids, attention_mask=attention_mask)
  # result =last_hidden_states

  return last_hidden_states[0]

In [7]:
# clean data
def clean_data(data):
  #clean tweets
  data['text'] = proprocess_tweets(data['text'])
  return(data)


In [8]:

#balance the data
def balanced_data(data):

  data_ade = data[data['label'] == 1]
  data_noade = data[data['label'] == 0]
  # print(data_ade.shape,data_noade.shape)
  #downsampling
  data_noade_downsampled = data_noade.sample(data_ade.shape[0])
  data_balanced = pd.concat([data_ade,data_noade_downsampled])
  # print(t_batch_balanced['label'].value_counts())
  #proprecess the tweets
  # print("before clean",t_batch_balanced)
  # #clean tweets
  # data_balanced['text'] = proprocess_tweets(data_balanced['text'])
  # print("after clean  ",data_balanced)
  return data_balanced



In [9]:
#import dataset
# test_df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)
#import dataset from github
ADETrainingdata = pd.read_csv('https://raw.githubusercontent.com/FANMISUA/TweetAENormalization/main/ADEClassification/ADETraining.tsv', delimiter='\t')
ADEValidationdata = pd.read_csv('https://raw.githubusercontent.com/FANMISUA/TweetAENormalization/main/ADEClassification/ADEValidation.tsv', delimiter='\t')

In [10]:
print("Training data shape:",ADETrainingdata.shape)
print("Validation data shape:",ADEValidationdata.shape)

Training data shape: (7600, 3)
Validation data shape: (400, 3)


In [11]:
#import normalization data
RawTweets = pd.read_csv('https://raw.githubusercontent.com/FANMISUA/TweetAENormalization/main/ADENormalization/Dev/tweets.tsv',delimiter='\t')
SpansNorm = pd.read_csv('https://raw.githubusercontent.com/FANMISUA/TweetAENormalization/main/ADENormalization/Dev/spans_norm.tsv',delimiter='\t')

#check the tweets id in Spans Norm and Raw Tweets
# print("get the tweets id in Raw Tweets:\n",RawTweets.iloc[:,0])
# print("Get the tweets id in Spans Norm:\n",SpansNorm.iloc[:,0])
list1 = RawTweets.iloc[:,0]
list2 = SpansNorm.iloc[:,0]
common_values = set(list1).intersection(set(list2))
# print("common tweets id:",common_values)
# print("Total tweets:",len(set(list1)),"Tweets with AE:",len(set(list2)),"matched tweets:",len(common_values))

In [12]:
# train the ADE model with 7600 ADETrainingData
#print(ADETrainingdata.head() , ADEValidationdata.head())
#use the first 2000 rows of the training data
# ADETrainingdata[:2000]
# ADETrainingdata.iloc[:3,-3:]
#ADETrainingdata['tweet_id'][:200]
#only use the first 300 tweets in Training data as training
# print(ADETrainingdata['label'][:300].value_counts())
# training_batch_1 =ADETrainingdata[:300]
training_batch_1 =ADETrainingdata

# print(training_batch_1.head())
print("Training data shape",training_batch_1.shape)
print("Training data label distribution:\n",training_batch_1['label'].value_counts())

#testing dataset from validation dataset
#use first 100 for coding
# print(ADEValidationdata['label'][:100].value_counts())
testing_batch_1 =ADEValidationdata
print("Testing data shape",testing_batch_1.shape)
print("Testing data label distribution:\n",testing_batch_1['label'].value_counts())
# print(testing_batch_1.head())

Training data shape (7600, 3)
Training data label distribution:
 0    6266
1    1334
Name: label, dtype: int64
Testing data shape (400, 3)
Testing data label distribution:
 0    343
1     57
Name: label, dtype: int64


In [13]:
# proprecess training data: cleaning the tweets text
#balance data
# bal_training_data = balanced_data(training_batch_1)
#clean data
# clean_training_data = clean_data(bal_training_data)
#clean the training data tweets
clean_training_data = clean_data(training_batch_1)

# print(bal_data)
print("training data shape:",training_batch_1.shape,"clean training data shape",clean_training_data.shape)

# proprecess testing data
#all clean testing data unbalanced
clean_testing_data = clean_data(testing_batch_1)
#all testing data
print("testing data shape:",testing_batch_1.shape,"clean testing data shape:",clean_testing_data.shape)
#balance testing data
# bal_testing_data = balanced_data(testing_batch_1)
#clean balanced testing data
# clean_testing_data = clean_data(bal_testing_data)

# print("testing data shape:",testing_batch_1.shape,"all clean testing data shape:",all_testing_data.shape,
#       "clean balanced testing data shape:",clean_testing_data.shape)


training data shape: (7600, 3) clean training data shape (7600, 3)
testing data shape: (400, 3) clean testing data shape: (400, 3)


In [14]:
#load bert model. load tokenizer and model
#Testing small bert model 'distilbert-base-uncased'
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
#testing pubmed bert model
# model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'ml4pubmed/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext_pub_section')

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# print(clean_training_data)
# calculate the time to do tokenization
# first step to convert the text into tensor
import time
start_time = time.time()
features_train_1 = text_to_tensor(clean_training_data)
features_test_1 = text_to_tensor(clean_testing_data)
end_time = time.time()
tokenize_time = end_time - start_time
print(f"Running time: {tokenize_time:.4f} seconds")

print("training features dimension",features_train_1.shape)
print("testing features dimension",features_test_1.shape)

max len for padding:  114 (7600, 114)


In [None]:
#get the feature dimensions for the tensor
print(fe_train_1.shape,fe_test_1.shape,fe_test_2.shape)

torch.Size([2668, 88, 768]) torch.Size([24, 79, 768]) torch.Size([100, 82, 768])


In [None]:
#add NN layer to convert 3D tensor to 2D tensor
print(fe_train_1.shape, fe_test_1.shape, fe_test_2.shape)
fe_train_1_2d = tensor_3d_to_2d(fe_train_1).detach().numpy()
fe_test_1_2d = tensor_3d_to_2d(fe_test_1).detach().numpy()
fe_test_2_2d = tensor_3d_to_2d(fe_test_2).detach().numpy()

print(fe_train_1_2d.shape, fe_test_1_2d.shape, fe_test_2_2d.shape)



torch.Size([90, 83, 768]) torch.Size([24, 76, 768]) torch.Size([100, 82, 768])
90 83
24 76
100 82
(90, 768) (24, 768) (100, 768)


In [None]:
print(fe_train_1.shape)
test_fe2d = fe_train_1[:,0,:].numpy()
print(test_fe2d.shape)

torch.Size([2668, 88, 768])
(2668, 768)


In [None]:
#split the features and labels in training
train_features, test_features, train_labels, test_labels = train_test_split(fe_train_1_2d,  clean_training_data['label'], test_size = 0.2, random_state = 42)
print("training feature dimension",train_features.shape)
print("testing feature dimension",test_features.shape)
print("training label dimension",train_labels.shape)
print("training label dimension",test_labels.shape)


training feature dimension (72, 768)
testing feature dimension (18, 768)
training label dimension (72,)
training label dimension (18,)


In [None]:
#refine parameters for logisticRegression model
parameters = {'C': np.linspace(0.0001, 100, 10)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 44.444500000000005}
best scrores:  0.6257142857142857


In [None]:
lr_clf = LogisticRegression(C=grid_search.best_params_['C'])
lr_clf.fit(train_features, train_labels)

In [None]:
lr_clf.score(test_features, test_labels)

0.5555555555555556

In [None]:
lr_clf.score(train_features, train_labels)

1.0

In [None]:
lr_clf.fit(train_features, train_labels)

In [None]:
from sklearn.metrics import precision_recall_curve, accuracy_score, f1_score

#testing dataset
test_features_ba, test_labels_ba = fe_test_1_2d,  clean_testing_data['label']
print(test_features_ba.shape,test_labels_ba.shape)
#use the model to predict
predict_test_ba_labels = lr_clf.predict(test_features_ba)
precision = precision_score(test_labels_ba, predict_test_ba_labels)
recall = recall_score(test_labels_ba, predict_test_ba_labels)
accuracy =accuracy_score (test_labels_ba, predict_test_ba_labels)
f1 = f1_score(test_labels_ba, predict_test_ba_labels)


print("Precision: ", precision)
print("Recall: ", recall)
print("accuracy:",accuracy)
print("F1:",accuracy)

precision, recall , thresholds = precision_recall_curve(test_labels_ba, predict_test_ba_labels)


(24, 768) (24,)
Precision:  0.0
Recall:  0.0
accuracy: 0.5
F1: 0.5


In [None]:
from sklearn.metrics import precision_recall_curve, accuracy_score, f1_score

predict_test_labels = lr_clf.predict(test_features)
precision = precision_score(test_labels, predict_test_labels)
recall = recall_score(test_labels, predict_test_labels)
accuracy =accuracy_score (test_labels, predict_test_labels)
f1 = f1_score(test_labels, predict_test_labels)


print("Precision: ", precision)
print("Recall: ", recall)
print("accuracy:",accuracy)
print("F1:",accuracy)

precision, recall , thresholds = precision_recall_curve(test_labels, predict_test_labels)
thresholds
# print(test_labels.shape)
# print(predict_test_labels.shape)
tp = np.sum((predict_test_labels == 1) & (test_labels == 1))
tn = np.sum((predict_test_labels == 0) & (test_labels == 0))
fp = np.sum((predict_test_labels == 0) & (test_labels == 1))
fn = np.sum((predict_test_labels == 1) & (test_labels == 0))


print(test_labels)
print(predict_test_labels)
print("tp:",tp,"tn:",tn,"fp:",fp,"fn:",fn)

Precision:  0.8333333333333334
Recall:  0.4166666666666667
accuracy: 0.5555555555555556
F1: 0.5555555555555556


array([0, 1])

In [None]:
# print(test_labels.shape)
# print(predict_test_labels.shape)
tp = np.sum((predict_test_labels == 1) & (test_labels == 1))
tn = np.sum((predict_test_labels == 0) & (test_labels == 0))
fp = np.sum((predict_test_labels == 0) & (test_labels == 1))
fn = np.sum((predict_test_labels == 1) & (test_labels == 0))


print(test_labels)
print(predict_test_labels)
print("tp:",tp,"tn:",tn,"fp:",fp,"fn:",fn)

282    1
148    1
186    0
16     0
6      1
187    1
280    1
225    0
60     1
299    1
278    0
241    1
149    0
231    0
70     1
22     1
121    1
197    1
Name: label, dtype: int64
[0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 1]
tp: 5 tn: 5 fp: 7 fn: 1


In [None]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.542 (+/- 0.05)
