### Using COBERT to classify examples from the big dataset

In [1]:
import numpy as np
import pandas as pd
import regex as re
import string

import torch
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW
)

import random

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
max_length = 96

In [3]:
raw_data = pd.read_csv('new_tweets_only_txt.csv')

In [4]:
df_txt = raw_data['text']
df_txt.dropna()
df_txt = df_txt.astype(str)
# df_txt.to_csv('new_tweets_only_txt.csv')

In [5]:
target_names = ['legitimate','misinformation','irrelevant']

In [6]:
stop = stopwords.words('english')

In [7]:
def clean_text(row):
    # Lower case
    row = row.lower()
    
    # Remove URLs
    row = re.sub('http\S+|www.\S+', '', row)
    
    # Remove @mentions
    row = re.sub('@[A-Za-z0-9]+', '', row)
    
    # Remove non-standard characters
    row = row.encode("ascii", "ignore").decode()
    
    # Remove punctuation
    row = row.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stop words
    pat = r'\b(?:{})\b'.format('|'.join(stop))
    row = row.replace(pat, '')
    row = row.replace(r'\s+', ' ')
    
    # Remove extraneous whitespace
    row = row.strip()
    
    # Lemmatization
    wordnet_lemmatizer = WordNetLemmatizer()
    w_tokenization = nltk.word_tokenize(row)
    final = ""
    for w in w_tokenization:
        final = final + " " + wordnet_lemmatizer.lemmatize(w)
    
    return final

In [8]:
df = pd.DataFrame(df_txt)
clean_txt = df_txt.copy().apply(clean_text)
df['clean_text'] = clean_txt

In [9]:
df.tail()

Unnamed: 0,text,clean_text
31439,This report focuses on Canada's psychological ...,this report focus on canada psychological man...
31440,"@Shane_BSer @ongreenthings @ajamubaraka Right,...",bser right vaccination alone isnt an actual s...
31441,"@CDCgov you can just say Women. y'know, since ...",you can just say woman yknow since no one els...
31442,@DailyBlessFarm @S_S_Daisy @tatereeves @newway...,sdaisy you got that right i jumped the gun an...
31443,@ClayTravis What we know is the vaccine does n...,what we know is the vaccine doe not actually ...


In [10]:
PATH = 'fifth_miscov19-covid-twitter-bert-v2'
tokenizer = AutoTokenizer.from_pretrained(PATH, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(PATH, num_labels=len(target_names), local_files_only=True).to("cuda")

In [11]:
def get_prediction(text):
    # apply preprocessing to text
    inputs = clean_text(text)
    # prepare our text into tokenized sequence
    inputs = tokenizer(inputs, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return target_names[probs.argmax()]

In [17]:
small = df.iloc[::15,:]
len(small)

2097

In [18]:
small['COBERT_classification'] = df['clean_text'].copy().apply(get_prediction)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small['COBERT_classification'] = df['clean_text'].copy().apply(get_prediction)


In [19]:
small.to_csv("classified_2097.csv")