In [None]:
# Install the required packages
!pip install --upgrade pip
!pip install transformers
!pip install hummingbird.ml
!pip install pandas
!pip install matplotlib
!pip install nltk
!pip install keras
!pip install tensorflow
!pip install numpy

In [None]:
# Chech GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# For using the Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# from sklearn.manifold import TSNE
# from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
# import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Set log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# csv_collection = []
# for dirname, _, filenames in os.walk('/content/drive/MyDrive/Colab Notebooks/csv/2022'):
#     for filename in filenames:
#         fullpath= os.path.join(dirname, filename)
#         csv_collection.append(fullpath)

In [None]:
# Input file for this run (relative path); it is read below as a tab-separated table.
file_name = "2022/tweets_2022_04.csv"

In [None]:
# Load the raw tweets: tab-separated, one record per '\n', first column as index.
# ISO-8859-1 maps every byte to a character, so decoding never raises.
# NOTE(review): if the file is really UTF-8, non-ASCII characters (emoji,
# accents) will come out garbled with this encoding - confirm the source encoding.
df = pd.read_csv(
    file_name,
    sep='\t',
    index_col=0,
    lineterminator='\n',
    encoding="ISO-8859-1",
)

In [None]:
# csv_collection

In [None]:
#  df = pd.read_csv(csv_collection.pop(), index_col=0, sep='\t', lineterminator='\n')

In [None]:
# for data in csv_collection:
#     tmp = pd.read_csv(data, index_col =0, sep='\t', lineterminator='\n')
#     df = pd.concat([df, tmp], axis=0)

In [None]:
# Keep only the rows tagged as English, then drop the now-constant language column.
df_en = df.loc[df["language"] == 'en'].drop(columns='language')

In [None]:
def preprocess(text):
    """Mask user mentions and links in a tweet with fixed placeholders.

    The text is split on single spaces. Every token that starts with ``@`` and
    has at least one more character becomes ``@user``; every token that starts
    with ``http`` becomes ``http``. Tokens are re-joined with single spaces, so
    the spacing of the input is preserved. Tabs and newlines are not treated
    as separators.

    Args:
        text: The tweet text. Non-string values (e.g. ``None`` or the float
            ``NaN`` that pandas uses for missing cells) are returned unchanged
            instead of raising ``AttributeError``.

    Returns:
        The masked text, or ``text`` itself when it is not a string.
    """
    # Missing cells reach this function when it is applied to a DataFrame
    # column; pass them through so a later dropna() can remove them.
    if not isinstance(text, str):
        return text

    tokens = []
    for token in text.split(" "):
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        elif token.startswith('http'):
            token = 'http'
        tokens.append(token)
    return " ".join(tokens)

In [None]:
%%time
df_en.tweet = df_en.tweet.apply(lambda x: preprocess(x))

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig, Trainer
from hummingbird.ml import convert
from transformers import TrainingArguments



#file_name = "/content/drive/MyDrive/Colab Notebooks/csv/"
# import torch
import numpy as np
from scipy.special import softmax
class SimpleDataset:
    """Index-addressable view over a mapping of per-example sequences.

    ``tokenized_texts`` maps field names to indexable sequences. The dataset
    length is the number of entries under the ``"input_ids"`` key, and item
    ``idx`` is a dict holding the ``idx``-th entry of every field.
    """

    def __init__(self, tokenized_texts):
        # Stored by reference; nothing is copied.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.tokenized_texts.items():
            sample[field] = values[idx]
        return sample
# Choose the task; it is interpolated into the checkpoint name here and into
# the TweetEval label-mapping URL in a later cell.
task = 'emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
# MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"

# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
#config = AutoConfig.from_pretrained(MODEL)
# PT


In [None]:
# Emotion Labels
import urllib.request
import csv

# Download the mapping file for the selected task and split it into lines.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')

# The second tab-separated field of each row is taken as the label name;
# rows with fewer than two fields (e.g. a trailing empty line) are skipped.
labels = []
for row in csvreader:
    if len(row) > 1:
        labels.append(row[1])

In [None]:
# Use CUDA only when a GPU is actually available. A hard-coded `True` makes
# `model.to('cuda')` raise on CPU-only runtimes.
import torch

CUDA = torch.cuda.is_available()
# BATCH_SIZE = 32
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
if CUDA:
    model.to('cuda')

In [None]:
# Set the evaluation batch size on a TrainingArguments *instance*.
# Assigning `TrainingArguments.eval_batch_size = 8` on the class replaces a
# library attribute for every Trainer created in this session.
# "tmp_trainer" mirrors the output directory Trainer falls back to when it is
# created without arguments (transformers 4.x).
# NOTE: the size is per device, so the effective batch is 8 x number of GPUs.
training_args = TrainingArguments(
    output_dir="tmp_trainer",
    per_device_eval_batch_size=8,
)
trainer = Trainer(model=model, args=training_args)

In [None]:
# Column holding the text to classify.
text_column = "tweet"

# Drop missing values, force everything to str, and hand a plain list to the tokenizer.
pred_texts = (
    df_en[text_column]
    .dropna()
    .astype('str')
    .tolist()
)


In [None]:
# Number of texts that will be tokenized and classified.
len(pred_texts)

In [None]:
# Tokenize all texts in one call: pad to a common length and cut anything
# longer than 128 tokens.
tokenized_texts = tokenizer(
    pred_texts,
    truncation=True,
    padding=True,
    max_length=128,
)

# Wrap the tokenizer output so the Trainer can index individual examples.
pred_dataset = SimpleDataset(tokenized_texts)

In [None]:
# Run the model over the whole prediction dataset; the raw outputs are
# turned into class ids, label names and scores in the next cell.
predictions = trainer.predict(pred_dataset)

In [None]:
# Raw model outputs; `predictions[0]` and `predictions.predictions` are the
# same array, so read it once and reuse it.
logits = predictions.predictions

# Index of the highest-scoring class for each input.
preds = logits.argmax(-1)

# Translate class indices to label names using the model's own config.
labels = pd.Series(preds).map(model.config.id2label)

# Probability of the predicted class. scipy's softmax subtracts the row
# maximum before exponentiating, so it cannot overflow the way a hand-written
# exp(x) / sum(exp(x)) does for large logits.
scores = softmax(logits, axis=-1).max(axis=1)

In [None]:
# Assemble one row per classified text: (text, class id, label name, score).
# NOTE: this rebinds `df`, which previously held the raw tweets loaded from CSV.
rows = list(zip(pred_texts, preds, labels, scores))
df = pd.DataFrame(rows, columns=['text', 'pred', 'label', 'score'])
df.head()

In [None]:
 # Create the destination folder first: `to_csv` does not create missing
 # directories, and failing here would throw away the whole prediction run.
 output_path = "csv/2022/Sentiment_Emoji_2022_04.csv"
 os.makedirs(os.path.dirname(output_path), exist_ok=True)
 df.to_csv(output_path)