In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#pre processing
import regex as re
import string
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import Model

#
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel, TFAutoModelForSequenceClassification

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [12]:
#set style for plots
sns.set_style('white')
sns.despine()
#plt.style.use('seaborn-whitegrid')
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

<Figure size 640x480 with 0 Axes>

In [13]:
df= pd.read_csv('../dataset/final.csv')
df.head()

Unnamed: 0,sentiment,tweet
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
2,0.0,My cat only chews @apple cords. Such an #Apple...
3,0.0,I agree with @jimcramer that the #IndividualIn...
4,0.0,Nobody expects the Spanish Inquisition #AAPL


In [14]:
df.iloc[0].tweet

'#AAPL:The 10 best Steve Jobs emails ever...http://t.co/82G1kL94tx'

### Deep Data Cleaning

In [15]:
def strip_all_entities(text): 
    text = text.replace('\r', '').replace('\n', ' ').replace('\n', ' ').lower() #remove \n and \r and lowercase
    text = re.sub(r"(?:\@|https?\://)\S+", "", text) #remove links and mentions
    text = re.sub(r'[^\x00-\x7f]',r'', text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list= string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(tweet):
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

def remove_mult_spaces(text): # remove multiple spaces
    return re.sub("\s\s+" , " ", text)

In [16]:
texts_new = []
for t in df.tweet:
    texts_new.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(t)))))

In [17]:
df['clean_text']=texts_new
df.head()

Unnamed: 0,sentiment,tweet,clean_text
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...,aaplthe 10 best steve jobs emails ever
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...,rt why aapl stock had a miniflash crash today ...
2,0.0,My cat only chews @apple cords. Such an #Apple...,my cat only chews cords such an applesnob
3,0.0,I agree with @jimcramer that the #IndividualIn...,i agree with that the individualinvestor shoul...
4,0.0,Nobody expects the Spanish Inquisition #AAPL,nobody expects the spanish inquisition aapl


In [18]:
text_len = []
for text in df.clean_text:
    tweet_len = len(text.split())
    text_len.append(tweet_len)

In [19]:
df['text_len']=text_len
df[['sentiment', 'clean_text']].to_csv('../dataset/data.csv', index=False)

In [20]:
df.iloc[0].clean_text

'aaplthe 10 best steve jobs emails ever'

In [21]:
df.head()

Unnamed: 0,sentiment,tweet,clean_text,text_len
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...,aaplthe 10 best steve jobs emails ever,7
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...,rt why aapl stock had a miniflash crash today ...,11
2,0.0,My cat only chews @apple cords. Such an #Apple...,my cat only chews cords such an applesnob,8
3,0.0,I agree with @jimcramer that the #IndividualIn...,i agree with that the individualinvestor shoul...,21
4,0.0,Nobody expects the Spanish Inquisition #AAPL,nobody expects the spanish inquisition aapl,6


### Balancing the dataset

In [22]:
df['sentiment'].value_counts()

sentiment
 0.0    3676
-1.0    2235
 1.0     704
Name: count, dtype: int64

In [23]:
ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(df['clean_text']).reshape(-1, 1), np.array(df['sentiment']).reshape(-1, 1))
train_os = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['clean_text', 'sentiment'])
train_os['sentiment'].value_counts()

sentiment
 0.0    3676
 1.0    3676
-1.0    3676
Name: count, dtype: int64

In [24]:
train_os.head()


Unnamed: 0,clean_text,sentiment
0,aaplthe 10 best steve jobs emails ever,0.0
1,rt why aapl stock had a miniflash crash today ...,0.0
2,my cat only chews cords such an applesnob,0.0
3,i agree with that the individualinvestor shoul...,0.0
4,nobody expects the spanish inquisition aapl,0.0


### train-validation dataset split

In [145]:
from datasets import load_dataset

dataset = load_dataset("csv", split='train', data_files="../dataset/data.csv")

X = train_os['clean_text'].values
y = train_os['sentiment'].values

In [146]:
X_train, X_valid, y_train, y_valid = train_test_split(dataset['clean_text'], dataset['sentiment'], test_size=0.1, stratify=y, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [6615, 11028]

### Tokenizing

In [27]:
tokenizer= RobertaTokenizerFast.from_pretrained("roberta-base")

In [151]:
def tokenize(txt):
    token = tokenizer.encode(txt, max_length=512, truncation=True, padding=True, return_tensors='tf')
    return token

train_encoding = dataset['clean_text'].map(tokenize)


AttributeError: 'list' object has no attribute 'map'

### RoBERTa Sentiment Analysis

### Building Model

In [109]:
roberta_model = TFAutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## PPLX rocks


In [131]:

roberta_model.build(X_train.shape)

help(roberta_model)

# Freeze pre-trained layers
for layer in roberta_model.layers:
    layer.trainable = False


# Add new layers for your specific task

# print(model.summary())
# model.build(X_train.shape)

# print(model.summary())

# print(roberta_model.summary())

# Compile the model
# model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train the model
# model.fit(X_train, epochs=10, validation_data=X_valid)


Help on TFRobertaForSequenceClassification in module transformers.models.roberta.modeling_tf_roberta object:

class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, transformers.modeling_tf_utils.TFSequenceClassificationLoss)
 |  TFRobertaForSequenceClassification(config, *inputs, **kwargs)
 |  
 |  RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
 |  pooled output) e.g. for GLUE tasks.
 |  
 |  
 |  This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
 |  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
 |  etc.)
 |  
 |  This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
 |  as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
 |  behavior.
 |  
 |  <Tip>
 |  
 |  Tenso