# Tweets Natural Language Processing

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive so that later cells can read the CSV
# files stored under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing the TensorFlow dataset (one-time export — do not re-run)

https://www.tensorflow.org/datasets/catalog/sentiment140

In [None]:
# Carve the single 'train' split into two parts: the first 70% is used for
# training and the remaining 30% for validation.
splits = ['train[:70%]', 'train[70%:]']

# as_supervised=True yields (input, label) pairs; with_info=True additionally
# returns the dataset metadata object.
datasets, dataset_info = tfds.load(
    'sentiment140',
    split=splits,
    with_info=True,
    as_supervised=True,
)
training_set, validation_set = datasets

[1mDownloading and preparing dataset sentiment140/1.0.0 (download: 77.59 MiB, generated: 305.13 MiB, total: 382.73 MiB) to /root/tensorflow_datasets/sentiment140/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]






0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/sentiment140/1.0.0.incomplete9A4DT8/sentiment140-train.tfrecord


  0%|          | 0/1600000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/sentiment140/1.0.0.incomplete9A4DT8/sentiment140-test.tfrecord


  0%|          | 0/498 [00:00<?, ? examples/s]

[1mDataset sentiment140 downloaded and prepared to /root/tensorflow_datasets/sentiment140/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Print the metadata object returned by tfds.load(..., with_info=True).
print(dataset_info)

tfds.core.DatasetInfo(
    name='sentiment140',
    version=1.0.0,
    description='Sentiment140 allows you to discover the sentiment of a brand, product, or topic on Twitter.

The data is a CSV with emoticons removed. Data file format has 6 fields:

0. the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1. the id of the tweet (2087)
2. the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3. the query (lyx). If there is no query, then this value is NO_QUERY.
4. the user that tweeted (robotickilldozr)
5. the text of the tweet (Lyx is cool)

For more information, refer to the paper
Twitter Sentiment Classification with Distant Supervision at
https://cs.stanford.edu/people/alecmgo/papers/TwitterDistantSupervision09.pdf',
    homepage='http://help.sentiment140.com/home',
    features=FeaturesDict({
        'date': Text(shape=(), dtype=tf.string),
        'polarity': tf.int32,
        'query': Text(shape=(), dtype=tf.string),
        'text': Text(shape=(), dtype=tf.string),


In [None]:
# Take a fixed-size sample of each split and materialise it as a DataFrame.
train_sample = training_set.take(112000)        # 10% of the training set
validation_sample = validation_set.take(48000)  # 10% of the validation set

training_set_converted = tfds.as_dataframe(train_sample, dataset_info)
validation_set_converted = tfds.as_dataframe(validation_sample, dataset_info)

# Preview the first rows of both frames.
for frame in (training_set_converted, validation_set_converted):
    print(frame.head())

   polarity                                               text
0         4                b"i'm 10x cooler than all of you! "
1         0  b'O.kk? Thats weird I cant stop following peop...
2         4  b'what a beautiful day not to got to my first ...
3         4  b".@HildyGottlieb &amp; I was just saying to M...
4         0    b'kinda sad and confused  why do guys do this?'
   polarity                                               text
0         4  b'@ShiftParadigm And most of those new words p...
1         4  b'@vikusia good on you, Victoria  more people ...
2         4  b"@YoungQ An since you're asking....Hair just ...
3         4  b'My light bulb blew out so my dad fixed it! Y...
4         4  b'I love my family so much!!!!! They make me s...


In [None]:
# Export the sampled frames to the Drive folder that the re-import cell reads
# from, so the notebook is consistent end to end.
# index=False keeps the positional index out of the file; otherwise it comes
# back as a spurious "Unnamed: 0" column every time the CSV is reloaded.
training_set_converted.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/training_set.csv', index=False)
validation_set_converted.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/validation_set.csv', index=False)

# Re-importing the exported TensorFlow dataset from Drive (run this)

In [None]:
# Reload the tweet samples from the CSV files stored on Google Drive.
# Note: from here on `training_set` / `validation_set` are pandas DataFrames;
# the names are rebound from the objects returned by tfds.load earlier.
training_set = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/training_set.csv')
validation_set = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/validation_set.csv')
# Last expression of the cell: display the first rows of the training frame.
training_set.head()

Unnamed: 0.1,Unnamed: 0,polarity,text
0,0,4,"b""i'm 10x cooler than all of you! """
1,1,0,b'O.kk? Thats weird I cant stop following peop...
2,2,4,b'what a beautiful day not to got to my first ...
3,3,4,"b"".@HildyGottlieb &amp; I was just saying to M..."
4,4,0,b'kinda sad and confused why do guys do this?'


# Preprocessing the TensorFlow dataset for training

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Raw tweet texts and their polarity labels.
# NOTE(review): the reloaded CSV text still carries the Python bytes repr
# (b'...'), so every tweet starts with a stray "b" token — consider decoding
# the text before export.
train = training_set['text'].tolist()
val = validation_set['text'].tolist()

# Rescale the polarity labels from {0, 4} to {0, 1} so they match the
# sigmoid output / binary cross-entropy loss used by the model.
train_labels = training_set['polarity'].to_numpy() / 4
val_labels = validation_set['polarity'].to_numpy() / 4

# Only the `vocabulary_size` most frequent words get their own index.
vocabulary_size = 10000
# A tweet is limited to 280 characters, so 280 is a safe upper bound on the
# number of tokens in a tweet; every sequence is padded to this length.
tweet_length = 280

# <OOV> is the placeholder token for words outside the vocabulary.
tokenizer = Tokenizer(num_words=vocabulary_size, oov_token='<OOV>')
# The vocabulary is learned from the training tweets only.
tokenizer.fit_on_texts(train)

# Convert each tweet into a sequence of word indices, then append zeros to
# the end of tweets that have fewer than `tweet_length` tokens.
train_sequences = tokenizer.texts_to_sequences(train)
train_padded = pad_sequences(train_sequences, maxlen=tweet_length, padding='post')

# Same transformation for the validation tweets (previously referenced the
# undefined name `validation` and a hard-coded 280).
val_sequences = tokenizer.texts_to_sequences(val)
val_padded = pad_sequences(val_sequences, maxlen=tweet_length, padding='post')

# Training a sentiment analysis model

In [None]:
# Size of the vector each word index is mapped to. An embedding represents
# words in a vector space in which words that lie closer together are similar
# in meaning.
embedding_dimension = 16

# Embedding -> flatten -> small dense layer -> single sigmoid unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocabulary_size, embedding_dimension, input_length=tweet_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 280, 16)           160000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 4480)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 26886     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 186,893
Trainable params: 186,893
Non-trainable params: 0
_________________________________________________________________


In [None]:
# The model starts overfitting after 3-4 epochs, so training is kept short.
num_epochs = 5

# Train on the padded training tweets and evaluate on the validation tweets
# after every epoch.
model.fit(
    x=train_padded,
    y=train_labels,
    epochs=num_epochs,
    validation_data=(val_padded, val_labels),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f55abc24dd0>

# Applying the model to the Eurovision dataset

In [None]:
# Load the replies to the pinned tweet from Google Drive.
my_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/pinned_tweet_replies.csv')
# Number of rows, i.e. number of reply tweets.
num_tweets = len(my_data)
my_data.head()

Unnamed: 0.1,Unnamed: 0,ID,Text,User ID,Username
0,0,1396417830409605122,@Eurovision @thisismaneskin y'all so mad they ...,1032734148996292609,hannucciahorta
1,1,1396417787116072963,@Eurovision Superhappy for my fav group this y...,214934153,LesPaulTK
2,2,1396417499701329922,@Eurovision @thisismaneskin We’ve got the vacc...,1227988176364589058,zack92581123
3,3,1396416365964275718,@Eurovision @thisismaneskin Its a shame bad mu...,997228507905261570,critiamigas
4,4,1396416151706644481,@Eurovision @thisismaneskin Congratulations to...,303428420,786maq


In [None]:
# data preprocessing

import regex as re

def remove_url(txt):
    """Strip URLs from ``txt`` and collapse runs of whitespace.

    Both ``http:`` and ``https:`` links are removed (previously only
    ``https:`` links were, so plain-http URLs leaked into the model input).

    Args:
        txt: The text to clean.

    Returns:
        ``txt`` without URL tokens, with words separated by single spaces and
        no leading/trailing whitespace.
    """
    without_urls = re.sub(r'https?:\S+', '', txt)
    # split() + join normalises the whitespace left behind by the removal.
    return ' '.join(without_urls.split())

def remove_hashtags(txt):
    """Drop every ``#hashtag`` token from ``txt`` and normalise whitespace.

    A hashtag is a ``#`` followed by any run of non-whitespace characters.
    The result has words separated by single spaces and no leading or
    trailing whitespace.
    """
    stripped = re.sub(r'#\S+', '', txt)
    words = stripped.split()
    return ' '.join(words)

def remove_handles(txt):
    """Drop every ``@handle`` token from ``txt`` and normalise whitespace.

    A handle is an ``@`` followed by any run of non-whitespace characters.
    The result has words separated by single spaces and no leading or
    trailing whitespace.
    """
    stripped = re.sub(r'@\S+', '', txt)
    words = stripped.split()
    return ' '.join(words)

# Clean every reply in a single pass: URLs first, then hashtags, then handles.
tweets = [
    remove_handles(remove_hashtags(remove_url(raw_text)))
    for raw_text in my_data['Text'].to_list()
]

# Preview the cleaned tweets at positions 1 through 9.
for cleaned in tweets[1:10]:
    print(cleaned)

Superhappy for my fav group this year! Rock n’ roll always lives on🤘🏻im sad tho ‘cause me and my gf voted, 10min after lines opened, 20x (each) for Italy+other countries including 🇪🇸 and 🇩🇪 that weren’t accepted. None of our votes were counted on time but we still paid €9 each..
We’ve got the vaccine unlucky Europe
Its a shame bad music and drugs addiction were the winners of this year. Its very disappointing after covid eurovision let people get high in front of the cameras. Not only in the show, but also when they are interviewed after winning.
Congratulations to Italy and can someone explain why Isreal the occupiers of Palistinian land is in the Eurovision? it's not even Europe???
Tongo!!! They weren't the best, awful song, bad singer. Switzerland, France or even Iceland were far more better. How sad!!!
(( How Italy was given over 300 points by the public I'll never know. Honestly the real winning competitors for me were France, Iceland, Azerbaijan or the funny dancers Lithuania. ))

In [None]:
# Reuse the tokenizer exactly as it was fitted on the training tweets.
# It must NOT be re-fitted on the new tweets: fit_on_texts updates the word
# counts and rebuilds the word -> index mapping, so the indices fed to the
# model would no longer match the ones its embedding layer was trained on.
# Words unseen during training are mapped to the <OOV> token instead.
tweets_sequences = tokenizer.texts_to_sequences(tweets)
# Pad to the same fixed length the model was trained with.
tweets_padded = pad_sequences(tweets_sequences, maxlen=tweet_length, padding='post')

# One sigmoid score per tweet: close to 0 = negative, close to 1 = positive.
predictions = model.predict(tweets_padded)

In [None]:
import random

# Pick 10 random tweet positions (repeats are possible).
# randrange(num_tweets) draws from 0 .. num_tweets - 1; the previous
# randint(0, num_tweets) included num_tweets itself and could therefore raise
# an IndexError when indexing `tweets` / `predictions`.
random_indices = [random.randrange(num_tweets) for _ in range(10)]

# Show each sampled tweet followed by its predicted sentiment score.
for i in random_indices:
    print(tweets[i])
    print(predictions[i])
    print('\n')

It’s all politics: UK getting 0 points out of pettyness for leaving the EU, old Russian countries being voted for by Russia, countries next to Russia voting for Russia out of fear, the Scandinavians banding together- I hope the UK pulls out
[0.00093159]


UK should send Bring Me The Horizon next year, come on!
[0.7451227]


This is OUTRAGEOUS, taking a line of C in front of millions of people, families and kids…they should be disqualified right away!!
[0.05906588]


Congratulations!!!!
[0.68092906]


Big up to James Newman for taking those 0 points like an absolute champ!
[0.981828]


deserved 🇮🇹🇮🇹🇮🇹🇮🇹
[0.62703013]


no more
[0.7197749]


DIVINE JUSTICE. 🇮🇹🇮🇹🇮🇹
[0.8242257]


Pathetic
[0.68974984]


Congratulations 👍🏻👏🏻
[0.7669476]


