# Sentiment Analysis on Twitter

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup

from google.colab import drive

In [2]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds
# display the installed TensorFlow version
tf.__version__

'2.4.0'

## Preprocessing

In [3]:
# Mount Google Drive at /content/drive; the cells below read the dataset
# from drive/MyDrive/... (presumably relative to /content - confirm the
# working directory if running outside Colab).
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
# list the files in the sentiment140 dataset directory on the mounted drive
!ls "drive/MyDrive/Datasets/NLP/sentiment140_academics"

testdata.csv  training.csv


### load in the data via pandas

In [5]:
# column names to assign, since the CSV files are read without a header row
cols = ["label", "id", "date", "query", "user", "text"]

# read the training set into a DataFrame
train_data = pd.read_csv(
    "drive/MyDrive/Datasets/NLP/sentiment140_academics/training.csv",
    names=cols,
    header=None,
    encoding="latin1",
    engine="python",
)

In [6]:
# preview the first three rows of the training frame
train_data.head(3)

Unnamed: 0,label,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


In [7]:
# read the separate test CSV with the same column names as the training file
test_data = pd.read_csv(
    "drive/MyDrive/Datasets/NLP/sentiment140_academics/testdata.csv",
    names=cols,
    header=None,
    encoding="latin1",
    engine="python",
)

In [8]:
# preview the first three rows of the test frame
test_data.head(3)

Unnamed: 0,label,id,date,query,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."


### cleaning

In [9]:
# Keep only the columns needed for training. Selecting the wanted columns
# (instead of dropping the others in place) keeps this cell idempotent:
# re-running it no longer raises a KeyError for already-removed columns.
train_data = train_data[["label", "text"]]

train_data.columns

Index(['label', 'text'], dtype='object')

In [10]:
def clean_tweet(tweet):
    """Return a cleaned copy of a raw tweet string.

    Steps, in order:
      1. parse the text with BeautifulSoup and keep only its text content,
      2. replace @mentions with a space,
      3. replace http/https URLs with a space,
      4. replace every character that is not a letter or one of . ! ? '
         with a space,
      5. collapse runs of spaces into a single space.

    Args:
        tweet: the raw tweet text.

    Returns:
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Handles may contain underscores (e.g. "@_TheSpecialOne_"); without "_"
    # in the class the mention is not matched and the name stays in the text.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Consume the whole URL up to the next whitespace so that characters such
    # as "-", "_", "?", "=" or "&" do not leave URL fragments behind.
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # keep only letters and basic sentence punctuation
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    tweet = re.sub(r" +", ' ', tweet)  # collapse consecutive spaces
    return tweet

In [11]:
# run the cleaning function over every tweet in the text column
train_data_clean = list(map(clean_tweet, train_data.text))

In [12]:
# extract the label column as a NumPy array and show its distinct values
train_data_labels = train_data["label"].values
set(train_data_labels)

{0, 4}

In [13]:
# Replace label 4 with 1 so the labels are {0, 1}.
# np.where builds a new array rather than assigning into the array returned
# by `.values`, which may share memory with the DataFrame (silently changing
# it) or be read-only under pandas copy-on-write.
train_data_labels = np.where(train_data_labels == 4, 1, train_data_labels)
set(train_data_labels)

{0, 1}

### tokenization

In [14]:
# build a subword encoder from the cleaned tweets (target vocabulary size 2**16)
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    train_data_clean,
    target_vocab_size=2**16,
)

# encode every cleaned tweet into a list of subword ids
data_inputs = list(map(tokenizer.encode, train_data_clean))

### padding

In [20]:
# length of the longest encoded tweet
MAX_LEN = max(map(len, data_inputs))

# append zeros to every sequence so that all have length MAX_LEN
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    data_inputs,
    maxlen=MAX_LEN,
    padding="post",
    value=0,
)

### splitting data into train and test data

In [21]:
# Draw the held-out indices reproducibly and WITHOUT replacement.
# np.random.randint samples with replacement, which put duplicate rows into
# the test set and made its effective size vary from run to run.
SPLIT_SEED = 42          # fixed seed so the split is the same on every run
HALF_SIZE = 800000       # offset between the two index ranges sampled below
                         # (presumably one class per half - TODO confirm)
TEST_PER_HALF = 8000     # number of test rows taken from each half

split_rng = np.random.default_rng(SPLIT_SEED)
test_idx = split_rng.choice(HALF_SIZE, size=TEST_PER_HALF, replace=False)
# mirror the same positions into the second half of the data
test_idx = np.concatenate((test_idx, test_idx + HALF_SIZE))

In [23]:
# rows selected by test_idx form the test set
test_inputs, test_labels = data_inputs[test_idx], train_data_labels[test_idx]

# every row whose index is not in test_idx stays in the training set
keep_mask = np.ones(len(data_inputs), dtype=bool)
keep_mask[test_idx] = False
train_inputs = data_inputs[keep_mask]
train_labels = train_data_labels[keep_mask]

### build model

In [36]:
class DCNN(tf.keras.Model):
    """Convolutional text classifier with parallel n-gram feature extractors.

    Token ids are embedded and passed through three independent Conv1D layers
    (kernel sizes 2, 3 and 4), each followed by global max pooling. The pooled
    features are concatenated and fed through a dense layer, dropout and an
    output layer.

    Args:
        vocab_size: input dimension of the embedding layer.
        emb_dim: size of each embedding vector.
        nb_filters: number of filters in each of the three convolutions.
        FFN_units: number of units in the hidden dense layer.
        nb_classes: number of target classes; 2 gives a single sigmoid unit,
            any other value gives ``nb_classes`` softmax units.
        dropout_rate: rate of the dropout layer after the hidden dense layer.
        training: not used by the constructor; kept in the signature so
            existing callers that pass it keep working.
        name: name passed to ``tf.keras.Model``.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        # NOTE: attribute names are part of the object graph used by
        # tf.train.Checkpoint; renaming them would break restoring checkpoints.
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        # one convolution + global max pool per n-gram width (2, 3, 4)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.pool_1 = layers.GlobalMaxPool1D()
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.pool_2 = layers.GlobalMaxPool1D()
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool_3 = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units,
                                    activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        # binary case: one sigmoid unit; otherwise a softmax over the classes
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: batch of token-id sequences fed to the embedding layer.
            training: forwarded to the dropout layer. Defaults to None so the
                model can also be called without an explicit flag, in which
                case Keras decides from its own call context.

        Returns:
            The output of the final dense layer (one sigmoid unit for the
            binary case, ``nb_classes`` softmax units otherwise).
        """
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool_1(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool_2(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool_3(x_3)
        # each pooled tensor is (batch_size, nb_filters), so the concatenation
        # along the last axis is (batch_size, 3 * nb_filters)
        merged = tf.concat([x_1, x_2, x_3], axis=-1)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output


In [26]:
# --- model hyperparameters ---
VOCAB_SIZE = tokenizer.vocab_size          # taken from the fitted subword encoder
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = np.unique(train_labels).size  # number of distinct training labels
DROPOUT_RATE = 0.2

# --- training hyperparameters ---
BATCH_SIZE = 32
NB_EPOCHS = 5

### training

In [37]:
# instantiate the classifier from the configuration constants
dcnn = DCNN(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

In [47]:
# Pick the loss/metric that matches the model's output layer:
# a single sigmoid unit for two classes, a softmax over classes otherwise.
if NB_CLASSES == 2:
    dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    # was `Dcnn.compile(...)`: an undefined name that raised NameError
    # whenever more than two classes were present
    dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [42]:
# list the dataset directory again; the next cell uses this same directory
# as the checkpoint location
!ls "drive/MyDrive/Datasets/NLP/sentiment140_academics"

testdata.csv  training.csv


In [45]:
# directory in which the checkpoint manager keeps its files
checkpoint_path = "drive/MyDrive/Datasets/NLP/sentiment140_academics"

# track the model in a checkpoint and keep only the most recent save
ckpt = tf.train.Checkpoint(dcnn=dcnn)
ckpt_manager = tf.train.CheckpointManager(
    ckpt,
    checkpoint_path,
    max_to_keep=1,
)

# resume from an earlier save when one exists
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Lastest checkpoint restored!")

In [None]:
# train on the training split, then write a checkpoint once fitting finishes
dcnn.fit(
    x=train_inputs,
    y=train_labels,
    epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
)
ckpt_manager.save()

Epoch 1/5
  187/49503 [..............................] - ETA: 1:32:50 - loss: 0.6615 - accuracy: 0.5672