<a href="https://colab.research.google.com/github/ChiaoYunTing/Text-Analytics/blob/main/LSTM_News_headline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import pandas as pd
import nltk

from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from sklearn. preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Load News Headline Data

In [59]:
# import pandas as pd
# Load the dataset into a Pandas DataFrame
df = pd.read_json('Sarcasm_Headlines_Dataset.json', lines = True)
df

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
...,...,...,...
26704,https://www.huffingtonpost.com/entry/american-...,american politics in moral free-fall,0
26705,https://www.huffingtonpost.com/entry/americas-...,america's best 20 hikes,0
26706,https://www.huffingtonpost.com/entry/reparatio...,reparations and obama,0
26707,https://www.huffingtonpost.com/entry/israeli-b...,israeli ban targeting boycott supporters raise...,0


## Preprocess

In [60]:
# Check for missing values
missing_values = df.isnull().sum()
missing_values

article_link    0
headline        0
is_sarcastic    0
dtype: int64

In [61]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token.isalpha()]
    return ' '.join(tokens)

In [62]:
df['headline'] = df['headline'].apply(preprocess_text)

In [51]:
df

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues secret code mi...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,revival catches thorny political mood better w...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting fear son web series closest thing...,1
3,https://politics.theonion.com/boehner-just-wan...,boehner wants wife listen come alternative ideas,1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,rowling wishes snape happy birthday magical way,0
...,...,...,...
26704,https://www.huffingtonpost.com/entry/american-...,american politics moral,0
26705,https://www.huffingtonpost.com/entry/americas-...,america best hikes,0
26706,https://www.huffingtonpost.com/entry/reparatio...,reparations obama,0
26707,https://www.huffingtonpost.com/entry/israeli-b...,israeli ban targeting boycott supporters raise...,0


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


## Transform Data to Sequence and Padding

In [63]:
#Convert Text to Sequences
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Define the maximum number of words to consider and the maximum sequence length
max_words = 10000  # This is the vocabulary size
max_len = 100      # This is the maximum length of the sequences

# Create and fit the tokenizer
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['headline'])  # Train on cleaned data
sequences = tokenizer.texts_to_sequences(df['headline'])
word_index = tokenizer.word_index  # Dictionary mapping words to an integer index

# Check how a tokenized sequence looks like
print("Example of tokenized sequence:", sequences[0])

Example of tokenized sequence: [193, 618, 3033, 2040, 265, 1938, 2305, 7650]


In [64]:
# Pad sequences
data_padded = pad_sequences(sequences, maxlen=max_len)

# Check the shape of your data
print("Shape of data tensor:", data_padded.shape)

Shape of data tensor: (26709, 100)


In [65]:
#prepare lables
import numpy as np

labels = np.asarray(data['is_sarcastic'])

# Check the shape of your labels
print("Shape of label tensor:", labels.shape)

Shape of label tensor: (26709,)


In [66]:
# Split the data
sentence_train, sentence_test, labels_train, labels_test = train_test_split(data_padded, labels, test_size=0.2, random_state=42)

# Check the shapes of the splits
print("Training data shape:", sentence_train.shape)
print("Validation data shape:", sentence_test.shape)

Training data shape: (21367, 100)
Validation data shape: (5342, 100)


## LSTM Model with Attention Machenism

In [67]:
#Configure and Implement the Attention Layer
from keras.layers import Layer
import keras.backend as K

class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a
        return K.sum(output, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])


In [80]:
#Define the Model
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping

# Define the model with attention mechanism
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=len(word_index) + 1, output_dim=64, input_length=max_len)(input_layer)
# Add dropout after embedding
dropout_emb = Dropout(0.5)(embedding_layer)
lstm_layer = LSTM(64, return_sequences=True)(embedding_layer)
attention_layer = AttentionLayer()(lstm_layer)
dropout_att = Dropout(0.5)(attention_layer)
output_layer = Dense(1, activation='sigmoid')(attention_layer)

attention_model = Model(inputs=input_layer, outputs=output_layer)

# Compile and train the model with attention mechanism
attention_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics =['accuracy'])
# normal fitting with call back
attention_model.fit(sentence_train, labels_train, epochs=10)
# Predict on the test data
predictions = attention_model.predict(sentence_test)
# Convert predictions to binary labels
threshold = 0.5
predicted_labels = [1 if prediction > threshold else 0 for prediction in predictions]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [81]:
# Calculate precision, recall, and F1 score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(labels_test, predicted_labels)
precision = precision_score(labels_test, predicted_labels)
recall = recall_score(labels_test, predicted_labels)
f1 = f1_score(labels_test, predicted_labels)
#Print metrics
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)

Accuracy: 0.7871583676525645
Precision: 0.7756497948016415
Recall: 0.7250639386189258
F1 Score: 0.7495042961004627


## LSTM Model without Attention Machenism

In [85]:
from keras.layers import Input, Embedding, LSTM, Dropout, Dense, GlobalMaxPooling1D
from keras.models import Model

input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=len(word_index) + 1, output_dim=64, input_length=max_len)(input_layer)
dropout_emb = Dropout(0.5)(embedding_layer)
lstm_layer = LSTM(64, return_sequences=True)(dropout_emb)  # Maintaining return_sequences=True
dropout_lstm = Dropout(0.5)(lstm_layer)
pooling_layer = GlobalMaxPooling1D()(dropout_lstm)  # Adding a pooling layer
output_layer = Dense(1, activation='sigmoid')(pooling_layer)

lstm_model = Model(inputs=input_layer, outputs=output_layer)
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# normal fitting with call back
lstm_model.fit(sentence_train, labels_train, epochs=10)
# Predict on the test data
predictions = lstm_model.predict(sentence_test)
# Convert predictions to binary labels
threshold = 0.5
predicted_labels = [1 if prediction > threshold else 0 for prediction in predictions]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [86]:
# Calculate precision, recall, and F1 score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(labels_test, predicted_labels)
precision = precision_score(labels_test, predicted_labels)
recall = recall_score(labels_test, predicted_labels)
f1 = f1_score(labels_test, predicted_labels)
#Print metrics
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)

Accuracy: 0.7931486334706103
Precision: 0.7936583057264552
Recall: 0.7148337595907929
F1 Score: 0.7521865889212829


## Test Models on Another Sample Data