# Social Media Analysis - Airline Reviews Sentiment Analysis

# Import Packages and Loading Data 

In [2]:
import pandas as pd
import numpy as np
def _read_csv_ignoring_bad_bytes(path):
    """Read a CSV as UTF-8 text, silently dropping bytes that fail to decode."""
    with open(path, encoding="utf8", errors='ignore') as handle:
        return pd.read_csv(handle)


# Labelled training data and the unlabelled data to predict on.
train = _read_csv_ignoring_bad_bytes('train.csv')
test = _read_csv_ignoring_bad_bytes('test.csv')
    
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data Preprocessing

1. Setting training data and validation data:
    - random_seed = 42
    - train_size = 0.8; the remaining 0.2 is split evenly into validation (0.1) and test (0.1)
2. Preprocessing training data
    - tokenization
    - ensuring all sequences have the same dimension (padding)
3. Preprocessing testing data in the same way

In [9]:
# 1
# Reproducible 80/10/10 split: keep 80% of the labelled rows for training,
# then cut the remaining 20% in half for validation and held-out testing.
# A fixed seed makes the split (and every downstream metric) repeatable.
random_seed = 42
x_train, x_rem, y_train, y_rem = train_test_split(
    train['content'], train['category'], train_size = 0.8, random_state = random_seed)
x_val, x_test, y_val, y_test = train_test_split(
    x_rem, y_rem, test_size = 0.5, random_state = random_seed)

# map label into (0, 1, 2)
# Shifting by one gives non-negative class indices for the sparse
# cross-entropy loss (assumes the raw categories are -1 / 0 / 1 -- TODO confirm).
y_train = np.asarray(y_train) + 1
y_test = np.asarray(y_test) + 1
y_val = np.asarray(y_val) + 1

# Print one shape per line; separate statements avoid the stray
# `(None, None)` cell output that a tuple of print() calls produces.
for features, labels in ((x_train, y_train), (x_val, y_val), (x_test, y_test)):
    print(features.shape)
    print(labels.shape)

(25488,)
(25488,)
(3186,)
(3186,)
(3186,)
(3186,)


(None, None)

In [10]:
# 2
# Settings shared by the tokenizer and the padding step.
vocab_size = 10000
oov_token = "<OOV>"
max_length = 200
padding_type = 'post'
truncation_type = 'post'

# Tokenization: the vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_token)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index


def _pad_to_max_length(sequences):
    """Pad or truncate every sequence to `max_length` using the settings above."""
    return pad_sequences(sequences, maxlen = max_length,
                         padding = padding_type, truncating = truncation_type)


# Convert each split to integer sequences, then pad to a common length.
x_train_sequences = tokenizer.texts_to_sequences(x_train)
x_test_sequences = tokenizer.texts_to_sequences(x_test)
# NOTE: despite its name, `x_pad_sequences` holds the validation sequences.
x_pad_sequences = tokenizer.texts_to_sequences(x_val)

x_test_padded = _pad_to_max_length(x_test_sequences)
x_val_padded = _pad_to_max_length(x_pad_sequences)
x_train_padded = _pad_to_max_length(x_train_sequences)

In [11]:
# 3 Test data
# Run the unlabelled texts through the tokenizer fitted on the training set
# and pad them with the same settings as the training sequences.
test_dat = test['content']
test_dat_seq = tokenizer.texts_to_sequences(test_dat)
test_dat_padded = pad_sequences(
    test_dat_seq,
    maxlen = max_length,
    padding = padding_type,
    truncating = truncation_type,
)

# Using Pre-trained Embeddings (GloVe)

In [12]:
import os
# Map each GloVe token to its vector. Every line of the file is
# "<token> <v1> <v2> ... <vN>", separated by whitespace.
embeddings_index = {}
glove_path = os.path.join('glove.6B.txt', 'glove.6B.200d.txt')
# The context manager closes the file even if a line fails to parse.
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs

In [13]:
# The matrix width must equal the size of the pre-trained vectors, which is
# unrelated to the padded sequence length (`max_length`); read it from the
# loaded vectors so the two settings can change independently.
if not embeddings_index:
    raise ValueError('embeddings_index is empty; load the GloVe vectors first')
embedding_dim = len(next(iter(embeddings_index.values())))

# Row i holds the vector of the word with tokenizer index i; row 0 is unused
# because the tokenizer's word indices start at 1.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

zeroCnt = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # Words not found in the embedding index keep their all-zero row.
        zeroCnt += 1

# Percentage of vocabulary words that have no pre-trained embedding.
print('Words without pre-trained embedding counts', int(zeroCnt/len(word_index)*100), '%')

Words without pre-trained embedding counts 24 %


# Building Model

1. Setting embedding layer

In [14]:
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Frozen embedding layer initialised from the pre-trained matrix.
# `output_dim` is the embedding vector width, so it is taken from the matrix
# itself rather than from `max_length`, which is the padded sequence length
# and belongs only in `input_length`.
embedding_layer = Embedding(input_dim = embedding_matrix.shape[0],
                            output_dim = embedding_matrix.shape[1],
                            weights = [embedding_matrix],
                            input_length = max_length,
                            trainable = False)

2. Model Structure

In [15]:
from tensorflow.keras.models import Sequential
# Stack: frozen embeddings -> LSTM encoder -> two dense layers -> 3-way softmax.
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(150, dropout = 0.2))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(64, activation = 'relu'))
# One output unit per class; softmax turns the outputs into probabilities.
model.add(Dense(3, activation = 'softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 200)          6303800   
_________________________________________________________________
lstm (LSTM)                  (None, 150)               210600    
_________________________________________________________________
dense (Dense)                (None, 128)               19328     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 195       
Total params: 6,542,179
Trainable params: 238,379
Non-trainable params: 6,303,800
_________________________________________________________________


3. Compiling model

In [17]:
import tensorflow as tf
# The model's last layer already applies softmax, so the loss receives
# probabilities, not logits; `from_logits` must therefore be False.
model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False),
              optimizer = 'adam', metrics = ['accuracy'])

4. Training

In [18]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
%load_ext tensorboard

log_folder = 'logs'
num_epochs = 7
# Early stopping only has an effect when its patience is smaller than the
# epoch budget (a patience of 10 could never trigger within 7 epochs).
# Restoring the best weights keeps the model from the best validation epoch.
callbacks = [EarlyStopping(monitor = 'val_loss', patience = 3, restore_best_weights = True),
             TensorBoard(log_dir = log_folder)]
history = model.fit(x_train_padded, y_train, epochs = num_epochs,
                    validation_data = (x_val_padded, y_val), callbacks = callbacks)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


5. Testing Accuracy

In [20]:
# Score the trained model on the held-out split; evaluate() returns the loss
# followed by the compiled metrics (accuracy only).
evaluation = model.evaluate(x_test_padded, y_test)
loss, accuracy = evaluation
print('Test accuracy :', accuracy)

Test accuracy : 0.7611424922943115


# Prediction

In [21]:
# The model's final Dense layer already applies softmax, so its outputs are
# class probabilities. Stacking another Softmax layer on top would squash
# them a second time, so the model is used directly.
probability_model = model
predictions = probability_model.predict(test_dat_padded)

In [22]:
# Pick the most probable class per row and undo the +1 label shift.
test_dat_pred = [np.argmax(class_probs) - 1 for class_probs in predictions]

In [31]:
# Write one predicted label per line, without header or index column.
result_frame = pd.DataFrame(test_dat_pred)
result_frame.to_csv('result.csv', index = False, header = False)