In [4]:
import re
import nltk
import random
import numpy as np
import pandas as pd
import seaborn as sn
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential, load_model

data = pd.read_csv("./FA-KES-Dataset.csv", encoding='latin1')
print(data.head())

      unit_id                                      article_title  \
0  1914947530  Syria attack symptoms consistent with nerve ag...   
1  1914947532  Homs governor says U.S. attack caused deaths b...   
2  1914947533    Death toll from Aleppo bomb attack at least 112   
3  1914947534        Aleppo bomb blast kills six Syrian state TV   
4  1914947535  29 Syria Rebels Dead in Fighting for Key Alepp...   

                                     article_content source       date  \
0  Wed 05 Apr 2017 Syria attack symptoms consiste...    nna   4/5/2017   
1  Fri 07 Apr 2017 at 0914 Homs governor says U.S...    nna   4/7/2017   
2  Sun 16 Apr 2017 Death toll from Aleppo bomb at...    nna  4/16/2017   
3  Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...    nna  4/19/2017   
4  Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...    nna  7/10/2016   

  location  labels  
0    idlib       0  
1     homs       0  
2   aleppo       0  
3   aleppo       0  
4   aleppo       0  


In [5]:
print(data.isnull().sum())
X = data.drop(columns=['labels'])
Y = data['labels']

unit_id            0
article_title      0
article_content    0
source             0
date               0
location           0
labels             0
dtype: int64


In [6]:
# Define the vocabulary size for tokenization.
voc_size = 5000

# Make a copy of the dataset to avoid modifying the original DataFrame.
messages = X.copy()

# Reset index for clean sequential indexing.
messages.reset_index(inplace=True)

##########################################################################
# TODO: Download the NLTK stopwords resource needed for preprocessing.
##########################################################################
# Replace the "pass" statement with your code
nltk.download('stopwords')
##########################################################################
#                           END OF YOUR CODE                             #
##########################################################################


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\busin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [7]:
ps = PorterStemmer()

##########################################################################
# TODO: Preprocess each text entry in `messages['article_title']` and
# construct a cleaned list of strings stored in a variable called `corpus`.
#
# Your preprocessing steps should include:
#   1. Removing all non-alphabetic characters (hint: use re.sub)
#   2. Converting text to lowercase
#   3. Splitting text into tokens
#   4. Removing English stopwords
#   5. Applying Porter stemming to each remaining token
#   6. Joining tokens back into a single string and appending to `corpus`
##########################################################################
# Replace the "pass" statement with your code
# ایجاد شیء ریشه‌یاب
corpus = []

# فرآیند تمیزکاری متون (پیش‌فرض بر روی ستون article_title یا محتوای ترکیبی)
for i in range(0, len(messages)):
    # ۱. حذف تمامی کاراکترهای غیر از حروف الفبا
    review = re.sub('[^a-zA-Z]', ' ', messages['article_title'][i])
    # ۲. کوچک‌سازی حروف
    review = review.lower()
    review = review.split()
    
    # ۳. حذف Stopwords و انجام Stemming
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
##########################################################################
#                           END OF YOUR CODE                             #
##########################################################################

# Display a random sample of cleaned text entries
for i in random.sample(corpus, 10):
    print(i)


civilian kill terrorist rocket attack aleppo idleb
least kill syrian weapon depot blast
russian aerospac forc destroy isi oil refin station kill terrorist past week
hizballah lead regim offens southern syria
dead strike rebel held hospit syria
shell rebel bastion near syrian capit kill
kill injur terrorist rocket attack aleppo daraa
kurdish led forc elimin daesh milit near al tabqah northern syria
russia terrorist breach cessat hostil time hour breach aleppo
russia say belgian strike kill six aleppo region


***WORD EMBEDDING***

In [8]:
##########################################################################
# TODO: Convert each preprocessed text in `corpus` into a one-hot encoded
# representation using the `one_hot` function and store the result in a
# variable named **onehot_repr**.
##########################################################################
# Replace the "pass" statement with your code
onehot_repr = [one_hot(words, voc_size) for words in corpus]
##########################################################################
#                           END OF YOUR CODE                             #
##########################################################################

# Display a random sample of one-hot encoded sequences
for i in random.sample(onehot_repr, 10):
    print(i)


[4077, 2347, 4168, 1035, 1933, 3003, 1226, 2285]
[4287, 1035, 4269, 3679, 533, 271, 847]
[2707, 4168, 4269, 1226, 1035, 2782, 4915, 4077]
[2798, 247, 1035, 3201, 4077, 4915, 1554, 4951, 2343]
[4269, 3679, 1035, 517, 556, 1410, 4287, 2105, 3245, 2963]
[4794, 824, 4077, 1861, 1784, 3112, 4575]
[770, 1783, 1035, 4492, 1208, 4406, 3343, 4077, 2347, 4168]
[770, 4915, 4168, 1035, 2072, 4077, 2285]
[415, 615, 3980, 4168, 3696, 1035, 4730]
[1940, 1861, 1035, 1153, 4287, 3204, 3696]


In [9]:
sent_length = 40

##########################################################################
# TODO: Pad all one-hot encoded sequences in `onehot_repr` to a fixed
# length of `sent_length` using `pad_sequences`. Store the result in a
# variable named **embedded_docs**.
##########################################################################
# Replace the "pass" statement with your code
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
##########################################################################
#                           END OF YOUR CODE                             #
##########################################################################

print(embedded_docs.shape)
print(embedded_docs[0])


(804, 40)
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 4077  829 4550 1314 1765  666 3474]


***DEFINING BOTH MODELS***


*   RNN
*   CNN+RNN



In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense
from tensorflow.keras.optimizers import Adam

embedding_vector_features = 100

##########################################################################
# TODO: Build a hybrid CNN + LSTM model
##########################################################################
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# کامپایل مدل با بهینه‌ساز Adam (نرخ یادگیری معمولا 0.001 یا طبق مقاله)
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
##########################################################################
#                           END OF YOUR CODE                             #
##########################################################################

print(model.summary())

##########################################################################
# TODO: Build a pure RNN model (using LSTM layer)
##########################################################################
model_RNN = Sequential()
model_RNN.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model_RNN.add(LSTM(32))
model_RNN.add(Dense(1, activation='sigmoid'))

model_RNN.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
##########################################################################
#                           END OF YOUR CODE                             #
##########################################################################

print(model_RNN.summary())



None


None


***TRAIN TEST SPLITTING***

In [11]:
print(embedded_docs.shape)
print(Y.shape)

X_final = np.array(embedded_docs)
Y_final = np.array(Y)
X_final.shape , Y_final.shape

x_train, x_test, y_train, y_test = train_test_split(X_final, Y_final, test_size=0.33, random_state=42)

(804, 40)
(804,)


***HYBRID CNN+RNN TRAINING***

In [None]:
filepath = "my_best_model.keras"   # modern Keras format

# Callback to save the model that achieves the best validation accuracy
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor="val_accuracy",
    verbose=1,
    save_best_only=True,
    mode="max"
)

callbacks = [checkpoint]

##########################################################################
# TODO: Train the model using the appropriate hyperparameters based on 
# the paper you are following. Use:
#   - (x_train, y_train) as training data
#   - (x_test, y_test) as validation data
#   - the callback list defined above
#
# Store the training output in a variable named **history**.
#
# NOTE: You must determine the correct training hyperparameters (epochs,
# batch size, optimizer settings, etc.) from the referenced paper.
##########################################################################
# Replace the "pass" statement with your code
pass
##########################################################################
#                           END OF YOUR CODE                             #
##########################################################################


In [None]:
# -----------------------------
# Plot Accuracy
# -----------------------------
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Hybrid Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# -----------------------------
# Plot Loss
# -----------------------------
plt.figure()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Hybrid Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# -----------------------------
# Load Best Saved Model
# -----------------------------
model = load_model("my_best_model.keras")

# -----------------------------
# Predict on Test Data
# -----------------------------
preds = model.predict(x_test)
Y_pred = (preds >= 0.5).astype(int).reshape(-1)

# -----------------------------
# Confusion Matrix
# -----------------------------
df_cm = confusion_matrix(y_test, Y_pred)

plt.figure(figsize=(7,5))
sn.heatmap(df_cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.show()

# -----------------------------
# Accuracy Score
# -----------------------------
print("Accuracy:", accuracy_score(y_test, Y_pred) * 100)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, Y_pred, zero_division=0))

***RNN TRAINING***

In [None]:
filepath = "my_best_model_RNN.keras"   # modern format

# Callback to save the RNN model achieving the best validation accuracy
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor="val_accuracy",
    verbose=1,
    save_best_only=True,
    mode="max"
)

callbacks = [checkpoint]

##########################################################################
# TODO: Train the RNN model (model_RNN) using the appropriate 
# hyperparameters based on the paper.
#
# Use:
#   - (x_train, y_train) as the training set
#   - (x_test, y_test) as the validation set
#   - the callbacks defined above
#
# Store the training output in a variable named **history_RNN**.
#
# NOTE: You must determine all hyperparameters (epochs, batch size, 
# optimizer config, etc.) from the referenced paper.
##########################################################################
# Replace the "pass" statement with your code
pass
##########################################################################
#                           END OF YOUR CODE                             #
##########################################################################


In [None]:
# ============================
#   RNN MODEL EVALUATION
# ============================

# ---- Plot Accuracy ----
plt.plot(history_RNN.history['accuracy'])
plt.plot(history_RNN.history['val_accuracy'])
plt.title('RNN Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# ---- Plot Loss ----
plt.figure()
plt.plot(history_RNN.history['loss'])
plt.plot(history_RNN.history['val_loss'])
plt.title('RNN Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# ---- Load Best Saved RNN Model ----
model = load_model('my_best_model_RNN.keras')

# ---- Predict ----
preds = model.predict(x_test)

Y_pred = (preds >= 0.5).astype(int).reshape(-1)

# ---- Confusion Matrix ----
df_cm = confusion_matrix(y_test, Y_pred)

plt.figure(figsize=(7,5))
sn.heatmap(df_cm, annot=True, fmt='d', cmap='Blues')
plt.title('RNN Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

print('Accuracy:', accuracy_score(y_test, Y_pred) * 100)


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, Y_pred, zero_division=0))