# MODEL

## import libraries

In [1]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud # for wordcloud
import matplotlib.pyplot as plt # for wordcloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

## data processing

In [29]:
# import a more balance train data
train_data = pd.read_csv('updated_more_balanced_data.csv') # obtain balance data from the first 600 data manually
test_data = pd.read_csv('first_600_manually_add_theme.csv')

test_x = test_data["summary"]
test_y = test_data["theme"]

train_x = train_data["summary"]
train_y = train_data["theme"]

categories = ["corporate and business topics", 
              "labor and employment issues", 
              "privacy, security, and cyber matters", 
              "legal and crime stories", 
              "government actions and regulations", 
              "technology and digital trends", 
              "environment and climate topics", 
              "social issues and activism", 
              "healthcare and medicine", 
              "community and cultural events", 
              "international relations and trade", 
              "education and learning", 
              "consumer topics", 
              "infrastructure and development", 
              "energy and resources", 
              "political topics and protests", 
              "media and communication", 
              "financial policies and taxation", 
              "human rights and social justice", 
              "science, research, and innovation", 
              "disaster and crisis management", 
              "organized crime and trafficking", 
              "sports, entertainment, and leisure", 
              "other", 
              "military"]

y_encoded = []
for each_theme in train_y:
    each_row = []
    for category in categories:
        if category in each_theme.lower():
            each_row.append(1)
        else:
            each_row.append(0)
    y_encoded.append(each_row)

# convert to dataframe
y_encoded = pd.DataFrame(y_encoded, columns = categories)
train_y = y_encoded

y_encoded = []
for each_theme in test_y:
    each_row = []
    for category in categories:
        if category in each_theme.lower():
            each_row.append(1)
        else:
            each_row.append(0)
    y_encoded.append(each_row)

y_encoded = pd.DataFrame(y_encoded, columns = categories)
test_y = y_encoded   

## TFIDF (ngram range test accuracy to tune the preprocessing), countvectorizer (ngram range test accuracy to tune the preprocessing), word2vec, bert (not necessary)

### TDIDF

In [159]:
# word pattern
pattern = r"[a-zA-Z]+"

vectorizer = TfidfVectorizer(
    token_pattern = pattern, 
    stop_words = 'english', 
    ngram_range = (1, 2), # need to do the hyperparameter tuning for this later
    max_features = 100000 # can adjust to larger num too
)

vectorizer.fit(test_x)


# tokenization - create matrix
train_tokenized_features = vectorizer.transform(train_data["summary"])
test_tokenized_features = vectorizer.transform(test_data["summary"])


# create a dataframe
train_features = pd.DataFrame(
    data = train_tokenized_features.toarray(), 
    columns = vectorizer.get_feature_names_out()
)
test_features = pd.DataFrame(
    data = test_tokenized_features.toarray(), 
    columns = vectorizer.get_feature_names_out()
)


# print(len(vectorizer.get_feature_names_out()))

train_data["char_count"] = train_data["summary"].str.count(r"\S")
train_data["word_count"] = train_data["summary"].str.count(pattern)
train_data["avg_word_length"] = train_data["char_count"] / train_data["word_count"]

train_x = pd.concat([train_features, train_data.loc[:, "char_count": ]], axis = 1)
# data_y = data["theme"]

test_data["char_count"] = test_data["summary"].str.count(r"\S")
test_data["word_count"] = test_data["summary"].str.count(pattern)
test_data["avg_word_length"] = test_data["char_count"] / test_data["word_count"]

test_x = pd.concat([test_features, test_data.loc[:, "char_count": ]], axis = 1)


### Word2Vec - word embeddings (final)

In [30]:
# tokenize all the sentences
tokenized_sentences = [word_tokenize(each_line[0].lower()) for each_line in train_data["summary"]]

# train word2vec model
word2vec_model = Word2Vec(
    sentences = tokenized_sentences, 
    vector_size = 1000, 
    window = 5, 
    min_count = 1, 
    workers = 4
)

# generate document vectors
def vectorize_doc(each_line):
    # remove out of vocab words
    words = [word for word in each_line if word in word2vec_model.wv]
    return np.mean(word2vec_model.wv[words], axis = 0) if words else np.zeros(word2vec_model.vector_size)

# create feature vectors 
train_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in train_data["summary"]])
test_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in test_data["summary"]])
# train_y = np.array([each_line.lower() for each_line in train_data["theme"]])

## logistic regression (softmax regression), support vector machine, bert, decision trees, random forest classifier, gradient boosting algorithm, KNN, neural networks (not necessary for small datasets), naive bayes classifier

### train test split

In [29]:
train_x, test_x, train_y, test_y = train_test_split(
    data_x, 
    y_encoded, 
    test_size = 0.6, 
    stratify = None, 
    random_state = 59
)

In [56]:
from sklearn.model_selection import StratifiedKFold

s = StratifiedKFold(n_splits = 2, 
                    shuffle = True, 
                    random_state = 59)

for train_i, test_i in s.split(data_x, y_encoded.to_numpy().argmax(axis = 1)):
    train_x, test_x = data_x.iloc[train_i], data_x.iloc[test_i]
    train_y, test_y = y_encoded.iloc[train_i], y_encoded.iloc[test_i]


print(len(train_x))
print(len(train_y))



754
754


### logistic regression one-vs-all OvA (softmax regression)

In [6]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

model = LogisticRegression(
    max_iter = 200, 
    random_state = 59
)
model = MultiOutputClassifier(model)

model.fit(train_x, train_y)
model.predict(test_x)

pred = model.predict(test_x)

# print("Accuracy: ", accuracy_score(test_y, pred))
# print("Hamming Loss: ", hamming_loss(test_y, pred))

In [7]:
from sklearn.multiclass import OneVsRestClassifier

model = OneVsRestClassifier(LogisticRegression())
model.fit(train_x, train_y)

In [8]:
prediction = model.predict(test_x)
pred = (prediction > 0.001).astype(int)
pred_theme = [[categories[i] for i in range(len(categories)) if pred[j, i] == 1] for j in range(len(pred))]

# print(pred_theme)

# print("F1 Score: ", f1_score(test_y, pred, average = "micro"))
# print("Accuracy: ", accuracy_score(test_y, pred))
# print("Hamming Loss: ", hamming_loss(test_y, pred))

### sigmoid activation function

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam

#### train model

In [11]:
vocab_size = 100000
num_of_categories = len(categories)

model = Sequential([
    Embedding(input_dim = vocab_size, 
              output_dim = num_of_categories, 
              input_length = 1000
              ),
    GlobalAveragePooling1D(), 
    Dense(64, activation = "relu"), # hidden layer
    Dense(num_of_categories, activation = "sigmoid")
])

model.compile(optimizer = Adam(), 
              loss = "binary_crossentropy", 
              metrics = ["accuracy"])

model.summary()



#### fit the model

In [12]:
model.fit(train_x, 
          train_y, 
          epochs = 10, 
          batch_size = 60, 
          validation_split = 0.1)

Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 132ms/step - accuracy: 0.0250 - loss: 0.6887 - val_accuracy: 0.0000e+00 - val_loss: 0.6518
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.0188 - loss: 0.6320 - val_accuracy: 0.0000e+00 - val_loss: 0.6029
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.0250 - loss: 0.5751 - val_accuracy: 0.0000e+00 - val_loss: 0.5467
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.0167 - loss: 0.5154 - val_accuracy: 0.0000e+00 - val_loss: 0.4884
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.0188 - loss: 0.4569 - val_accuracy: 0.0000e+00 - val_loss: 0.4301
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.0208 - loss: 0.3966 - val_accuracy: 0.0000e+00 - val_loss: 0.3766
Epoch 7/10
[1m3/3[0

<keras.src.callbacks.history.History at 0x1bc2ebcc3b0>

#### predict using the model (test_x)

In [13]:
print(test_x)

[[ 2.57017731e-04  8.46529001e-05 -2.53944047e-04 ... -7.62878160e-04
  -5.78860054e-04  1.23417733e-04]
 [ 7.08879728e-04 -1.56793001e-04  7.94749882e-04 ...  7.31326814e-04
   5.46556839e-04  9.24946216e-04]
 [ 2.57017731e-04  8.46529001e-05 -2.53944047e-04 ... -7.62878160e-04
  -5.78860054e-04  1.23417733e-04]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 2.57017731e-04  8.46529001e-05 -2.53944047e-04 ... -7.62878160e-04
  -5.78860054e-04  1.23417733e-04]]


In [14]:
prediction = model.predict(test_x)
print(prediction)
pred = (prediction > 0.3).astype(int)
pred_theme = [[categories[i] for i in range(len(categories)) if pred[j, i] == 1] for j in range(len(pred))]

print(pred_theme)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[[0.09232039 0.06424018 0.02829528 ... 0.00405446 0.00702086 0.11146314]
 [0.09232047 0.06424074 0.02829556 ... 0.00405443 0.00702092 0.11146309]
 [0.09232031 0.06424062 0.0282955  ... 0.00405445 0.00702093 0.11146328]
 ...
 [0.09232047 0.06424074 0.02829556 ... 0.00405443 0.00702092 0.11146309]
 [0.09232031 0.06424062 0.0282955  ... 0.00405445 0.00702093 0.11146328]
 [0.09232031 0.06424057 0.02829551 ... 0.00405446 0.00702094 0.11146332]]
[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []

### Random forest classifier

In [35]:
# binary relevance
# quite simple to understand, but it will ignore dependencies between labels 

from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(n_estimators = 100, 
                               random_state = 59)
model = MultiOutputClassifier(model)

model.fit(train_x, train_y)

pred = model.predict(test_x)

print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))

Accuracy:  0.011666666666666667
Hamming Loss:  0.0438


In [37]:
# label powerset
# it can capture label dependencies but if too many label combinations
# it will lead to poor performance

from skmultilearn.problem_transform import LabelPowerset

model = LabelPowerset(RandomForestClassifier(n_estimators = 100, 
                                             random_state = 59))

model.fit(train_x, train_y)
pred = model.predict(test_x)

print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))

Accuracy:  0.03666666666666667
Hamming Loss:  0.0816


In [33]:
prediction = model.predict(test_x)

pred = prediction.toarray()
# print(pred)
decoded = []
for each_row in pred:
    pred_theme = [categories[i] for i, val in enumerate(each_row) if val == 1]
    decoded.append(pred_theme)

# print(decoded)

for i in decoded:
    if len(i) > 1:
        print(i)

['corporate and business topics', 'technology and digital trends']
['government actions and regulations', 'international relations and trade', 'infrastructure and development']
['government actions and regulations', 'international relations and trade', 'infrastructure and development']
['legal and crime stories', 'military']
['government actions and regulations', 'international relations and trade', 'infrastructure and development']
['corporate and business topics', 'technology and digital trends']
['government actions and regulations', 'international relations and trade', 'infrastructure and development']
['government actions and regulations', 'international relations and trade', 'infrastructure and development']
['government actions and regulations', 'international relations and trade', 'infrastructure and development']
['government actions and regulations', 'international relations and trade', 'infrastructure and development']
['government actions and regulations', 'international re

### Support Vector Machine

In [20]:
# one vs rest

from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier

model = SVC(kernel = 'rbf', # can change to 'rbf'
          probability = True)
model = MultiOutputClassifier(model)

model.fit(train_x, train_y)

pred = model.predict(test_x)
print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))

Accuracy:  0.001326259946949602
Hamming Loss:  0.04151193633952255


In [21]:
# label powerset

model = SVC(kernel = 'rbf', # can change to 'rbf'
            )
model = LabelPowerset(model)

model.fit(train_x, train_y)

pred = model.predict(test_x)
print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))

Accuracy:  0.013262599469496022
Hamming Loss:  0.09657824933687002


### Evaluation of model

In [36]:
from sklearn.metrics import classification_report
print(classification_report(
    test_y, 
    pred, 
    target_names = categories
))

# Hamming Loss (the lower the better)
from sklearn.metrics import hamming_loss
hl = hamming_loss(test_y, pred)
print("Hamming Loss: ", hl)

# F1 score (the higher the better)
from sklearn.metrics import f1_score
f1 = f1_score(
    test_y, 
    pred, 
    average = "macro"
)
print("F1 Score: ", f1)

# Jaccard Score (the higher the better)
from sklearn.metrics import jaccard_score 
jaccard = jaccard_score(
    test_y, 
    pred, 
    average = "samples"
)
print("Jaccard Score: ", jaccard)

print("Accuracy: ", accuracy_score(test_y, pred))

                                      precision    recall  f1-score   support

       corporate and business topics       1.00      0.02      0.04        51
         labor and employment issues       0.00      0.00      0.00        24
privacy, security, and cyber matters       0.00      0.00      0.00        16
             legal and crime stories       1.00      0.01      0.03        69
  government actions and regulations       0.00      0.00      0.00        52
       technology and digital trends       1.00      0.03      0.06        34
      environment and climate topics       0.00      0.00      0.00         7
          social issues and activism       0.00      0.00      0.00        27
             healthcare and medicine       0.00      0.00      0.00        58
       community and cultural events       0.00      0.00      0.00        22
   international relations and trade       0.00      0.00      0.00       100
              education and learning       0.00      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
