# MODEL

## import libraries

In [156]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud # for wordcloud
import matplotlib.pyplot as plt # for wordcloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

## data processing

In [158]:
test_data = pd.read_csv('6_data_country_and_label.csv')
# import a more balance train data
train_data = pd.read_csv('updated_more_balanced_data.csv') # find balance data from the first 600 data

test_x = test_data["summary"]
test_y = test_data["theme"]

train_x = train_data["summary"]
train_y = train_data["theme"]

categories = ["corporate and business topics", 
              "labor and employment issues", 
              "privacy, security, and cyber matters", 
              "legal and crime stories", 
              "government actions and regulations", 
              "technology and digital trends", 
              "environment and climate topics", 
              "social issues and activism", 
              "healthcare and medicine", 
              "community and cultural events", 
              "international relations and trade", 
              "education and learning", 
              "consumer topics", 
              "infrastructure and development", 
              "energy and resources", 
              "political topics and protests", 
              "media and communication", 
              "financial policies and taxation", 
              "human rights and social justice", 
              "science, research, and innovation", 
              "disaster and crisis management", 
              "organized crime and trafficking", 
              "sports, entertainment, and leisure", 
              "other", 
              "military"]

y_encoded = []
for each_theme in train_y:
    each_row = []
    for category in categories:
        if category in each_theme.lower():
            each_row.append(1)
        else:
            each_row.append(0)
    y_encoded.append(each_row)

# convert to dataframe
y_encoded = pd.DataFrame(y_encoded, columns = categories)
train_y = y_encoded

y_encoded = []
for each_theme in test_y:
    each_row = []
    for category in categories:
        if category in each_theme.lower():
            each_row.append(1)
        else:
            each_row.append(0)
    y_encoded.append(each_row)

y_encoded = pd.DataFrame(y_encoded, columns = categories)
test_y = y_encoded   

In [35]:
rows = np.where(np.sum(y_encoded, axis = 1) > 1)[0]
# print(rows)

theme_counts = y_encoded.to_numpy().argmax(axis = 1)
print(theme_counts)
unique_classes, counts = np.unique(theme_counts, 
                                   return_counts = True)
print(counts)
print(unique_classes)
print(len(unique_classes))


[ 1 17  2 ...  3  4 17]
[136  26  28 157  83 101  34  57 151 109 166  43  31  15  13  56  25  33
  16  49  16   7  85  36  35]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
25


## TFIDF (ngram range test accuracy to tune the preprocessing), countvectorizer (ngram range test accuracy to tune the preprocessing), word2vec, bert (not necessary)

### TDIDF

In [159]:
# word pattern
pattern = r"[a-zA-Z]+"

vectorizer = TfidfVectorizer(
    token_pattern = pattern, 
    stop_words = 'english', 
    ngram_range = (1, 2), # need to do the hyperparameter tuning for this later
    max_features = 100000 # can adjust to larger num too
)

vectorizer.fit(test_x)


# tokenization - create matrix
train_tokenized_features = vectorizer.transform(train_data["summary"])
test_tokenized_features = vectorizer.transform(test_data["summary"])


# create a dataframe
train_features = pd.DataFrame(
    data = train_tokenized_features.toarray(), 
    columns = vectorizer.get_feature_names_out()
)
test_features = pd.DataFrame(
    data = test_tokenized_features.toarray(), 
    columns = vectorizer.get_feature_names_out()
)


# print(len(vectorizer.get_feature_names_out()))

train_data["char_count"] = train_data["summary"].str.count(r"\S")
train_data["word_count"] = train_data["summary"].str.count(pattern)
train_data["avg_word_length"] = train_data["char_count"] / train_data["word_count"]

train_x = pd.concat([train_features, train_data.loc[:, "char_count": ]], axis = 1)
# data_y = data["theme"]

test_data["char_count"] = test_data["summary"].str.count(r"\S")
test_data["word_count"] = test_data["summary"].str.count(pattern)
test_data["avg_word_length"] = test_data["char_count"] / test_data["word_count"]

test_x = pd.concat([test_features, test_data.loc[:, "char_count": ]], axis = 1)


### Word2Vec - word embeddings

In [93]:
# tokenize all the sentences
tokenized_sentences = [word_tokenize(each_line[0].lower()) for each_line in train_data["summary"]]

# train word2vec model
word2vec_model = Word2Vec(
    sentences = tokenized_sentences, 
    vector_size = 1000, 
    window = 5, 
    min_count = 1, 
    workers = 4
)

# generate document vectors
def vectorize_doc(each_line):
    # remove out of vocab words
    words = [word for word in each_line if word in word2vec_model.wv]
    return np.mean(word2vec_model.wv[words], axis = 0) if words else np.zeros(word2vec_model.vector_size)

# create feature vectors 
train_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in train_data["summary"]])
test_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in test_data["summary"]])
# train_y = np.array([each_line.lower() for each_line in train_data["theme"]])

## logistic regression (softmax regression), support vector machine, bert, decision trees, random forest classifier, gradient boosting algorithm, KNN, neural networks (not necessary for small datasets), naive bayes classifier

### train test split

In [29]:
train_x, test_x, train_y, test_y = train_test_split(
    data_x, 
    y_encoded, 
    test_size = 0.6, 
    stratify = None, 
    random_state = 59
)

In [56]:
from sklearn.model_selection import StratifiedKFold

s = StratifiedKFold(n_splits = 2, 
                    shuffle = True, 
                    random_state = 59)

for train_i, test_i in s.split(data_x, y_encoded.to_numpy().argmax(axis = 1)):
    train_x, test_x = data_x.iloc[train_i], data_x.iloc[test_i]
    train_y, test_y = y_encoded.iloc[train_i], y_encoded.iloc[test_i]


print(len(train_x))
print(len(train_y))



754
754


### logistic regression one-vs-all OvA (softmax regression)

In [162]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

model = LogisticRegression(
    max_iter = 200, 
    random_state = 59
)
model = MultiOutputClassifier(model)

model.fit(train_x, train_y)
model.predict(test_x)

pred = model.predict(test_x)

# print("Accuracy: ", accuracy_score(test_y, pred))
# print("Hamming Loss: ", hamming_loss(test_y, pred))

In [163]:
from sklearn.multiclass import OneVsRestClassifier

model = OneVsRestClassifier(LogisticRegression())
model.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [164]:
prediction = model.predict(test_x)
pred = (prediction > 0.001).astype(int)
pred_theme = [[categories[i] for i in range(len(categories)) if pred[j, i] == 1] for j in range(len(pred))]

# print(pred_theme)

# print("F1 Score: ", f1_score(test_y, pred, average = "micro"))
# print("Accuracy: ", accuracy_score(test_y, pred))
# print("Hamming Loss: ", hamming_loss(test_y, pred))

### sigmoid activation function

In [166]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam

#### train model

In [167]:
vocab_size = 100000
num_of_categories = len(categories)

model = Sequential([
    Embedding(input_dim = vocab_size, 
              output_dim = num_of_categories, 
              input_length = 1000
              ),
    GlobalAveragePooling1D(), 
    Dense(64, activation = "relu"), # hidden layer
    Dense(num_of_categories, activation = "sigmoid")
])

model.compile(optimizer = Adam(), 
              loss = "binary_crossentropy", 
              metrics = ["accuracy"])

model.summary()



#### fit the model

In [168]:
model.fit(train_x, 
          train_y, 
          epochs = 10, 
          batch_size = 60, 
          validation_split = 0.1)

Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 572ms/step - accuracy: 0.0285 - loss: 0.6727 - val_accuracy: 0.0000e+00 - val_loss: 0.5061
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 147ms/step - accuracy: 0.0382 - loss: 0.5088 - val_accuracy: 0.0000e+00 - val_loss: 0.3151
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 131ms/step - accuracy: 0.0194 - loss: 0.3505 - val_accuracy: 0.2941 - val_loss: 0.2488
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step - accuracy: 0.0000e+00 - loss: 0.2873 - val_accuracy: 0.2941 - val_loss: 0.2283
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - accuracy: 0.0347 - loss: 0.2533 - val_accuracy: 0.0000e+00 - val_loss: 0.2426
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 146ms/step - accuracy: 0.0785 - loss: 0.2397 - val_accuracy: 0.0000e+00 - val_loss: 0.2643
Epoch 7/10
[1m3/3[

<keras.src.callbacks.history.History at 0x19a822d80b0>

#### predict using the model (test_x)

In [169]:
print(test_x)

      aadmi  aadmi party  aal  aal subsidiaries  aal thirdparty  aam  \
0       0.0          0.0  0.0               0.0             0.0  0.0   
1       0.0          0.0  0.0               0.0             0.0  0.0   
2       0.0          0.0  0.0               0.0             0.0  0.0   
3       0.0          0.0  0.0               0.0             0.0  0.0   
4       0.0          0.0  0.0               0.0             0.0  0.0   
...     ...          ...  ...               ...             ...  ...   
1503    0.0          0.0  0.0               0.0             0.0  0.0   
1504    0.0          0.0  0.0               0.0             0.0  0.0   
1505    0.0          0.0  0.0               0.0             0.0  0.0   
1506    0.0          0.0  0.0               0.0             0.0  0.0   
1507    0.0          0.0  0.0               0.0             0.0  0.0   

      aam aadmi  aaninin  aaninin people  aap  ...  zulema green  zulkifli  \
0           0.0      0.0             0.0  0.0  ...       

In [170]:
prediction = model.predict(test_x)
print(prediction)
pred = (prediction > 0.3).astype(int)
pred_theme = [[categories[i] for i in range(len(categories)) if pred[j, i] == 1] for j in range(len(pred))]

print(pred_theme)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step
[[0.01861435 0.01286881 0.00398956 ... 0.00291461 0.02320384 0.15563579]
 [0.01860986 0.01287851 0.00398777 ... 0.00291485 0.02320582 0.15551554]
 [0.0186118  0.01287542 0.00398934 ... 0.00291499 0.02320975 0.15556276]
 ...
 [0.01860987 0.01287854 0.00398776 ... 0.00291485 0.02320584 0.15551534]
 [0.01861181 0.0128754  0.00398934 ... 0.00291499 0.02320974 0.15556253]
 [0.01864244 0.01288586 0.00399153 ... 0.00291522 0.02319274 0.1556392 ]]
[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [

### Random forest classifier

In [172]:
# binary relevance
# quite simple to understand, but it will ignore dependencies between labels 

from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(n_estimators = 100, 
                               random_state = 59)
model = MultiOutputClassifier(model)

model.fit(train_x, train_y)

pred = model.predict(test_x)

print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))

Accuracy:  0.10809018567639257
Hamming Loss:  0.03620689655172414


In [173]:
# label powerset
# it can capture label dependencies but if too many label combinations
# it will lead to poor performance

from skmultilearn.problem_transform import LabelPowerset

model = LabelPowerset(RandomForestClassifier(n_estimators = 100, 
                                             random_state = 59))

model.fit(train_x, train_y)
pred = model.predict(test_x)

print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))



Accuracy:  0.13660477453580902
Hamming Loss:  0.07458885941644562


In [174]:
prediction = model.predict(test_x)

pred = prediction.toarray()
# print(pred)
decoded = []
for each_row in pred:
    pred_theme = [categories[i] for i, val in enumerate(each_row) if val == 1]
    decoded.append(pred_theme)

# print(decoded)

for i in decoded:
    if len(i) > 1:
        print(i)



['legal and crime stories', 'military']
['legal and crime stories', 'military']
['legal and crime stories', 'military']
['legal and crime stories', 'military']
['legal and crime stories', 'military']
['legal and crime stories', 'military']
['legal and crime stories', 'military']
['government actions and regulations', 'international relations and trade']
['legal and crime stories', 'military']
['government actions and regulations', 'international relations and trade']
['government actions and regulations', 'international relations and trade']
['government actions and regulations', 'international relations and trade']
['legal and crime stories', 'military']
['government actions and regulations', 'international relations and trade']
['government actions and regulations', 'international relations and trade']
['government actions and regulations', 'international relations and trade']
['legal and crime stories', 'military']
['corporate and business topics', 'government actions and regulation

### Support Vector Machine

In [None]:
# one vs rest

from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier

model = SVC(kernel = 'rbf', # can change to 'rbf'
          probability = True)
model = MultiOutputClassifier(model)

model.fit(train_x, train_y)

pred = model.predict(test_x)
print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))

In [121]:
# label powerset

model = SVC(kernel = 'rbf', # can change to 'rbf'
            )
model = LabelPowerset(model)

model.fit(train_x, train_y)

pred = model.predict(test_x)
print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))

Accuracy:  0.011273209549071617
Hamming Loss:  0.10774535809018568


### Evaluation of model

In [177]:
from sklearn.metrics import classification_report
print(classification_report(
    test_y, 
    pred, 
    target_names = categories
))

# Hamming Loss (the lower the better)
from sklearn.metrics import hamming_loss
hl = hamming_loss(test_y, pred)
print("Hamming Loss: ", hl)

# F1 score (the higher the better)
from sklearn.metrics import f1_score
f1 = f1_score(
    test_y, 
    pred, 
    average = "macro"
)
print("F1 Score: ", f1)

# Jaccard Score (the higher the better)
from sklearn.metrics import jaccard_score 
jaccard = jaccard_score(
    test_y, 
    pred, 
    average = "samples"
)
print("Jaccard Score: ", jaccard)

print("Accuracy: ", accuracy_score(test_y, pred))

                                      precision    recall  f1-score   support

       corporate and business topics       0.44      0.13      0.20       134
         labor and employment issues       0.47      0.31      0.37        26
privacy, security, and cyber matters       0.62      0.29      0.39        28
             legal and crime stories       0.19      0.41      0.26       157
  government actions and regulations       0.21      0.31      0.25        86
       technology and digital trends       0.26      0.10      0.15       105
      environment and climate topics       0.41      0.20      0.27        35
          social issues and activism       0.33      0.10      0.15        60
             healthcare and medicine       0.29      0.07      0.11       151
       community and cultural events       0.62      0.05      0.09       109
   international relations and trade       0.52      0.36      0.42       176
              education and learning       0.28      0.16      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
