# MODEL

## import libraries

In [150]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud # for wordcloud
import matplotlib.pyplot as plt # for wordcloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

## data processing

In [151]:
test_data = pd.read_csv('6_data_country_and_label.csv')
# import a more balance train data
train_data = pd.read_csv('updated_more_balance_data.csv') # find balance data from the first 600 data

test_x = test_data["summary"]
test_y = test_data["theme"]

train_x = train_data["summary"]
train_y = train_data["theme"]

categories = ["corporate and business topics", 
              "labor and employment issues", 
              "privacy, security, and cyber matters", 
              "legal and crime stories", 
              "government actions and regulations", 
              "technology and digital trends", 
              "environment and climate topics", 
              "social issues and activism", 
              "healthcare and medicine", 
              "community and cultural events", 
              "international relations and trade", 
              "education and learning", 
              "consumer topics", 
              "infrastructure and development", 
              "energy and resources", 
              "political topics and protests", 
              "media and communication", 
              "financial policies and taxation", 
              "human rights and social justice", 
              "science, research, and innovation", 
              "disaster and crisis management", 
              "organized crime and trafficking", 
              "sports, entertainment, and leisure", 
              "other", 
              "military"]

y_encoded = []
for each_theme in train_y:
    each_row = []
    for category in categories:
        if category in each_theme.lower():
            each_row.append(1)
        else:
            each_row.append(0)
    y_encoded.append(each_row)

# convert to dataframe
y_encoded = pd.DataFrame(y_encoded, columns = categories)
train_y = y_encoded

y_encoded = []
for each_theme in test_y:
    each_row = []
    for category in categories:
        if category in each_theme.lower():
            each_row.append(1)
        else:
            each_row.append(0)
    y_encoded.append(each_row)

y_encoded = pd.DataFrame(y_encoded, columns = categories)
test_y = y_encoded   

FileNotFoundError: [Errno 2] No such file or directory: 'updated_more_balance_data.csv'

In [35]:
rows = np.where(np.sum(y_encoded, axis = 1) > 1)[0]
# print(rows)

theme_counts = y_encoded.to_numpy().argmax(axis = 1)
print(theme_counts)
unique_classes, counts = np.unique(theme_counts, 
                                   return_counts = True)
print(counts)
print(unique_classes)
print(len(unique_classes))


[ 1 17  2 ...  3  4 17]
[136  26  28 157  83 101  34  57 151 109 166  43  31  15  13  56  25  33
  16  49  16   7  85  36  35]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
25


## TFIDF (ngram range test accuracy to tune the preprocessing), countvectorizer (ngram range test accuracy to tune the preprocessing), word2vec, bert (not necessary)

### TDIDF

In [71]:
# word pattern
pattern = r"[a-zA-Z]+"

vectorizer = TfidfVectorizer(
    token_pattern = pattern, 
    stop_words = 'english', 
    ngram_range = (1, 2), # need to do the hyperparameter tuning for this later
    max_features = 100000 # can adjust to larger num too
)

vectorizer.fit(test_x)


# tokenization - create matrix
train_tokenized_features = vectorizer.transform(train_data["summary"])
test_tokenized_features = vectorizer.transform(test_data["summary"])


# create a dataframe
train_features = pd.DataFrame(
    data = train_tokenized_features.toarray(), 
    columns = vectorizer.get_feature_names_out()
)
test_features = pd.DataFrame(
    data = test_tokenized_features.toarray(), 
    columns = vectorizer.get_feature_names_out()
)


# print(len(vectorizer.get_feature_names_out()))

train_data["char_count"] = train_data["summary"].str.count(r"\S")
train_data["word_count"] = train_data["summary"].str.count(pattern)
train_data["avg_word_length"] = train_data["char_count"] / train_data["word_count"]

train_x = pd.concat([train_features, train_data.loc[:, "char_count": ]], axis = 1)
# data_y = data["theme"]

test_data["char_count"] = test_data["summary"].str.count(r"\S")
test_data["word_count"] = test_data["summary"].str.count(pattern)
test_data["avg_word_length"] = test_data["char_count"] / test_data["word_count"]

test_x = pd.concat([test_features, test_data.loc[:, "char_count": ]], axis = 1)


AttributeError: 'numpy.ndarray' object has no attribute 'lower'

### Word2Vec - word embeddings

In [93]:
# tokenize all the sentences
tokenized_sentences = [word_tokenize(each_line[0].lower()) for each_line in train_data["summary"]]

# train word2vec model
word2vec_model = Word2Vec(
    sentences = tokenized_sentences, 
    vector_size = 1000, 
    window = 5, 
    min_count = 1, 
    workers = 4
)

# generate document vectors
def vectorize_doc(each_line):
    # remove out of vocab words
    words = [word for word in each_line if word in word2vec_model.wv]
    return np.mean(word2vec_model.wv[words], axis = 0) if words else np.zeros(word2vec_model.vector_size)

# create feature vectors 
train_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in train_data["summary"]])
test_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in test_data["summary"]])
# train_y = np.array([each_line.lower() for each_line in train_data["theme"]])

## logistic regression (softmax regression), support vector machine, bert, decision trees, random forest classifier, gradient boosting algorithm, KNN, neural networks (not necessary for small datasets), naive bayes classifier

### train test split

In [29]:
train_x, test_x, train_y, test_y = train_test_split(
    data_x, 
    y_encoded, 
    test_size = 0.6, 
    stratify = None, 
    random_state = 59
)

In [56]:
from sklearn.model_selection import StratifiedKFold

s = StratifiedKFold(n_splits = 2, 
                    shuffle = True, 
                    random_state = 59)

for train_i, test_i in s.split(data_x, y_encoded.to_numpy().argmax(axis = 1)):
    train_x, test_x = data_x.iloc[train_i], data_x.iloc[test_i]
    train_y, test_y = y_encoded.iloc[train_i], y_encoded.iloc[test_i]


print(len(train_x))
print(len(train_y))



754
754


### logistic regression one-vs-all OvA (softmax regression)

In [94]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

model = LogisticRegression(
    max_iter = 200, 
    random_state = 59
)
model = MultiOutputClassifier(model)

model.fit(train_x, train_y)
model.predict(test_x)

pred = model.predict(test_x)

# print("Accuracy: ", accuracy_score(test_y, pred))
# print("Hamming Loss: ", hamming_loss(test_y, pred))

In [95]:
from sklearn.multiclass import OneVsRestClassifier

model = OneVsRestClassifier(LogisticRegression())
model.fit(train_x, train_y)

In [96]:
prediction = model.predict(test_x)
pred = (prediction > 0.001).astype(int)
pred_theme = [[categories[i] for i in range(len(categories)) if pred[j, i] == 1] for j in range(len(pred))]

# print(pred_theme)

# print("F1 Score: ", f1_score(test_y, pred, average = "micro"))
# print("Accuracy: ", accuracy_score(test_y, pred))
# print("Hamming Loss: ", hamming_loss(test_y, pred))

### sigmoid activation function

In [129]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam

#### train model

In [130]:
vocab_size = 100000
num_of_categories = len(categories)

model = Sequential([
    Embedding(input_dim = vocab_size, 
              output_dim = num_of_categories, 
              input_length = 1000
              ),
    GlobalAveragePooling1D(), 
    Dense(64, activation = "relu"), # hidden layer
    Dense(num_of_categories, activation = "sigmoid")
])

model.compile(optimizer = Adam(), 
              loss = "binary_crossentropy", 
              metrics = ["accuracy"])

model.summary()



#### fit the model

In [131]:
model.fit(train_x, 
          train_y, 
          epochs = 10, 
          batch_size = 60, 
          validation_split = 0.1)

Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 134ms/step - accuracy: 0.0254 - loss: 0.6889 - val_accuracy: 0.0000e+00 - val_loss: 0.6456
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.0199 - loss: 0.6328 - val_accuracy: 0.0000e+00 - val_loss: 0.5888
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.0254 - loss: 0.5710 - val_accuracy: 0.0000e+00 - val_loss: 0.5371
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.0254 - loss: 0.5121 - val_accuracy: 0.0000e+00 - val_loss: 0.4889
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.1249 - loss: 0.4589 - val_accuracy: 0.0000e+00 - val_loss: 0.4485
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1305 - loss: 0.4124 - val_accuracy: 0.0000e+00 - val_loss: 0.4150
Epoch 7/10
[1m2/2[0

<keras.src.callbacks.history.History at 0x19a86bfdee0>

#### predict using the model (test_x)

In [132]:
print(test_x)

[[ 2.57017731e-04  8.46529001e-05 -2.53944047e-04 ... -7.62878160e-04
  -5.78860054e-04  1.23417733e-04]
 [ 7.08879728e-04 -1.56793001e-04  7.94749882e-04 ...  7.31326814e-04
   5.46556839e-04  9.24946216e-04]
 [ 2.57017731e-04  8.46529001e-05 -2.53944047e-04 ... -7.62878160e-04
  -5.78860054e-04  1.23417733e-04]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 2.57017731e-04  8.46529001e-05 -2.53944047e-04 ... -7.62878160e-04
  -5.78860054e-04  1.23417733e-04]]


In [136]:
prediction = model.predict(test_x)
print(prediction)
pred = (prediction > 0.3).astype(int)
pred_theme = [[categories[i] for i in range(len(categories)) if pred[j, i] == 1] for j in range(len(pred))]

print(pred_theme)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[[0.04300688 0.06980865 0.01327056 ... 0.12686446 0.1883062  0.09959643]
 [0.04300694 0.06980877 0.01327049 ... 0.12686437 0.1883054  0.09959605]
 [0.04300687 0.06980867 0.01327047 ... 0.12686428 0.18830527 0.09959591]
 ...
 [0.04300694 0.06980877 0.01327049 ... 0.12686437 0.1883054  0.09959605]
 [0.04300687 0.06980867 0.01327047 ... 0.12686428 0.18830527 0.09959591]
 [0.04300679 0.06980862 0.01327048 ... 0.12686422 0.18830532 0.09959583]]
[['legal and crime stories', 'infrastructure and development'], ['legal and crime stories', 'infrastructure and development'], ['legal and crime stories', 'infrastructure and development'], ['legal and crime stories', 'infrastructure and development'], ['legal and crime stories', 'infrastructure and development'], ['legal and crime stories', 'infrastructure and development'], ['legal and crime stories', 'infrastructure and development'], ['legal and crime stories', 'infrastructu

### Random forest classifier

In [124]:
# binary relevance
# quite simple to understand, but it will ignore dependencies between labels 

from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(n_estimators = 100, 
                               random_state = 59)
model = MultiOutputClassifier(model)

model.fit(train_x, train_y)

pred = model.predict(test_x)

print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))

Accuracy:  0.008620689655172414
Hamming Loss:  0.04286472148541114


In [145]:
# label powerset
# it can capture label dependencies but if too many label combinations
# it will lead to poor performance

from skmultilearn.problem_transform import LabelPowerset

model = LabelPowerset(RandomForestClassifier(n_estimators = 100, 
                                             random_state = 59))

model.fit(train_x, train_y)
pred = model.predict(test_x)

print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))

Accuracy:  0.01989389920424403
Hamming Loss:  0.10517241379310345


In [149]:
prediction = model.predict(test_x)
print(prediction)
pred = (prediction > 0.3).astype(int)
pred_theme = [[categories[i] for i in range(len(categories)) if pred[j, i] == 1] for j in range(len(pred))]

print(pred_theme)

  (0, 10)	1
  (0, 24)	1
  (1, 3)	1
  (2, 10)	1
  (2, 24)	1
  (3, 3)	1
  (4, 4)	1
  (4, 10)	1
  (5, 10)	1
  (5, 24)	1
  (6, 10)	1
  (6, 24)	1
  (7, 21)	1
  (8, 10)	1
  (8, 24)	1
  (9, 10)	1
  (9, 24)	1
  (10, 10)	1
  (10, 24)	1
  (11, 10)	1
  (11, 24)	1
  (12, 10)	1
  (12, 24)	1
  (13, 10)	1
  (13, 24)	1
  (14, 3)	1
  (15, 10)	1
  (15, 24)	1
  (16, 10)	1
  (16, 24)	1
  (17, 10)	1
  (17, 24)	1
  (18, 1)	1
  (19, 10)	1
  (19, 24)	1
  (20, 10)	1
  (20, 24)	1
  (21, 10)	1
  (21, 24)	1
  (22, 10)	1
  (22, 24)	1
  (23, 10)	1
  (23, 24)	1
  (24, 10)	1
  (24, 24)	1
  (25, 10)	1
  (25, 24)	1
  (26, 4)	1
  (26, 10)	1
  (27, 12)	1
  (28, 10)	1
  (28, 24)	1
  (29, 0)	1
  (29, 5)	1
  (30, 10)	1
  (30, 24)	1
  (31, 10)	1
  (31, 24)	1
  (32, 10)	1
  (32, 24)	1
  (33, 10)	1
  (33, 24)	1
  (34, 10)	1
  (34, 24)	1
  (35, 10)	1
  (35, 24)	1
  (36, 10)	1
  (36, 24)	1
  (37, 10)	1
  (37, 24)	1
  (38, 5)	1
  (39, 10)	1
  (39, 24)	1
  (40, 1)	1
  (41, 5)	1
  (42, 4)	1
  (42, 10)	1
  (43, 10)	1
  (43, 24)	1
  

### Support Vector Machine

In [120]:
# one vs rest

from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier

model = SVC(kernel = 'rbf', # can change to 'rbf'
          probability = True)
model = MultiOutputClassifier(model)

model.fit(train_x, train_y)

pred = model.predict(test_x)
print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))

Accuracy:  0.001326259946949602
Hamming Loss:  0.04151193633952255


In [121]:
# label powerset

model = SVC(kernel = 'rbf', # can change to 'rbf'
            )
model = LabelPowerset(model)

model.fit(train_x, train_y)

pred = model.predict(test_x)
print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))

Accuracy:  0.011273209549071617
Hamming Loss:  0.10774535809018568


### Evaluation of model

In [137]:
from sklearn.metrics import classification_report
print(classification_report(
    test_y, 
    pred, 
    target_names = categories
))

# Hamming Loss (the lower the better)
from sklearn.metrics import hamming_loss
hl = hamming_loss(test_y, pred)
print("Hamming Loss: ", hl)

# F1 score (the higher the better)
from sklearn.metrics import f1_score
f1 = f1_score(
    test_y, 
    pred, 
    average = "macro"
)
print("F1 Score: ", f1)

# Jaccard Score (the higher the better)
from sklearn.metrics import jaccard_score 
jaccard = jaccard_score(
    test_y, 
    pred, 
    average = "samples"
)
print("Jaccard Score: ", jaccard)

print("Accuracy: ", accuracy_score(test_y, pred))

                                      precision    recall  f1-score   support

       corporate and business topics       0.00      0.00      0.00       134
         labor and employment issues       0.00      0.00      0.00        26
privacy, security, and cyber matters       0.00      0.00      0.00        28
             legal and crime stories       0.10      1.00      0.19       157
  government actions and regulations       0.00      0.00      0.00        86
       technology and digital trends       0.00      0.00      0.00       105
      environment and climate topics       0.00      0.00      0.00        35
          social issues and activism       0.00      0.00      0.00        60
             healthcare and medicine       0.00      0.00      0.00       151
       community and cultural events       0.00      0.00      0.00       109
   international relations and trade       0.00      0.00      0.00       176
              education and learning       0.00      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### check for mutli label examples