# MODEL

In [2]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud # for wordcloud
import matplotlib.pyplot as plt # for wordcloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
data = pd.read_csv('6_data_country_and_label.csv')

data_x = data["summary"]
data_y = data["theme"]

categories = ["corporate and business topics", 
              "labor and employment issues", 
              "privacy, security, and cyber matters", 
              "legal and crime stories", 
              "government actions and regulations", 
              "technology and digital trends", 
              "environment and climate topics", 
              "social issues and activism", 
              "healthcare and medicine", 
              "community and cultural events", 
              "international relations and trade", 
              "education and learning", 
              "consumer topics", 
              "infrastructure and development", 
              "energy and resources", 
              "political topics and protests", 
              "media and communication", 
              "financial policies and taxation", 
              "human rights and social justice", 
              "science, research, and innovation", 
              "disaster and crisis management", 
              "organized crime and trafficking", 
              "sports, entertainment, and leisure", 
              "other", 
              "military"]

y_encoded = []

for each_theme in data_y:
    each_row = []
    for category in categories:
        if category in each_theme:
            each_row.append(1)
        else:
            each_row.append(0)
    y_encoded.append(each_row)

# convert to dataframe
y_encoded = pd.DataFrame(y_encoded, columns = categories)

print(y_encoded)

      corporate and business topics  labor and employment issues  \
0                                 0                            0   
1                                 0                            0   
2                                 0                            0   
3                                 0                            0   
4                                 0                            0   
...                             ...                          ...   
1503                              0                            0   
1504                              0                            0   
1505                              0                            0   
1506                              0                            0   
1507                              0                            0   

      privacy, security, and cyber matters  legal and crime stories  \
0                                        0                        0   
1                                        

## TFIDF (ngram range test accuracy to tune the preprocessing), countvectorizer (ngram range test accuracy to tune the preprocessing), word2vec, bert (not necessary)

### TDIDF

In [18]:
# word pattern
pattern = r"[a-zA-Z]+"

vectorizer = TfidfVectorizer(
    token_pattern = pattern, 
    stop_words = 'english', 
    ngram_range = (1, 2), # need to do the hyperparameter tuning for this later
    max_features = 100000 # can adjust to larger num too
)

vectorizer.fit(data["summary"])

# tokenization - create matrix
tokenized_features = vectorizer.transform(data["summary"])

# create a dataframe
features = pd.DataFrame(
    data = tokenized_features.toarray(), 
    columns = vectorizer.get_feature_names_out()
)

# print(len(vectorizer.get_feature_names_out()))

data["char_count"] = data["summary"].str.count(r"\S")
data["word_count"] = data["summary"].str.count(pattern)
data["avg_word_length"] = data["char_count"] / data_x["word_count"]

data_x = pd.concat([features, data.loc[:, "char_count": ]], axis = 1)
# data_y = data["theme"]

### Word2Vec - word embeddings

In [None]:
# tokenize all the sentences
tokenized_sentences = [word_tokenize(each_line[0].lower()) for each_line in data]

# train word2vec model
word2vec_model = Word2Vec(
    sentences = tokenized_sentences, 
    vector_size = 1000, 
    window = 5, 
    min_count = 1, 
    workers = 4
)

# generate document vectors
def vectorize_doc(each_line):
    # remove out of vocab words
    words = [word for word in each_line if word in word2vec_model.wv]
    return np.mean(word2vec_model.wv[words], axis = 0) if words else np.zeros(word2vec_model.vector_size)

# create feature vectors 
data_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in data["cleaned_summary"]])
data_y = np.array([each_line.lower() for each_line in data["theme"]])

accuracy:  0.09880636604774536


## logistic regression (softmax regression), support vector machine, bert, decision trees, random forest classifier, gradient boosting algorithm, KNN, neural networks (not necessary for small datasets), naive bayes classifier

In [29]:
train_x, test_x, train_y, test_y = train_test_split(
    data_x, 
    y_encoded, 
    test_size = 0.6, 
    stratify = None, 
    random_state = 59
)

### logistic regression (softmax regression)

In [19]:
train_x, test_x = train_test_split(
    data["summary"], 
    test_size = 0.1, 
    stratify = data["theme"], 
    random_state = 59
)

train_y, test_y = train_test_split(
    y_encoded, 
    test_size = 0.1, 
    stratify = data["theme"], 
    random_state = 59
)

# logistic regression
model = LogisticRegression()
model.fit(train_x, train_y)

# evaluating the model
accuracy = model.score(test_x, test_y)
print("accuracy: " , accuracy)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

### sigmoid activation function

In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam

#### train model

In [22]:
vocab_size = 100000
num_of_categories = len(categories)

model = Sequential([
    Embedding(input_dim = vocab_size, 
              output_dim = num_of_categories, 
              input_length = 1000
              ),
    GlobalAveragePooling1D(), 
    Dense(64, activation = "relu"), # hidden layer
    Dense(num_of_categories, activation = "sigmoid")
])

model.compile(optimizer = Adam(), 
              loss = "binary_crossentropy", 
              metrics = ["accuracy"])

model.summary()



#### fit the model

In [30]:
model.fit(train_x, 
          train_y, 
          epochs = 10, 
          batch_size = 32, 
          validation_split = 0.2)

Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 124ms/step - accuracy: 0.0025 - loss: 0.4462 - val_accuracy: 0.0000e+00 - val_loss: 0.0414
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 72ms/step - accuracy: 0.0000e+00 - loss: 0.0284 - val_accuracy: 0.0165 - val_loss: 0.0102
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - accuracy: 0.0096 - loss: 0.0177 - val_accuracy: 0.0165 - val_loss: 0.0098
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step - accuracy: 0.0085 - loss: 0.0115 - val_accuracy: 0.0083 - val_loss: 0.0099
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - accuracy: 0.0186 - loss: 0.0120 - val_accuracy: 0.0165 - val_loss: 0.0099
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step - accuracy: 0.0054 - loss: 0.0086 - val_accuracy: 0.0165 - val_loss: 0.0096
Epoch 7/10
[1m16/16[0m 

<keras.src.callbacks.history.History at 0x1204af94e30>

#### predict using the model (test_x)

In [38]:
prediction = model.predict(test_x)
pred = (prediction > 0.01).astype(int)
pred_theme = [[categories[i] for i in range(len(categories)) if pred[j, i] == 1] for j in range(len(pred))]

print(pred_theme)

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
[['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations and trade'], ['international relations an

#### Evaluation of model

In [42]:
from sklearn.metrics import classification_report
print(classification_report(
    test_y, 
    pred, 
    target_names = categories
))



# # Hamming Loss (the lower the better)
# from sklearn.metrics import hamming_loss
# hl = hamming_loss(test_y, pred)
# print("Hamming Loss: ", hl)

# # F1 score (the higher the better)
# from sklearn.metrics import f1_score
# f1 = f1_score(
#     test_y, 
#     pred, 
#     average = "macro"
# )
# print("F1 Score: ", f1)


                                      precision    recall  f1-score   support

       corporate and business topics       0.00      0.00      0.00         3
         labor and employment issues       0.00      0.00      0.00         0
privacy, security, and cyber matters       0.00      0.00      0.00         0
             legal and crime stories       0.00      0.00      0.00         0
  government actions and regulations       0.00      0.00      0.00         0
       technology and digital trends       0.00      0.00      0.00         5
      environment and climate topics       0.00      0.00      0.00         2
          social issues and activism       0.00      0.00      0.00         0
             healthcare and medicine       0.00      0.00      0.00         1
       community and cultural events       0.00      0.00      0.00         1
   international relations and trade       0.01      1.00      0.02        10
              education and learning       0.00      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
