### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud # for wordcloud
import matplotlib.pyplot as plt # for wordcloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, hamming_loss, f1_score, jaccard_score, accuracy_score
import pickle

## FEATURE EXTRACTION

### Import Data

In [5]:
# import a more balance train data
train_data = pd.read_csv('updated_more_balanced_data.csv') # obtain balance data from the first 600 data manually
test_data = pd.read_csv('first_600_manually_add_theme.csv')

test_x = test_data["summary"]
test_y = test_data["theme"]

train_x = train_data["summary"]
train_y = train_data["theme"]

categories = ["corporate and business topics", 
              "labor and employment issues", 
              "privacy, security, and cyber matters", 
              "legal and crime stories", 
              "government actions and regulations", 
              "technology and digital trends", 
              "environment and climate topics", 
              "social issues and activism", 
              "healthcare and medicine", 
              "community and cultural events", 
              "international relations and trade", 
              "education and learning", 
              "consumer topics", 
              "infrastructure and development", 
              "energy and resources", 
              "political topics and protests", 
              "media and communication", 
              "financial policies and taxation", 
              "human rights and social justice", 
              "science, research, and innovation", 
              "disaster and crisis management", 
              "organized crime and trafficking", 
              "sports, entertainment, and leisure", 
              "other", 
              "military"]

y_encoded = []
for each_theme in train_y:
    each_row = []
    for category in categories:
        if category in each_theme.lower():
            each_row.append(1)
        else:
            each_row.append(0)
    y_encoded.append(each_row)

# convert to dataframe
y_encoded = pd.DataFrame(y_encoded, columns = categories)
train_y = y_encoded

y_encoded = []
for each_theme in test_y:
    each_row = []
    for category in categories:
        if category in each_theme.lower():
            each_row.append(1)
        else:
            each_row.append(0)
    y_encoded.append(each_row)

y_encoded = pd.DataFrame(y_encoded, columns = categories)
test_y = y_encoded   

### Word2Vec (FINAL)

In [1]:
# tokenize all the sentences
tokenized_sentences = [word_tokenize(each_line[0].lower()) for each_line in train_data["summary"]]

# train word2vec model
word2vec_model = Word2Vec(
    sentences = tokenized_sentences, 
    vector_size = 1000, 
    window = 5, 
    min_count = 1, 
    workers = 4
)

# generate document vectors
def vectorize_doc(each_line):
    # remove out of vocab words
    words = [word for word in each_line if word in word2vec_model.wv]
    return np.mean(word2vec_model.wv[words], axis = 0) if words else np.zeros(word2vec_model.vector_size)

# create feature vectors 
train_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in train_data["summary"]])
test_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in test_data["summary"]])

# save the model
with open("word2vec_model.pkl", "wb") as f:
    pickle.dump(word2vec_model, f)

NameError: name 'train_data' is not defined

### TFIDF

In [7]:
# word pattern
pattern = r"[a-zA-Z]+"

vectorizer = TfidfVectorizer(
    token_pattern = pattern, 
    stop_words = 'english', 
    ngram_range = (1, 2), # need to do the hyperparameter tuning for this later
    max_features = 100000 # can adjust to larger num too
)

vectorizer.fit(test_x)


# tokenization - create matrix
train_tokenized_features = vectorizer.transform(train_data["summary"])
test_tokenized_features = vectorizer.transform(test_data["summary"])


# create a dataframe
train_features = pd.DataFrame(
    data = train_tokenized_features.toarray(), 
    columns = vectorizer.get_feature_names_out()
)
test_features = pd.DataFrame(
    data = test_tokenized_features.toarray(), 
    columns = vectorizer.get_feature_names_out()
)


train_data["char_count"] = train_data["summary"].str.count(r"\S")
train_data["word_count"] = train_data["summary"].str.count(pattern)
train_data["avg_word_length"] = train_data["char_count"] / train_data["word_count"]
train_x = pd.concat([train_features, train_data.loc[:, "char_count": ]], axis = 1)

test_data["char_count"] = test_data["summary"].str.count(r"\S")
test_data["word_count"] = test_data["summary"].str.count(pattern)
test_data["avg_word_length"] = test_data["char_count"] / test_data["word_count"]
test_x = pd.concat([test_features, test_data.loc[:, "char_count": ]], axis = 1)


## MODEL SELECTIONS

### Random Forest Classifier (FINAL)

In [15]:
# label powerset
# it can capture label dependencies but if too many label combinations
# it will lead to poor performance

from skmultilearn.problem_transform import LabelPowerset

model = LabelPowerset(RandomForestClassifier(n_estimators = 100, 
                                             random_state = 59, 
                                             max_depth = 20, 
                                             min_samples_split = 2, 
                                             min_samples_leaf = 3))
model.fit(train_x, train_y)
pred = model.predict(test_x)

# save the model
with open("random_forest_classifier.pkl", "wb") as f:
    pickle.dump(model, f)


### Softmax Regression (One vs All)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
model = OneVsRestClassifier(LogisticRegression())
model.fit(train_x, train_y)

prediction = model.predict(test_x)
pred = (prediction > 0.001).astype(int)
pred_theme = [[categories[i] for i in range(len(categories)) if pred[j, i] == 1] for j in range(len(pred))]

### Support Vector Machine (SVM)

In [None]:
# label powerset

model = SVC(kernel = 'rbf', # can change to 'rbf'
            )
model = LabelPowerset(model)

model.fit(train_x, train_y)

pred = model.predict(test_x)
print("Accuracy: ", accuracy_score(test_y, pred))
print("Hamming Loss: ", hamming_loss(test_y, pred))

### Evaluations of Models

In [17]:
print(classification_report(
    test_y, 
    pred, 
    target_names = categories
))

# Hamming Loss (the lower the better)
print("Hamming Loss: ", hamming_loss(test_y, pred))

# F1 score (the higher the better)
print("F1 Score: ", f1_score(
    test_y, 
    pred, 
    average = "macro"
))

# Jaccard Score (the higher the better)
print("Jaccard Score: ", jaccard_score(
    test_y, 
    pred, 
    average = "samples"
))

# Accuracy
print("Accuracy: ", accuracy_score(test_y, pred))

                                      precision    recall  f1-score   support

       corporate and business topics       0.22      0.04      0.07        51
         labor and employment issues       0.00      0.00      0.00        24
privacy, security, and cyber matters       0.03      0.56      0.05        16
             legal and crime stories       1.00      0.01      0.03        69
  government actions and regulations       0.00      0.00      0.00        52
       technology and digital trends       0.33      0.09      0.14        34
      environment and climate topics       0.00      0.00      0.00         7
          social issues and activism       0.00      0.00      0.00        27
             healthcare and medicine       0.00      0.00      0.00        58
       community and cultural events       0.00      0.00      0.00        22
   international relations and trade       0.00      0.00      0.00       100
              education and learning       0.00      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
