### Import Libraries

In [13]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud # for wordcloud
import matplotlib.pyplot as plt # for wordcloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, hamming_loss, f1_score, jaccard_score, accuracy_score
from itertools import product
from skmultilearn.problem_transform import LabelPowerset
from sklearn.model_selection import GridSearchCV
from skmultilearn.problem_transform import LabelPowerset
from sklearn.metrics import make_scorer, f1_score

### Import Data

In [5]:
# import a more balance train data
train_data = pd.read_csv('updated_more_balanced_data.csv') # obtain balance data from the first 600 data manually
test_data = pd.read_csv('first_600_manually_add_theme.csv')

test_x = test_data["summary"]
test_y = test_data["theme"]

train_x = train_data["summary"]
train_y = train_data["theme"]

categories = ["corporate and business topics", 
              "labor and employment issues", 
              "privacy, security, and cyber matters", 
              "legal and crime stories", 
              "government actions and regulations", 
              "technology and digital trends", 
              "environment and climate topics", 
              "social issues and activism", 
              "healthcare and medicine", 
              "community and cultural events", 
              "international relations and trade", 
              "education and learning", 
              "consumer topics", 
              "infrastructure and development", 
              "energy and resources", 
              "political topics and protests", 
              "media and communication", 
              "financial policies and taxation", 
              "human rights and social justice", 
              "science, research, and innovation", 
              "disaster and crisis management", 
              "organized crime and trafficking", 
              "sports, entertainment, and leisure", 
              "other", 
              "military"]

y_encoded = []
for each_theme in train_y:
    each_row = []
    for category in categories:
        if category in each_theme.lower():
            each_row.append(1)
        else:
            each_row.append(0)
    y_encoded.append(each_row)

# convert to dataframe
y_encoded = pd.DataFrame(y_encoded, columns = categories)
train_y = y_encoded

y_encoded = []
for each_theme in test_y:
    each_row = []
    for category in categories:
        if category in each_theme.lower():
            each_row.append(1)
        else:
            each_row.append(0)
    y_encoded.append(each_row)

y_encoded = pd.DataFrame(y_encoded, columns = categories)
test_y = y_encoded   

# FEATURE SELECTION - WORD2VEC

In [6]:
# tokenize all the sentences
tokenized_sentences = [word_tokenize(each_line[0].lower()) for each_line in train_data["summary"]]

# train word2vec model
word2vec_model = Word2Vec(
    sentences = tokenized_sentences, 
    vector_size = 1000, 
    window = 5, 
    min_count = 1, 
    workers = 4
)

# generate document vectors
def vectorize_doc(each_line):
    # remove out of vocab words
    words = [word for word in each_line if word in word2vec_model.wv]
    return np.mean(word2vec_model.wv[words], axis = 0) if words else np.zeros(word2vec_model.vector_size)

# create feature vectors 
train_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in train_data["summary"]])
test_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in test_data["summary"]])

tuning using Grid Search

need to tune:

vector_size, window, min_count, sg, hs and negative, epochs, alpha and min_alpha, 

In [11]:
vector_sizes = [100, 200, 300]
windows = [3, 5, 7, 10]
min_counts = [1, 2, 3, 4, 5]
sgs = [0, 1]
hs = [0, 1]

tuned_model = None
tuned_score = -float('inf')


highest = 0
for vector_size, window, min_count, sg, each_hs in product(vector_sizes, windows, min_counts, sgs, hs):
    model = Word2Vec(train_data["summary"], 
                      vector_size = vector_size, 
                      window = window, 
                      min_count = min_count, 
                      sg = sg, 
                      hs = each_hs
                      )
    
    # create feature vectors 
    train_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in train_data["summary"]])
    test_x = np.array([vectorize_doc(word_tokenize(each_line.lower())) for each_line in test_data["summary"]])

    # label powerset
    # it can capture label dependencies but if too many label combinations
    # it will lead to poor performance



    model = LabelPowerset(RandomForestClassifier(n_estimators = 100, 
                                                random_state = 59))
    model.fit(train_x, train_y)
    pred = model.predict(test_x)
    if accuracy_score(test_y, pred) > highest:
        highest = accuracy_score(test_y, pred)
        print("Accuracy: ", highest)
        print(vector_size, window, min_count, sg, each_hs)


F1 Score:  0.04257029768172584
100 3 1 0 0


The metrics do not change throughout the all the tuning. Hence, the current hyperparameters do not need to be changed.

# MODEL - RANDOM FOREST CLASSIFIER

In [17]:
base_model = LabelPowerset(RandomForestClassifier(n_estimators = 100,
                                                  max_depth = 25, 
                                                  min_samples_split = 5, 
                                                  min_samples_leaf = 2,  
                                                  random_state = 59))

parameters = {
    'classifier__n_estimators': [50, 100, 200], 
    'classifier__max_depth': [None, 10, 20], 
    'classifier__min_samples_split': [2, 5, 10], 
    'classifier__min_samples_leaf': [1, 2, 4]
} 

grid_search = GridSearchCV(
    base_model, 
    parameters,
    scoring = make_scorer(
        f1_score, 
        average = 'micro'), 
        cv = 3,
        n_jobs = -1
    )

grid_search.fit(train_x, train_y)
print(grid_search.predict(test_x))
print("best parameters: ", grid_search.best_params_)

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan]


  (0, 2)	1
  (1, 3)	1
  (2, 2)	1
  (3, 3)	1
  (4, 18)	1
  (5, 2)	1
  (6, 2)	1
  (7, 21)	1
  (8, 2)	1
  (9, 2)	1
  (10, 2)	1
  (11, 2)	1
  (12, 2)	1
  (13, 2)	1
  (14, 3)	1
  (15, 2)	1
  (16, 2)	1
  (17, 2)	1
  (18, 21)	1
  (19, 2)	1
  (20, 2)	1
  (21, 2)	1
  (22, 2)	1
  (23, 2)	1
  (24, 2)	1
  (25, 2)	1
  (26, 18)	1
  (27, 12)	1
  (28, 2)	1
  (29, 0)	1
  (29, 5)	1
  (30, 2)	1
  (31, 2)	1
  (32, 2)	1
  (33, 2)	1
  (34, 2)	1
  (35, 2)	1
  (36, 2)	1
  (37, 2)	1
  (38, 5)	1
  (39, 2)	1
  (40, 21)	1
  (41, 5)	1
  (42, 18)	1
  (43, 2)	1
  (44, 2)	1
  (45, 2)	1
  (46, 18)	1
  (47, 2)	1
  (48, 2)	1
  (49, 18)	1
  (50, 2)	1
  (51, 2)	1
  (52, 18)	1
  (53, 18)	1
  (54, 2)	1
  (55, 2)	1
  (56, 21)	1
  (57, 18)	1
  (58, 18)	1
  (59, 2)	1
  (60, 2)	1
  (61, 18)	1
  (62, 18)	1
  (63, 2)	1
  (64, 18)	1
  (65, 2)	1
  (66, 2)	1
  (67, 18)	1
  (68, 18)	1
  (69, 2)	1
  (70, 18)	1
  (71, 2)	1
  (72, 2)	1
  (73, 18)	1
  (74, 2)	1
  (75, 18)	1
  (76, 21)	1
  (77, 18)	1
  (78, 2)	1
  (79, 2)	1
  (80, 2)	1
  