# News Category Classification During Covid-19 Pandemic

### Team members: Yifan Zhang, Tianqi Cao, Li Du

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import re
from tqdm import tqdm
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
data=pd.read_csv('data_before_preprocessing.csv')

In [3]:
len(data)

13082

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13082 entries, 0 to 13081
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   13082 non-null  int64 
 1   author       13082 non-null  object
 2   date         13082 non-null  object
 3   domain       13082 non-null  object
 4   title        13082 non-null  object
 5   url          13082 non-null  object
 6   content      13082 non-null  object
 7   topic_area   13082 non-null  object
 8   domain_code  13082 non-null  int64 
 9   topic_code   13082 non-null  int64 
 10  type         13082 non-null  object
dtypes: int64(3), object(8)
memory usage: 1.1+ MB


In [5]:
import nltk
nltk.download('punkt') # downloads you a model

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
import string

In [7]:
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

ps = PorterStemmer()
def preprocessing(doc, stemming = True):
    
    sentences = sent_tokenize(doc)
    tokens=[]
    for sent in sentences[:5]:
        text = re.sub('\xa0', '', sent)   
        
        pattern=r'[^a-zA-z\s]'
        text=re.sub(pattern,'',text)
        words = word_tokenize(text)
        
        if stemming:
            words = [ps.stem(w) for w in words]        
        
        tokens=tokens+words
    return [w.lower() for w in tokens if w not in stop]

In [8]:
data['first_5_sent']=data.content.apply(preprocessing)

In [9]:
data.first_5_sent.head()

0    [homag, singaporebas, startup, match, famili, ...
1    [thi, week, i, wa, vaccin, covid, pfizer, mrna...
2    [fund, set, phoeb, wallerbridg, olivia, colman...
3    [lo, angel, counti, depart, public, health, co...
4    [franc, saw, biggest, ever, monthli, drop, job...
Name: first_5_sent, dtype: object

# BOW Binary

### use 'content' as the sole predictor

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
binary = CountVectorizer(lowercase = True,
                       preprocessor = None,
                       tokenizer = preprocessing, 
                       binary = True) # Setting this parameter would make the vector only binary form rather than frequency


y = data.topic_area.values
X1 = binary.fit_transform(data.content) # Transforming doc into binary vector
print(type(X1))

<class 'scipy.sparse.csr.csr_matrix'>


In [11]:
# Spliting dataframe into train, validation and test set with 80%/10%/10% ratio and set seed as 42

X_train1, X_test_vali1, y_train, y_test_vali = train_test_split(X1, y, test_size = 0.2, random_state = 42)
X_vali1, X_test1, y_vali, y_test = train_test_split(X_test_vali1, y_test_vali, test_size = 0.5, random_state = 42)

In [12]:
# Only content
clf = LogisticRegression(random_state=0,
                          n_jobs=-1,
                         max_iter=300,
                         C=0.03).fit(X_train1, y_train)


In [13]:
y_pred = clf.predict(X_test1)

In [14]:
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

                 ai&tech       0.59      0.69      0.64       194
                business       0.57      0.42      0.49       194
construction&environment       0.88      0.64      0.74       131
     consumer&automotive       0.83      0.85      0.84       190
                 finance       0.62      0.70      0.65       187
                 general       0.69      0.71      0.70       221
      science&healthcare       0.73      0.82      0.77       192

                accuracy                           0.69      1309
               macro avg       0.70      0.69      0.69      1309
            weighted avg       0.69      0.69      0.69      1309



### use both 'content' and 'domain' as predictors

In [15]:
X2 = binary.fit_transform(data.domain)

In [16]:
X_train2, X_test_vali2, y_train, y_test_vali = train_test_split(X2, y, test_size = 0.2, random_state = 42)
X_vali2, X_test2, y_vali, y_test = train_test_split(X_test_vali2, y_test_vali, test_size = 0.5, random_state = 42)

In [17]:
from scipy.sparse import coo_matrix, bmat

In [18]:
X_train = bmat([[X_train1,X_train2]])
X_test = bmat([[X_test1, X_test2]])
X_vali = bmat([[X_vali1, X_vali2]])

In [19]:
# Both content and domain
clf = LogisticRegression(random_state=0,
                          n_jobs=-1,
                         max_iter=300,
                         C=0.7).fit(X_train, y_train)


In [20]:
y_pred = clf.predict(X_test)

In [21]:
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

                 ai&tech       0.95      0.99      0.97       194
                business       1.00      0.98      0.99       194
construction&environment       0.99      0.99      0.99       131
     consumer&automotive       0.99      0.99      0.99       190
                 finance       1.00      0.98      0.99       187
                 general       1.00      0.98      0.99       221
      science&healthcare       0.98      0.99      0.99       192

                accuracy                           0.99      1309
               macro avg       0.99      0.99      0.99      1309
            weighted avg       0.99      0.99      0.99      1309



# BOW Term Frequency

### use 'content' as the sole predictor

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
tf = CountVectorizer(lowercase = True,
                       preprocessor = None,
                       tokenizer = preprocessing, 
                       binary = False) # Setting this parameter would make the vector only binary form rather than frequency


y = data.topic_area.values
X1 = tf.fit_transform(data.content) # Transforming doc into tf vector
print(type(X1))

<class 'scipy.sparse.csr.csr_matrix'>


In [23]:
# Spliting dataframe into train, validation and test set with 80%/10%/10% ratio and set seed as 42
X_train1, X_test_vali1, y_train, y_test_vali = train_test_split(X1, y, test_size = 0.2, random_state = 42)
X_vali1, X_test1, y_vali, y_test = train_test_split(X_test_vali1, y_test_vali, test_size = 0.5, random_state = 42)

In [24]:
# Only content
clf = LogisticRegression(random_state=0,
                          n_jobs=-1,
                         max_iter=300,
                         C=0.03).fit(X_train1, y_train)


In [25]:
y_pred = clf.predict(X_test1)

In [26]:
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

                 ai&tech       0.67      0.70      0.68       194
                business       0.57      0.47      0.51       194
construction&environment       0.86      0.69      0.76       131
     consumer&automotive       0.87      0.85      0.86       190
                 finance       0.60      0.65      0.63       187
                 general       0.67      0.69      0.68       221
      science&healthcare       0.72      0.85      0.78       192

                accuracy                           0.70      1309
               macro avg       0.71      0.70      0.70      1309
            weighted avg       0.70      0.70      0.70      1309



### use both content and domain as predictors

In [59]:
X2 = tf.fit_transform(data.domain)

In [60]:
X_train2, X_test_vali2, y_train, y_test_vali = train_test_split(X2, y, test_size = 0.2, random_state = 42)
X_vali2, X_test2, y_vali, y_test = train_test_split(X_test_vali2, y_test_vali, test_size = 0.5, random_state = 42)

In [61]:
X_train = bmat([[X_train1,X_train2]])
X_test = bmat([[X_test1, X_test2]])

In [62]:
clf = LogisticRegression(random_state=0,
                          n_jobs=-1,
                         max_iter=300,
                         C=0.7).fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [63]:
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

                 ai&tech       0.94      0.98      0.96       194
                business       0.98      0.97      0.97       194
construction&environment       0.98      0.98      0.98       131
     consumer&automotive       0.98      0.99      0.98       190
                 finance       1.00      0.96      0.98       187
                 general       1.00      0.97      0.98       221
      science&healthcare       0.97      0.98      0.98       192

                accuracy                           0.98      1309
               macro avg       0.98      0.98      0.98      1309
            weighted avg       0.98      0.98      0.98      1309



# TF-IDF

In [27]:
# count the appearance of each token in all reviews and build a dict: the key is the token, and the item is the frequency
from tqdm import tqdm
from collections import defaultdict

DF = defaultdict(float)
for c in tqdm(data.first_5_sent):
    for token in set(c):
        DF[token] += 1

100%|██████████| 13082/13082 [00:00<00:00, 21322.06it/s]


In [28]:
from math import log
IDF, vocab = dict(), dict()
for token in DF:
    if DF[token] < 55:
        # this becomes an unk
        pass
    else:
        vocab[token] = len(vocab)
        IDF[token] = log(1 + len(data) / DF[token])
        
print(len(DF), len(vocab))

43581 2323


In [29]:
IDF['<UNK>'] = 1
vocab['<UNK>'] = len(vocab)

In [30]:
# define a function that computes weight of each token in a review

def tfidf_feature_extractor(tokens, vocab, IDF):
    for i, token in enumerate(tokens):
        if token not in vocab:
            tokens[i] = '<UNK>'
    TF = defaultdict(int)
    for token in tokens:
        TF[token] += 1
    x = [0] * len(vocab)
    for token in set(tokens):
        tfidf = log(TF[token] + 1) * IDF[token]
        token_id = vocab[token]
        x[token_id] = tfidf 
    return x

In [31]:
# call tfidf_feature_extractor function on each review and get matrixes of all tokens' weight in all reviews

score=[]
for t in tqdm(data.first_5_sent):
    score.append(tfidf_feature_extractor(t, vocab, IDF))

100%|██████████| 13082/13082 [00:01<00:00, 12655.55it/s]


In [32]:
data.columns

Index(['Unnamed: 0', 'author', 'date', 'domain', 'title', 'url', 'content',
       'topic_area', 'domain_code', 'topic_code', 'type', 'first_5_sent'],
      dtype='object')

In [33]:
data['tfidf']=score

In [34]:
from sklearn.preprocessing import LabelBinarizer
lb=LabelBinarizer()
lb.fit(data.domain)
domain_bi = lb.transform(data.domain)

In [37]:
data=data.assign(domain_bi=list(domain_bi))

In [38]:
two_df=pd.concat([data.domain_bi, data.tfidf],axis=1)
data['domain_tfidf']=two_df.apply(lambda row: list(row["domain_bi"])+row["tfidf"],axis=1)

In [39]:
from datetime import datetime
ymd=data.date.apply(lambda x:datetime.strptime(x, '%Y-%m-%d'))
data=data.assign(month=ymd.map(lambda x: x.month))

In [40]:
lb=LabelBinarizer()
lb.fit(data.month)
month_bi = lb.transform(data.month)
data=data.assign(month_bi=list(month_bi))

In [42]:
df=pd.concat([data.tfidf, data.month_bi],axis=1)
data['tfidf_month']=df.apply(lambda row: list(row["tfidf"])+list(row["month_bi"]),axis=1)

In [43]:
X_train, X_vt, y_train, y_vt = train_test_split(data, data.topic_area, test_size = 0.2, random_state = 42)
X_val, X_test, y_val, y_test = train_test_split(X_vt, y_vt, test_size = 0.5, random_state = 42)

### Logistic regression models, use tfidf, tfidf+month, and tfidf+domain as predictor(s)

In [44]:
# use only one variable(tf-idf score) to predict news type

clf_lr1 = LogisticRegression(max_iter=10000, C=0.7).fit(list(X_train['tfidf']), y_train)

In [45]:
lr_pred1 = clf_lr1.predict(list(X_test['tfidf']))
print(classification_report(y_test, lr_pred1))

                          precision    recall  f1-score   support

                 ai&tech       0.59      0.60      0.59       194
                business       0.44      0.43      0.43       194
construction&environment       0.74      0.63      0.68       131
     consumer&automotive       0.84      0.79      0.82       190
                 finance       0.50      0.60      0.55       187
                 general       0.64      0.61      0.63       221
      science&healthcare       0.72      0.75      0.73       192

                accuracy                           0.63      1309
               macro avg       0.64      0.63      0.63      1309
            weighted avg       0.63      0.63      0.63      1309



In [46]:
# use two variables (tf-idf score and month_binary code) to predict news type

clf_lr2 = LogisticRegression(max_iter=10000, C=0.7).fit(list(X_train['tfidf_month']), y_train)

In [47]:
lr_pred2 = clf_lr2.predict(list(X_test['tfidf_month']))

print(classification_report(y_test, lr_pred2))

                          precision    recall  f1-score   support

                 ai&tech       0.60      0.61      0.61       194
                business       0.48      0.48      0.48       194
construction&environment       0.76      0.61      0.68       131
     consumer&automotive       0.84      0.79      0.82       190
                 finance       0.50      0.59      0.54       187
                 general       0.65      0.62      0.64       221
      science&healthcare       0.71      0.77      0.74       192

                accuracy                           0.64      1309
               macro avg       0.65      0.64      0.64      1309
            weighted avg       0.65      0.64      0.64      1309



In [48]:
# use two variables (tf-idf score and domain_binary code) to predict news type

clf_lr3 = LogisticRegression(max_iter=10000, C=0.7).fit(list(X_train['domain_tfidf']), y_train)

In [49]:
lr_pred3 = clf_lr3.predict(list(X_test['domain_tfidf']))

print(classification_report(y_test, lr_pred3))

                          precision    recall  f1-score   support

                 ai&tech       0.86      0.91      0.88       194
                business       0.88      0.89      0.88       194
construction&environment       0.93      0.87      0.90       131
     consumer&automotive       0.92      0.96      0.94       190
                 finance       0.97      0.93      0.95       187
                 general       0.95      0.92      0.93       221
      science&healthcare       0.88      0.89      0.89       192

                accuracy                           0.91      1309
               macro avg       0.91      0.91      0.91      1309
            weighted avg       0.91      0.91      0.91      1309



### XGBoost models using tfidf, tfidf+month, and tfidf+domain as predictor(s)

In [50]:
import xgboost as xgb

In [51]:
clf_xgb1 = xgb.XGBClassifier(max_depth=4,
                            n_estimators=100,
                            objective="multi:softmax",
                            num_class=7, 
                            eval_metric="auc",                            
                            random_state=42).fit(np.vstack(X_train['tfidf']), y_train,  verbose=True) 

In [52]:
xgb_pred1 = clf_xgb1.predict(np.vstack(X_test['tfidf']))

print(classification_report(y_test, xgb_pred1))

                          precision    recall  f1-score   support

                 ai&tech       0.60      0.68      0.64       194
                business       0.65      0.52      0.57       194
construction&environment       0.86      0.73      0.79       131
     consumer&automotive       0.87      0.87      0.87       190
                 finance       0.68      0.70      0.69       187
                 general       0.68      0.72      0.70       221
      science&healthcare       0.78      0.85      0.82       192

                accuracy                           0.72      1309
               macro avg       0.73      0.72      0.73      1309
            weighted avg       0.73      0.72      0.72      1309



In [53]:
clf_xgb2 = xgb.XGBClassifier(max_depth=4,
                            n_estimators=100,
                            objective="multi:softmax",
                            num_class=7, 
                            eval_metric="auc",                            
                            random_state=42).fit(np.vstack(X_train['tfidf_month']), y_train,  verbose=True) 

In [54]:
xgb_pred2 = clf_xgb2.predict(np.vstack(X_test['tfidf_month']))

print(classification_report(y_test, xgb_pred2))

                          precision    recall  f1-score   support

                 ai&tech       0.60      0.63      0.61       194
                business       0.62      0.56      0.59       194
construction&environment       0.81      0.69      0.75       131
     consumer&automotive       0.87      0.88      0.87       190
                 finance       0.68      0.69      0.68       187
                 general       0.72      0.75      0.74       221
      science&healthcare       0.80      0.83      0.82       192

                accuracy                           0.72      1309
               macro avg       0.73      0.72      0.72      1309
            weighted avg       0.72      0.72      0.72      1309



In [55]:
clf_xgb3 = xgb.XGBClassifier(max_depth=4,
                            n_estimators=100,
                            objective="multi:softmax",
                            num_class=7, 
                            eval_metric="auc",                            
                            random_state=42).fit(np.vstack(X_train['domain_tfidf']), y_train,  verbose=True) 

In [56]:
xgb_pred3 = clf_xgb3.predict(np.vstack(X_test['domain_tfidf']))

print(classification_report(y_test, xgb_pred3))

                          precision    recall  f1-score   support

                 ai&tech       0.99      1.00      0.99       194
                business       0.99      0.99      0.99       194
construction&environment       1.00      1.00      1.00       131
     consumer&automotive       1.00      1.00      1.00       190
                 finance       1.00      0.99      1.00       187
                 general       1.00      1.00      1.00       221
      science&healthcare       1.00      1.00      1.00       192

                accuracy                           1.00      1309
               macro avg       1.00      1.00      1.00      1309
            weighted avg       1.00      1.00      1.00      1309



# Word2Vec

In [57]:
import itertools
from collections import Counter
import os
from gensim.models import word2vec
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neural_network import MLPClassifier

In [58]:
# A function used to learn word embeddings through Word2vec module
def get_embeddings(inp_data, vocabulary_inv, size_features=100,
                   mode='skipgram',
                   min_word_count=2,
                   context=5):
    model_name = "embedding"
    model_name = os.path.join(model_name)
    num_workers = 15  # Number of threads to run in parallel
    downsampling = 1e-3  # Downsample setting for frequent words
    print('Training Word2Vec model...')
    # use inp_data and vocabulary_inv to reconstruct sentences
    sentences = [[vocabulary_inv[w] for w in s] for s in inp_data]
    if mode == 'skipgram':
        sg = 1
        print('Model: skip-gram')
    elif mode == 'cbow':
        sg = 0
        print('Model: CBOW')
    embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                        sg=sg,
                                        size=size_features,
                                        min_count=min_word_count,
                                        window=context,
                                        sample=downsampling)
    embedding_model.init_sims(replace=True)
    print("Saving Word2Vec model {}".format(model_name))
    embedding_weights = np.zeros((len(vocabulary_inv), size_features))
    for i in range(len(vocabulary_inv)):
        word = vocabulary_inv[i]
        if word in embedding_model:
            embedding_weights[i] = embedding_model[word]
        else:
            embedding_weights[i] = np.random.uniform(-0.25, 0.25,
                                                     embedding_model.vector_size)
    return embedding_weights

In [59]:
def preprocess_df(df):
    # get English stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    # prepare translation table to translate punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    preprocessed_sentences = []
    for i, row in df.iterrows():
        sent = row['text']
        sent_nopuncts = sent.translate(translator)
        words_list = sent_nopuncts.strip().split()
        filtered_words = [word for word in words_list if word not in stop_words and len(word) != 1] # also skip space from above translation
        preprocessed_sentences.append(" ".join(filtered_words))
    df["text"] = preprocessed_sentences
    return df

In [60]:
# A function used to build a vocabulary based on descending word frequencies 
def build_vocab(sentences):
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return word_counts, vocabulary, vocabulary_inv

In [61]:
data["text"] = data["content"]
df = preprocess_df(data)

# tokenization 
tagged_data = [word_tokenize(_d) for i, _d in enumerate(df["text"])]
# build vocabulary from tokenized data
word_counts, vocabulary, vocabulary_inv = build_vocab(tagged_data)
# use the above mapping to create input data
inp_data = [[vocabulary[word] for word in text] for text in tagged_data]
# get embedding vector
embedding_weights = get_embeddings(inp_data, vocabulary_inv)


tagged_data = [word_tokenize(_d) for i, _d in enumerate(df["text"])]

data_vec = []
for doc in tagged_data:
    vec = 0
    for w in doc:
        vec += embedding_weights[vocabulary[w]]
    vec = vec / len(doc)
    data_vec.append(vec)


Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding


In [62]:
data['w2v'] = data_vec

In [64]:
X_train, X_vt, y_train, y_vt = train_test_split(data, data.topic_area, test_size = 0.2, random_state = 42)
X_val, X_test, y_val, y_test = train_test_split(X_vt, y_vt, test_size = 0.5, random_state = 42)

### Logistic Regression using 'content' as the only predictor

In [65]:
w2v_lr1 = LogisticRegression(max_iter=1000000, C = 1000, random_state=42).fit(list(X_train['w2v']), y_train)

In [66]:
w2v_lr1_pred = w2v_lr1.predict(list(X_test['w2v']))

print(classification_report(y_test, w2v_lr1_pred))

                          precision    recall  f1-score   support

                 ai&tech       0.62      0.70      0.66       194
                business       0.60      0.52      0.56       194
construction&environment       0.82      0.76      0.79       131
     consumer&automotive       0.75      0.74      0.74       190
                 finance       0.65      0.75      0.70       187
                 general       0.78      0.68      0.73       221
      science&healthcare       0.76      0.82      0.79       192

                accuracy                           0.71      1309
               macro avg       0.71      0.71      0.71      1309
            weighted avg       0.71      0.71      0.71      1309



In [67]:
Cs = [1000, 100, 10, 1, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.005]

In [68]:
from sklearn.linear_model import LogisticRegressionCV
lr_cv = LogisticRegressionCV(Cs=Cs,
                           cv=5,
                           solver='liblinear',
                           scoring='accuracy',
                           random_state=42,
                           n_jobs=3,
                           verbose=3,
                           max_iter=100000000,
                           penalty='l2').fit(list(X_train['w2v']), y_train)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   14.8s
[Parallel(n_jobs=3)]: Done  35 out of  35 | elapsed:   18.5s finished


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [69]:
lr_cv.get_params

<bound method BaseEstimator.get_params of LogisticRegressionCV(Cs=[1000, 100, 10, 1, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01,
                         0.005],
                     cv=5, max_iter=100000000, n_jobs=3, random_state=42,
                     scoring='accuracy', solver='liblinear', verbose=3)>

In [70]:
lr_cv.score(list(X_val['w2v']), y_val)

0.6995412844036697

In [71]:
lr_pred = lr_cv.predict(list(X_test['w2v']))

print(classification_report(y_test, lr_pred))

                          precision    recall  f1-score   support

                 ai&tech       0.63      0.71      0.66       194
                business       0.66      0.49      0.56       194
construction&environment       0.83      0.79      0.81       131
     consumer&automotive       0.77      0.76      0.76       190
                 finance       0.64      0.76      0.69       187
                 general       0.79      0.70      0.74       221
      science&healthcare       0.75      0.82      0.79       192

                accuracy                           0.72      1309
               macro avg       0.72      0.72      0.72      1309
            weighted avg       0.72      0.72      0.71      1309



### Neural Network using 'content' as the only predictor

In [72]:
# Neural net from SKLEARN
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(
    activation="tanh",
    solver="lbfgs",
    alpha=0.01,
    hidden_layer_sizes=(3,),
    random_state=42,
    max_iter=10000,
).fit(list(X_train['w2v']), y_train)

In [73]:
nn.score(list(X_val['w2v']), y_val)

0.5986238532110092

In [74]:
from sklearn.model_selection import GridSearchCV
# alpha is the level of regularization
hls = [(6,), (7,), (8, )]
param_grid = {"hidden_layer_sizes": hls, "alpha": [0.01, 0.001]}
#scoring = {"AUC": "roc_auc"}

nn_cv = GridSearchCV(
    nn, param_grid, scoring='accuracy', cv=5, n_jobs=4, verbose=5
).fit(list(X_train['w2v']), y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [75]:
nn_cv.best_params_

{'alpha': 0.01, 'hidden_layer_sizes': (8,)}

In [76]:
nn_cv.score(list(X_val['w2v']), y_val)

0.713302752293578

In [77]:
nn_pred = nn_cv.predict(list(X_test['w2v']))

print(classification_report(y_test, nn_pred))

                          precision    recall  f1-score   support

                 ai&tech       0.65      0.77      0.71       194
                business       0.63      0.58      0.60       194
construction&environment       0.85      0.81      0.83       131
     consumer&automotive       0.81      0.79      0.80       190
                 finance       0.70      0.74      0.72       187
                 general       0.82      0.67      0.74       221
      science&healthcare       0.76      0.84      0.80       192

                accuracy                           0.74      1309
               macro avg       0.75      0.74      0.74      1309
            weighted avg       0.74      0.74      0.74      1309



### Logistic regression model use 'content' and 'domain' as predictors

In [78]:
from sklearn.preprocessing import LabelBinarizer
lb=LabelBinarizer()
lb.fit(data.domain)
domain_bi = lb.transform(data.domain)

data=data.assign(domain_bi=list(domain_bi))
two_df=pd.concat([data['domain_bi'], data['w2v']],axis=1)

In [79]:
data['domain_w2v']=two_df.apply(lambda row: list(row["domain_bi"])+list(row["w2v"]),axis=1)

In [80]:
X_train, X_vt, y_train, y_vt = train_test_split(data, data.topic_area, test_size = 0.2, random_state = 42)
X_val, X_test, y_val, y_test = train_test_split(X_vt, y_vt, test_size = 0.5, random_state = 42)

In [81]:
lr_domain = LogisticRegression(max_iter=1000000, C = 1000, random_state=42).fit(list(X_train['domain_w2v']), y_train)

In [82]:
lr_pred_domain = lr_domain.predict(list(X_test['domain_w2v']))

print(classification_report(y_test, lr_pred_domain))

                          precision    recall  f1-score   support

                 ai&tech       1.00      1.00      1.00       194
                business       1.00      1.00      1.00       194
construction&environment       1.00      1.00      1.00       131
     consumer&automotive       1.00      1.00      1.00       190
                 finance       1.00      1.00      1.00       187
                 general       1.00      1.00      1.00       221
      science&healthcare       1.00      1.00      1.00       192

                accuracy                           1.00      1309
               macro avg       1.00      1.00      1.00      1309
            weighted avg       1.00      1.00      1.00      1309



## GloVe

In [86]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = 'glove.6B/glove.6B.100d.txt'
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)

model = KeyedVectors.load_word2vec_format(tmp_file)

In [87]:
from numpy import dot
from numpy.linalg import norm

def doc2vec(doc, wv):
    vecs = []
    for token in doc.split():
        try:
            vecs.append(wv[token])
        except KeyError:
            pass
    if len(vecs) == 0:
        return np.zeros(100)
    else:
        return np.array(np.mean(vecs, axis = 0))

In [88]:
X = []
for doc in tqdm(data.content):
    X.append(doc2vec(doc, model))
y = list(data['topic_area'])
print(len(X), len(y))

100%|██████████| 13082/13082 [00:22<00:00, 580.17it/s]

13082 13082





In [89]:
data['glove'] = X

In [90]:
X_train, X_vt, y_train, y_vt = train_test_split(data, data.topic_area, test_size = 0.2, random_state = 42)
X_val, X_test, y_val, y_test = train_test_split(X_vt, y_vt, test_size = 0.5, random_state = 42)

### Logistic Regression using 'content' as sole predictor

In [91]:
lr_glove = LogisticRegression(max_iter=1000000, C = 1, random_state=42).fit(list(X_train['glove']), y_train)

In [99]:
from sklearn.metrics import classification_report
lr_pred0_glove = lr_glove.predict(list(X_val['glove']))

print(classification_report(y_val, lr_pred0_glove))

                          precision    recall  f1-score   support

                 ai&tech       0.50      0.51      0.51       204
                business       0.54      0.33      0.41       207
construction&environment       0.72      0.60      0.65       105
     consumer&automotive       0.60      0.65      0.62       190
                 finance       0.59      0.61      0.60       196
                 general       0.60      0.74      0.66       198
      science&healthcare       0.65      0.71      0.68       208

                accuracy                           0.59      1308
               macro avg       0.60      0.59      0.59      1308
            weighted avg       0.59      0.59      0.58      1308



In [100]:
Cs = [1000, 100, 10, 1, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.005]

In [101]:
from sklearn.linear_model import LogisticRegressionCV
lr_cv_glove= LogisticRegressionCV(Cs=Cs,
                           cv=5,
                           solver='liblinear',
                           scoring='accuracy',
                           random_state=42,
                           n_jobs=3,
                           verbose=3,
                           max_iter=100000000,
                           penalty='l2').fit(list(X_train['glove']), y_train)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   20.9s
[Parallel(n_jobs=3)]: Done  35 out of  35 | elapsed:   27.6s finished


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [103]:
lr_pred_glove = lr_cv_glove.predict(list(X_test['glove']))

print(classification_report(y_test, lr_pred_glove))

                          precision    recall  f1-score   support

                 ai&tech       0.61      0.63      0.62       194
                business       0.46      0.34      0.39       194
construction&environment       0.84      0.69      0.76       131
     consumer&automotive       0.70      0.68      0.69       190
                 finance       0.52      0.62      0.57       187
                 general       0.68      0.75      0.71       221
      science&healthcare       0.73      0.82      0.78       192

                accuracy                           0.65      1309
               macro avg       0.65      0.65      0.65      1309
            weighted avg       0.64      0.65      0.64      1309

