In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import TfidfModel
import re
from tqdm import tqdm

lemmatizer = WordNetLemmatizer()
newsgroups_train = fetch_20newsgroups(subset='all',
                                      categories=['alt.atheism',
                                                  # 'comp.graphics',
                                                  # 'comp.os.ms-windows.misc',
                                                  'comp.sys.ibm.pc.hardware',
                                                  # 'comp.sys.mac.hardware',
                                                  # 'comp.windows.x',
                                                  'misc.forsale',
                                                  'rec.autos',
                                                  # 'rec.motorcycles',
                                                  'rec.sport.baseball'],
                                      remove=("header",))

In [2]:
NOUN_TAG = ['NN', "NNS"]
JJ_TAG = ["JJ", "JJR", "JJS"]
VB_TAG = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
RB_TAG = ["RB", "RBR", "RBS"]

In [16]:
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
special_sym = ["(", ")", ":", "@", "?", ",", "|", ">", "<", "]", "\'", "{", "/", "^"
               "[", ".", "``", "\'\'", "--", "!", "-", "*", "..", "$", "}", "#", "="]
stop_words = stopwords.words('english') + special_sym


def delete_stopword_and_lemmatize(listw):
    res = []
    for word in listw:
        word = lemmatizer.lemmatize(word.lower())
        if word not in stop_words and re.fullmatch(num_reg_exp, word) is None\
            and not any(sym in word for sym in special_sym):
            res += [word]
    return res


def only_pos(list_doc, pos_t_cls):
    len_docs = len(list_doc)
    cnt_cls = len(pos_t_cls)
    res = [[] for i in range(cnt_cls)]
    
    with tqdm(total=len_docs, position=0, leave=True) as pbar:
        for idxd in range(len_docs):

            pbar.set_description(f"Doc: {idxd+1}/{len_docs}")
            pbar.update()

            ndoc = [[] for i in range(cnt_cls)]
            for elem in nltk.pos_tag(list_doc[idxd]):
                for idxpos in range(cnt_cls):
                    if elem[1] in pos_t_cls[idxpos]:
                        ndoc[idxpos] += [elem[0]]
                        
            for idxpos in range(cnt_cls):
                res[idxpos] += [ndoc[idxpos]]
    return tuple(res)

In [4]:
def split_data(x):
    return train_test_split(x, newsgroups_train.target, test_size=0.2, random_state=42)

In [57]:
def get_res(typename, dataset):
    x_train, x_test, y_train, y_test = split_data(dataset)
    print(typename)

    clf = RandomForestClassifier(random_state=42, n_estimators=20, max_depth=20)
    clf.fit(x_train, y_train)
    acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")
    print(f"    Random Forest:     {round(acc_test, 3)}")

    clf = GradientBoostingClassifier(random_state=42, n_estimators=20)
    clf.fit(x_train, y_train)
    acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")
    print(f"    Gradient Boosting: {round(acc_test, 3)}")

In [63]:
def get_bow_var(bow, dictn):
    res = []
    for i in range(len(bow)):
        doc = [0 for j in range(len(dictn))]
        for val in bow[i]:
            doc[val[0]] = val[1]

        res += [doc]
    return res

def get_tfidf_var(tfidf, dictn):
    res = []
    for i in range(len(tfidf)):
        doc = [0 for j in range(len(dictn))]
        for val in tfidf[i]:
            doc[val[0]] = val[1]

        res += [doc]
    return res

In [83]:
def get_lsi(corpus, dictn):
    lsi_model = models.LsiModel(corpus=corpus, num_topics=20, id2word=dictn)
    lsi_res = []
    for i in range(len(corpus)):
        lsi_res += [[val[1] for val in lsi_model[corpus[i]]]]
    return lsi_res

def get_lda(corpus, dictn, alpha, bbeta):
    lda_model = models.LdaModel(corpus=corpus, num_topics=20, id2word=dictn, passes=10, alpha=alpha, eta=bbeta)
    lda_res = []
    for i in range(len(corpus)):
        lda_res += [[val[1] for val in lda_model.get_document_topics(corpus[i], minimum_probability=0.0)]]
    return lda_res

In [5]:
tokenize_data = [delete_stopword_and_lemmatize(nltk.word_tokenize(newsgroups_train.data[i]))
                 for i in range(len(newsgroups_train.data))]

In [28]:
tokenize_data_noun, tokenize_data_noun_jj, tokenize_data_noun_jj_vb, tokenize_data_noun_rb = only_pos(tokenize_data, 
                                                       [NOUN_TAG, NOUN_TAG + JJ_TAG, NOUN_TAG + JJ_TAG + VB_TAG, NOUN_TAG + RB_TAG])

Doc: 4740/4740: 100%|██████████████████████████████████████████████████████████████| 4740/4740 [01:22<00:00, 57.53it/s]


In [35]:
dictn_noun, dictn_noun_jj, dictn_noun_jj_vb, dictn_noun_rb = corpora.Dictionary(tokenize_data_noun),\
    corpora.Dictionary(tokenize_data_noun_jj), corpora.Dictionary(tokenize_data_noun_jj_vb), corpora.Dictionary(tokenize_data_noun_rb)

dictn_noun.filter_extremes(keep_n=1000)
dictn_noun_jj.filter_extremes(keep_n=1000)
dictn_noun_jj_vb.filter_extremes(keep_n=1000)
dictn_noun_rb.filter_extremes(keep_n=1000)

bow_noun = [dictn_noun.doc2bow(doc) for doc in tokenize_data_noun]
bow_noun_jj = [dictn_noun_jj.doc2bow(doc) for doc in tokenize_data_noun_jj]
bow_noun_jj_vb = [dictn_noun_jj_vb.doc2bow(doc) for doc in tokenize_data_noun_jj_vb]
bow_noun_rb = [dictn_noun_rb.doc2bow(doc) for doc in tokenize_data_noun_rb]

model = TfidfModel(bow_noun)
tfidf_noun = [model[doc] for doc in bow_noun]

model = TfidfModel(bow_noun_jj)
tfidf_noun_jj = [model[doc] for doc in bow_noun_jj]

model = TfidfModel(bow_noun_jj_vb)
tfidf_noun_jj_vb = [model[doc] for doc in bow_noun_jj_vb]

model = TfidfModel(bow_noun_rb)
tfidf_noun_rb = [model[doc] for doc in bow_noun_rb]

In [61]:
lsi_noun = get_lsi(tfidf_noun, dictn_noun)
lsi_noun_jj = get_lsi(tfidf_noun_jj, dictn_noun_jj)
lsi_noun_jj_vb = get_lsi(tfidf_noun_jj_vb, dictn_noun_jj_vb)
lsi_noun_rb = get_lsi(tfidf_noun_rb, dictn_noun_rb)

In [84]:
lda_noun = get_lda(bow_noun, dictn_noun, 'symmetric', None)
lda_noun_jj = get_lda(bow_noun_jj, dictn_noun_jj, 'symmetric', None)
lda_noun_jj_vb = get_lda(bow_noun_jj_vb, dictn_noun_jj_vb, 'symmetric', None)
lda_noun_rb = get_lda(bow_noun_rb, dictn_noun_rb, 'symmetric', None)

In [55]:
noun_ver = [get_bow_var(bow_noun, dictn_noun), get_tfidf_var(tfidf_noun, dictn_noun), lsi_noun, lda_noun]
noun_jj_ver = [get_bow_var(bow_noun_jj, dictn_noun_jj), get_tfidf_var(tfidf_noun_jj, dictn_noun_jj), lsi_noun_jj, lda_noun_jj]
noun_jj_vb_ver = [get_bow_var(bow_noun_jj_vb, dictn_noun_jj_vb), get_tfidf_var(tfidf_noun_jj_vb, dictn_noun_jj_vb), lsi_noun_jj_vb, lda_noun_jj_vb]
noun_rb_ver = [get_bow_var(bow_noun_rb, dictn_noun_rb), get_tfidf_var(tfidf_noun_rb, dictn_noun_rb), lsi_noun_rb, lda_noun_rb]

In [59]:
names = ["BoW", "TFIDF", "LSI", "LDA"]

for i in range(len(noun_ver)):
    get_res(f"NOUN-{names[i]}:", noun_ver[i])
print("--------------------------------")
for i in range(len(noun_jj_ver)):
    get_res(f"NOUN_JJ-{names[i]}:", noun_jj_ver[i])
print("--------------------------------")
for i in range(len(noun_jj_vb_ver)):
    get_res(f"NOUN_JJ_VB-{names[i]}:", noun_jj_vb_ver[i])
print("--------------------------------")
for i in range(len(noun_rb_ver)):
    get_res(f"NOUN_RB-{names[i]}:", noun_rb_ver[i])

NOUN-BoW:
    Random Forest:     0.886
    Gradient Boosting: 0.838
NOUN-TFIDF:
    Random Forest:     0.882
    Gradient Boosting: 0.839
NOUN-LSI:
    Random Forest:     0.924
    Gradient Boosting: 0.903
NOUN-LDA:
    Random Forest:     0.881
    Gradient Boosting: 0.873
--------------------------------
NOUN_JJ-BoW:
    Random Forest:     0.885
    Gradient Boosting: 0.85
NOUN_JJ-TFIDF:
    Random Forest:     0.884
    Gradient Boosting: 0.85
NOUN_JJ-LSI:
    Random Forest:     0.923
    Gradient Boosting: 0.908
NOUN_JJ-LDA:
    Random Forest:     0.863
    Gradient Boosting: 0.867
--------------------------------
NOUN_JJ_VB-BoW:
    Random Forest:     0.901
    Gradient Boosting: 0.861
NOUN_JJ_VB-TFIDF:
    Random Forest:     0.891
    Gradient Boosting: 0.861
NOUN_JJ_VB-LSI:
    Random Forest:     0.935
    Gradient Boosting: 0.921
NOUN_JJ_VB-LDA:
    Random Forest:     0.885
    Gradient Boosting: 0.868
--------------------------------
NOUN_RB-BoW:
    Random Forest:     0.897
   

In [89]:
alpha = [round(0.2 * i, 1) for i in range(1, 11)]
bbeta = [round(0.2 * i, 1) for i in range(1, 11)]

In [None]:
for val_a in alpha:
    for val_b in bbeta:
        test_lda_noun_jj_vb = get_lda(bow_noun_jj_vb, dictn_noun_jj_vb, val_a, val_b)
        get_res(f"NOUN_JJ_VB ({val_a}, {val_b}):", test_lda_noun_jj_vb)

NOUN_JJ_VB (0.2, 0.2):
    Random Forest:     0.893
    Gradient Boosting: 0.882
NOUN_JJ_VB (0.2, 0.4):
    Random Forest:     0.893
    Gradient Boosting: 0.884
NOUN_JJ_VB (0.2, 0.6):
    Random Forest:     0.888
    Gradient Boosting: 0.887
NOUN_JJ_VB (0.2, 0.8):
    Random Forest:     0.886
    Gradient Boosting: 0.887
NOUN_JJ_VB (0.2, 1.0):
    Random Forest:     0.874
    Gradient Boosting: 0.877
NOUN_JJ_VB (0.2, 1.2):
    Random Forest:     0.874
    Gradient Boosting: 0.868
NOUN_JJ_VB (0.2, 1.4):
    Random Forest:     0.901
    Gradient Boosting: 0.907
NOUN_JJ_VB (0.2, 1.6):
    Random Forest:     0.88
    Gradient Boosting: 0.895
NOUN_JJ_VB (0.2, 1.8):
    Random Forest:     0.89
    Gradient Boosting: 0.885
NOUN_JJ_VB (0.2, 2.0):
    Random Forest:     0.907
    Gradient Boosting: 0.898
NOUN_JJ_VB (0.4, 0.2):
    Random Forest:     0.879
    Gradient Boosting: 0.886
NOUN_JJ_VB (0.4, 0.4):
    Random Forest:     0.899
    Gradient Boosting: 0.884
NOUN_JJ_VB (0.4, 0.6):
    Ran