Skip to content

Files

Latest commit

 

History

History

Data

Folders and files

NameName
Last commit message
Last commit date

parent directory

..
 
 
 
 
 
 

Text Classification Algorithms: A Brief Overview

Table of Contents

  • This dataset contains 50,000 documents with 2 categories.
# Example: prepare the Keras IMDB sentiment dataset (2 classes) for RMDL.
import sys
import os
from RMDL import text_feature_extraction as txt
from keras.datasets import imdb
import numpy as np
from RMDL import RMDL_Text as RMDL

# Vocabulary cap for the loader; words outside the top-MAX_NB_WORDS are dropped.
# This constant was used but never defined in the original snippet -> NameError.
MAX_NB_WORDS = 75000

print("Load IMDB dataset....")
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=MAX_NB_WORDS)
print(len(X_train))
print(y_test)

# Map integer word ids back to words so each review can be rebuilt as text.
word_index = imdb.get_word_index()
index_word = {v: k for k, v in word_index.items()}

# Rebuild and clean each review. Unknown ids fall back to '' instead of None:
# the original `index_word.get(w)` could yield None and crash ' '.join(...).
X_train = [txt.text_cleaner(' '.join(index_word.get(w, '') for w in x)) for x in X_train]
X_test = [txt.text_cleaner(' '.join(index_word.get(w, '') for w in x)) for x in X_test]

# Single conversion + ravel (the original converted each list to an array twice).
X_train = np.array(X_train).ravel()
print(X_train.shape)
X_test = np.array(X_test).ravel()
  • Link to dataset: |Data|
    • Web of Science Dataset WOS-11967
      • This dataset contains 11,967 documents with 35 categories which include 7 parents categories.
    • Web of Science Dataset WOS-46985
      • This dataset contains 46,985 documents with 134 categories which include 7 parents categories.
    • Web of Science Dataset WOS-5736
      • This dataset contains 5,736 documents with 11 categories which include 3 parents categories.
# Example: prepare the Web of Science WOS-11967 dataset for RMDL.
# `import os` was missing from this standalone snippet even though
# os.path.join is used below -> NameError when run as shown.
import os

from RMDL import text_feature_extraction as txt
from sklearn.model_selection import train_test_split
from RMDL.Download import Download_WOS as WOS
import numpy as np
from RMDL import RMDL_Text as RMDL

# Download/extract the corpus; X.txt holds one document per line, Y.txt the labels.
path_WOS = WOS.download_and_extract()
fname = os.path.join(path_WOS, "WebOfScience/WOS11967/X.txt")
fnamek = os.path.join(path_WOS, "WebOfScience/WOS11967/Y.txt")

with open(fname, encoding="utf-8") as f:
    content = f.readlines()
    content = [txt.text_cleaner(x) for x in content]

# Label file: one integer class id per line. Open with an explicit encoding —
# the original relied on the platform default, which can differ on Windows.
with open(fnamek, encoding="utf-8") as fk:
    contentk = fk.readlines()
contentk = [x.strip() for x in contentk]

Label = np.matrix(contentk, dtype=int)
Label = np.transpose(Label)  # column vector: one label per document

np.random.seed(7)  # reproducibility for anything downstream that samples
print(Label.shape)
X_train, X_test, y_train, y_test = train_test_split(content, Label, test_size=0.2, random_state=4)
  • This dataset contains 21,578 documents with 90 categories.
# Example: prepare the NLTK Reuters-21578 corpus (90 categories) for RMDL.
import sys
import os
import nltk
nltk.download("reuters")
from nltk.corpus import reuters
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from RMDL import RMDL_Text as RMDL

# NOTE(review): the statements below were indented in the original snippet
# with no enclosing function, which is a SyntaxError; dedented to module level.
documents = reuters.fileids()

# The fileid prefix ("train"/"test") encodes the corpus' train/test split.
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))

X_train = [reuters.raw(doc_id) for doc_id in train_docs_id]
X_test = [reuters.raw(doc_id) for doc_id in test_docs_id]

# Binarize the multi-label category sets; fit on train only so the column
# order (class indexing) is consistent across both splits.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform([reuters.categories(doc_id)
                             for doc_id in train_docs_id])
y_test = mlb.transform([reuters.categories(doc_id)
                        for doc_id in test_docs_id])

# Collapse multi-label targets to a single class id (index of the first
# positive column) so the example can be treated as single-label.
y_train = np.argmax(y_train, axis=1)
y_test = np.argmax(y_test, axis=1)




==========

@ARTICLE{Kowsari2018Text_Classification,
    title={Text Classification Algorithms: A Survey},
    author={Kowsari, Kamran and Jafari Meimandi, Kiana and Heidarysafa, Mojtaba and Mendu, Sanjana and Barnes, Laura E. and Brown, Donald E.},
    journal={Information},
    year={2019},
    VOLUME = {10},
    NUMBER = {4},
    ARTICLE-NUMBER = {150},
    URL = {http://www.mdpi.com/2078-2489/10/4/150},
    ISSN = {2078-2489},
    publisher={Multidisciplinary Digital Publishing Institute}
}