Text Classification Algorithm: A Brief Overview
This dataset contains 50,000 documents with 2 categories.
import sys
import os
from RMDL import text_feature_extraction as txt
from keras.datasets import imdb
import numpy as np
from RMDL import RMDL_Text as RMDL

# Cap the vocabulary at the most frequent words. The original snippet
# referenced MAX_NB_WORDS without ever defining it, which raises NameError.
MAX_NB_WORDS = 75000

print("Load IMDB dataset....")
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=MAX_NB_WORDS)
print(len(X_train))
print(y_test)

# Invert the word->index vocabulary so the integer-encoded reviews can be
# re-assembled as raw text for RMDL's text pipeline.
word_index = imdb.get_word_index()
index_word = {v: k for k, v in word_index.items()}

# index_word.get(w) may be None for indices outside the mapping; default to ''
# so ' '.join does not raise TypeError on a None element.
X_train = [txt.text_cleaner(' '.join(index_word.get(w, '') for w in x)) for x in X_train]
X_test = [txt.text_cleaner(' '.join(index_word.get(w, '') for w in x)) for x in X_test]

# One conversion is enough — the original converted each split to an array twice.
X_train = np.array(X_train).ravel()
print(X_train.shape)
X_test = np.array(X_test).ravel()
Link of dataset: |Data|
Web of Science Dataset
WOS-11967
This dataset contains 11,967 documents with 35 categories which
include 7 parent categories.
Web of Science Dataset
WOS-46985
This dataset contains 46,985 documents with 134 categories
which include 7 parent categories.
Web of Science Dataset
WOS-5736
This dataset contains 5,736 documents with 11 categories which
include 3 parent categories.
from RMDL import text_feature_extraction as txt
from sklearn.model_selection import train_test_split
from RMDL.Download import Download_WOS as WOS
import os
import numpy as np
from RMDL import RMDL_Text as RMDL

# Download the Web of Science corpus (if not cached) and locate the
# WOS-11967 split. `os` was used but never imported in the original snippet.
path_WOS = WOS.download_and_extract()
fname = os.path.join(path_WOS, "WebOfScience/WOS11967/X.txt")
fnamek = os.path.join(path_WOS, "WebOfScience/WOS11967/Y.txt")

# Documents: one per line, cleaned for the RMDL text pipeline.
with open(fname, encoding="utf-8") as f:
    content = f.readlines()
content = [txt.text_cleaner(x) for x in content]

# Labels: one integer class id per line.
with open(fnamek) as fk:
    contentk = fk.readlines()
contentk = [x.strip() for x in contentk]

# np.matrix is deprecated; build the same (n_samples, 1) column vector
# with a plain ndarray instead of matrix + transpose.
Label = np.array(contentk, dtype=int).reshape(-1, 1)

# Seed before the randomized step (the split itself is also pinned by
# random_state, so results are reproducible either way).
np.random.seed(7)
print(Label.shape)
X_train, X_test, y_train, y_test = train_test_split(
    content, Label, test_size=0.2, random_state=4
)
This dataset contains 21,578 documents with 90 categories.
import sys
import os
import nltk
nltk.download("reuters")
from nltk.corpus import reuters
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from RMDL import RMDL_Text as RMDL

# Reuters-21578 encodes its train/test split in the document ids
# ("train/…" vs "test/…"); partition the corpus on that prefix.
documents = reuters.fileids()
train_docs_id = [doc for doc in documents if doc.startswith("train")]
test_docs_id = [doc for doc in documents if doc.startswith("test")]

# Raw document text for each split.
X_train = [reuters.raw(doc_id) for doc_id in train_docs_id]
X_test = [reuters.raw(doc_id) for doc_id in test_docs_id]

# Binarize the per-document category lists (multi-label indicator matrix),
# then collapse each row to a single class id via argmax.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])
y_test = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs_id])
y_train = np.argmax(y_train, axis=1)
y_test = np.argmax(y_test, axis=1)
==========
@ARTICLE{Kowsari2018Text_Classification,
  title={Text Classification Algorithms: A Survey},
  author={Kowsari, Kamran and Jafari Meimandi, Kiana and Heidarysafa, Mojtaba and Mendu, Sanjana and Barnes, Laura E. and Brown, Donald E.},
  journal={Information},
  year={2019},
  volume={10},
  number={4},
  article-number={150},
  url={http://www.mdpi.com/2078-2489/10/4/150},
  issn={2078-2489},
  publisher={Multidisciplinary Digital Publishing Institute}
}