In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [2]:
import nltk
# nltk.download ("stopwords")
# nltk.download ("punkt")

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

sw = stopwords.words("english") + list(punctuation) # unimportant features
# sw

In [4]:
import os

# Extract features and targets on text then combine into a dataframe

In [5]:
dirs = "Q3"
datas = []
for filename in os.listdir(dirs):
    if filename.endswith('.txt'):
        name = os.path.join(dirs, filename)
        with open("Q3/arxiv_annotate10_7_1.txt", 'r') as file:
            data = file.read()
        lines = data.splitlines()
        for line in lines:
            data = {}
            if '###' in line:
                continue
            # splitting label and the feature
            splitted = line.split('\t')
            data['Teks'] = splitted[1]
            data['label'] = splitted[0]
            datas.append(data)

In [6]:
df = pd.DataFrame(datas)
df.head()

Unnamed: 0,Teks,label
0,The Minimum Description Length principle for o...,MISC
1,"If the underlying model class is discrete, the...",MISC
2,"For MDL, in general one can only have loss bou...",MISC
3,We show that this is even the case if the mode...,AIMX
4,We derive a new upper bound on the prediction ...,OWNX


In [7]:
df.label.value_counts() # check the number of target categories

MISC    4500
OWNX    1260
AIMX     180
CONT      90
Name: label, dtype: int64

# Dataset Splitting

In [8]:
X = df.Teks # feature
y = df.label # target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4824,), (1206,), (4824,), (1206,))

# Training

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp

In [10]:
pipeline = Pipeline([
    ('prep', TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw)), # encode unlabeled sentence to a matrix
    # use word_tokenize to tokenize sentence using nltk
    ('algo', LogisticRegression(solver='lbfgs', n_jobs=-1, random_state=42))
])


model = RandomizedSearchCV(pipeline, rsp.logreg_params, cv=3, n_iter=50, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.2min finished
  'stop_words.' % sorted(inconsistent))


{'algo__C': 0.17670169402947963, 'algo__fit_intercept': True}
1.0 1.0 1.0


# Save model

In [11]:
from jcopml.utils import save_model, load_model

In [12]:
save_model(model, "paper_sentences_classifier.pkl")

Model is pickled as model/paper_sentences_classifier.pkl


# Sanity check (test and predict with the model)

In [13]:
model = load_model("model/paper_sentences_classifier.pkl")

In [14]:
# DOI: 10.1109/MLBDBI51377.2020.00037
Text = "Diabetes is a common disease and its early symptoms are not very noticeable, so an efficient method of prediction will help patients make a self-diagnosis. However, the conventional method to identify diabetes is to make a blood glucose test by doctors and the medical resource is limited. Therefore, most patients cannot get the diagnosis immediately. Since the early symptoms of diabetes are not obvious and the relationship between symptoms and diabetes is complex, the self-diagnosis results based on patients' own experience are not accurate. The process of Machine Learning is to train a computational algorithm for prediction based on a big dataset. It is popular for its efficiency and accuracy. Also, it has the advantage of dealing with tons of data, so we can make diagnoses for plenty of patients in a short time and the result will be more accurate. In this study, we used six classical machine learning models, including logistic regression, support vector machine, decision tree, random forest, boosting and neural network, to make a prediction model for diabetes diagnosis. Our data was from UCI Machine Learning Repository, which was collected by direct questionnaires from the patients of the Sylhet Diabetes Hospital in Sylhet, Bangladesh and approved by a doctor. We conduct parameter tuning on each model to tradeoff between the accuracy and complexity. The testing error shows that random forest, boosting and neural network had better performances than logistic regression, support vector machine and decision tree. The accuracy of neural network of the test dataset achieves 96 percent, which is the best model among these models for predicting diabetes."
# DOI: 10.1109/CVIDL51233.2020.00-85
Text2 = "With the rapid development of Chinnes highway transportation industry, the problem of road traffic safety has become increasingly prominent. Highway passenger transport accidents are generally fatal and fatal accidents. Traffic accidents not only cause enormous economic losses to transport enterprises, but also have a very bad social impact on local highway transport management departments, which has even become a new social instability factor. This is mainly because there are many problems in autopilot technology, such as low recognition accuracy, poor real-time performance, weak anti-interference ability and so on. However, embedded technology and machine learning can solve these problems well, so autopilot technology will become the mainstream in the future. Firstly, this paper analyses the importance of autopilot technology. Then this paper analyses the machine learning target recognition, vehicle automatic driving system model and vehicle automatic driving system flow. Finally, this paper designs the function of autopilot system."

In [15]:
lines = Text.split('.')
model.predict(lines)

array(['MISC', 'MISC', 'MISC', 'OWNX', 'MISC', 'MISC', 'MISC', 'MISC',
       'MISC', 'MISC', 'OWNX', 'MISC', 'MISC'], dtype=object)

In [16]:
lines = Text.split('.')
for line in lines[:-1]:
    print(f'[{model.predict([line])[0]}, {max(model.predict_proba([line])[0])}]  {line}')
    print('-----------------------------------------------------------------------------------------------------')

[MISC, 0.8492262763052735]  Diabetes is a common disease and its early symptoms are not very noticeable, so an efficient method of prediction will help patients make a self-diagnosis
-----------------------------------------------------------------------------------------------------
[MISC, 0.7224034373201657]   However, the conventional method to identify diabetes is to make a blood glucose test by doctors and the medical resource is limited
-----------------------------------------------------------------------------------------------------
[MISC, 0.901659883027018]   Therefore, most patients cannot get the diagnosis immediately
-----------------------------------------------------------------------------------------------------
[OWNX, 0.4978701527672024]   Since the early symptoms of diabetes are not obvious and the relationship between symptoms and diabetes is complex, the self-diagnosis results based on patients' own experience are not accurate
------------------------------------

In [17]:
lines = Text2.split('.')
for line in lines[:-1]:
    print(f'[{model.predict([line])[0]}, {max(model.predict_proba([line])[0])}]  {line}')
    print('-----------------------------------------------------------------------------------------------------')

[MISC, 0.8535853390992194]  With the rapid development of Chinnes highway transportation industry, the problem of road traffic safety has become increasingly prominent
-----------------------------------------------------------------------------------------------------
[MISC, 0.8535853390992194]   Highway passenger transport accidents are generally fatal and fatal accidents
-----------------------------------------------------------------------------------------------------
[MISC, 0.8071723985039047]   Traffic accidents not only cause enormous economic losses to transport enterprises, but also have a very bad social impact on local highway transport management departments, which has even become a new social instability factor
-----------------------------------------------------------------------------------------------------
[MISC, 0.9187601577347553]   This is mainly because there are many problems in autopilot technology, such as low recognition accuracy, poor real-time performance,