In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
from tqdm import tqdm
import gc

In [None]:
!pip install ktrain

In [None]:
# Read the arXiv metadata snapshot, which this loop treats as one JSON object
# per line, and keep only the first MAX_PAPERS records.
ARXIV_SNAPSHOT_PATH = "../input/arxiv/arxiv-metadata-oai-snapshot.json"
MAX_PAPERS = 200000

data = []
with open(ARXIV_SNAPSHOT_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        # `>=` so that exactly MAX_PAPERS records are kept, not MAX_PAPERS + 1.
        if len(data) >= MAX_PAPERS:
            break
        # Skip blank lines, which json.loads would reject.
        if not line.strip():
            continue
        data.append(json.loads(line))

In [None]:
# Collect the fields of interest from every raw record into per-column lists.
# `df` is a plain dict of lists here; a later cell turns it into a DataFrame.
df = {'id': [], 'text': [], 'categories': [], 'authors': []}
for paper in tqdm(data):
    df['id'].append(paper['id'])
    # Join title and abstract with a space so the last word of the title and
    # the first word of the abstract do not fuse into a single token.
    df['text'].append(paper['title'] + ' ' + paper['abstract'])
    df['authors'].append(paper['authors_parsed'])
    # 'categories' is a whitespace-separated string; store it as a list.
    df['categories'].append(paper['categories'].split())

In [None]:
# Drop the reference to the raw records; none of the visible cells below use
# `data` again, since the needed fields were already copied into `df`.
del data
# Trigger a garbage-collection pass right away instead of waiting for the
# collector to run on its own.
gc.collect()

In [None]:
# Turn the collected columns into a DataFrame. The `columns=` selection keeps
# only id/text/categories, so the 'authors' list is left out.
df = pd.DataFrame(df, columns=['id', 'text', 'categories'])

# Work on a random sub-sample of at most 50000 papers.
# - random_state makes the sample (and everything trained on it) reproducible.
# - min(...) avoids a ValueError when fewer than 50000 rows were loaded.
df = df.sample(n=min(50000, len(df)), random_state=42)
df.head(20)

In [None]:
from collections import Counter

# A category occurring fewer than `threshold` times in the sample is "rare".
threshold = 60

# Count how often each category label occurs across all papers.
arr_category = Counter(label for labels in df["categories"] for label in labels)

# Rare labels in first-seen order; a later cell strips them from every paper.
# NOTE: the name shadows the builtin `filter`, but it is kept because that
# later cell refers to it by this name.
filter = [label for label, count in arr_category.items() if count < threshold]

In [None]:
# Reduce each paper's list of categories to a single label: remove the rare
# categories listed in `filter`, keep the first remaining one, and drop papers
# for which nothing remains.
df = df.dropna().reset_index(drop=True)

# A set gives O(1) membership tests instead of scanning the list per label.
rare_categories = set(filter)


def _first_frequent_category(cats):
    """Return the first entry of `cats` that is not rare, or None if none is left."""
    if isinstance(cats, str):
        # Already reduced to a single label (e.g. this cell was re-run).
        return None if cats in rare_categories else cats
    return next((cat for cat in cats if cat not in rare_categories), None)


# Assign the whole column at once. Chained indexing such as
# `df["categories"][ind] = ...` raises SettingWithCopyWarning and does not
# update the frame at all when pandas Copy-on-Write is active.
df["categories"] = df["categories"].map(_first_frequent_category)

# Rows whose label became None carry no usable target; remove them.
df = df.dropna().reset_index(drop=True)
df.head()

In [None]:
# The paper id is not a model input, so remove it. errors="ignore" keeps the
# cell re-runnable: a second run no longer raises KeyError because 'id' is
# already gone.
df = df.drop(columns=["id"], errors="ignore")
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the papers for validation; the fixed seed keeps the split
# stable between runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
)

In [None]:
import ktrain
from ktrain import text

# Preprocess both splits in BERT mode, limiting each text to maxlen=250.
# The call returns the training pair, the validation pair and the
# preprocessor object that the model-building and prediction cells reuse.
(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(
    train_df=train_df,
    val_df=test_df,
    text_column='text',
    label_columns='categories',
    maxlen=250,
    preprocess_mode='bert',
)

In [None]:
# Build the 'bert' text classifier for the preprocessed training data.
model = text.text_classifier(
    name='bert',
    train_data=(X_train, y_train),
    preproc=preproc,
)

In [None]:
# Wrap the model and both data splits in a ktrain learner (batches of 16).
learner = ktrain.get_learner(
    model=model,
    train_data=(X_train, y_train),
    val_data=(X_test, y_test),
    batch_size=16,
)

In [None]:
# Run the learner's learning-rate finder for at most 2 epochs and show the
# resulting plot.
# NOTE(review): the rate used in the training cell is hard-coded rather than
# read from this result -- confirm it still matches the plot.
learner.lr_find(show_plot=True, max_epochs=2)

In [None]:
# Train for a single epoch with the learner's one-cycle schedule at a
# learning rate of 5e-5.
learner.fit_onecycle(lr = 5e-5, epochs = 1)

In [None]:
# Bundle the trained model with the preprocessor used for the training data,
# so the prediction cell can pass raw text to the predictor.
predictor = ktrain.get_predictor(learner.model, preproc)

In [None]:
# Two sample abstracts for a manual sanity check of the predictor.
# The fragments are joined by implicit string concatenation; every fragment
# except the last ends with an explicit space so that words at fragment
# boundaries are never fused together.

# Machine-learning abstract (AUTO-SKLEARN).
data1 = (
    "The success of machine learning in a broad range of applications has led to an "
    "ever-growing demand for machine learning systems that can be used off the shelf "
    "by non-experts. To be effective in practice, such systems need to automatically "
    "choose a good algorithm and feature preprocessing steps for a new dataset at hand, "
    "and also set their respective hyperparameters. Recent work has started to tackle this "
    "automated machine learning (AutoML) problem with the help of efficient Bayesian "
    "optimization methods. Building on this, we introduce a robust new AutoML system "
    "based on scikit-learn (using 15 classifiers, 14 feature preprocessing methods, and "
    "4 data preprocessing methods, giving rise to a structured hypothesis space with "
    "110 hyperparameters). This system, which we dub AUTO-SKLEARN, improves on "
    "existing AutoML methods by automatically taking into account past performance "
    "on similar datasets, and by constructing ensembles from the models evaluated "
    "during the optimization. Our system won the first phase of the ongoing ChaLearn "
    "AutoML challenge, and our comprehensive analysis on over 100 diverse datasets "
    "shows that it substantially outperforms the previous state of the art in AutoML. We "
    "also demonstrate the performance gains due to each of our contributions and derive "
    "insights into the effectiveness of the individual components of AUTO-SKLEARN"
)

# Biology abstract (yeast colony simulations).
data2 = (
    "Yeasts exist in communities that expand over space and time to form complex structures and patterns. "
    "We developed a lattice-based framework to perform spatial-temporal Monte Carlo simulations of budding yeast "
    "colonies exposed to different nutrient and magnetic field conditions. "
    "The budding patterns of haploid and diploid yeast cells were incorporated into the framework, as well as the "
    "filamentous growth that occurs in yeast colonies under nutrient limiting conditions. Simulation of the framework "
    "predicted that magnetic fields decrease colony growth rate, solidity, and roundness. Magnetic field simulations further "
    "predicted that colony elongation and boundary fluctuations increase in a nutrient- and ploidy-dependent manner. "
    "These in-silico predictions are an important step towards understanding the effects of the physico-chemical environment on "
    "microbial colonies and for informing bioelectromagnetic experiments on yeast colony biofilms and fungal pathogens."
)

In [None]:
# Predict the category for the second sample abstract; as the last expression
# of the cell, the result is displayed as the cell output.
predictor.predict(data2)

In [None]:
predictor.save('/kaggle/working/bert')
!ls bert/
FileLink(r'bert/tf_model.preproc')