jel: https://www.aeaweb.org/econlit/jelCodes.php?view=jel

ref (general):
- https://scikit-learn.org/stable/modules/multiclass.html#multilabel-classification
- https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/
- https://towardsdatascience.com/journey-to-the-center-of-multi-label-classification-384c40229bff
- https://keras.io/examples/nlp/multi_label_classification/

ref (few-shot):
- https://maelfabien.github.io/machinelearning/NLP_5/
- https://few-shot-text-classification.fastforwardlabs.com/
- https://research.aimultiple.com/few-shot-learning/

**Note**:
- use paper database to do multi-label classification does not work as I thought
  - the problem is that too many lables and for some lables there are only very limited number of papers, which creating large imbalanced data problem
  - this problem is not solved by using sample adjusting like SMOTE, and only partly solved by using class_weight = "balanced": not very good prediction: in test sample, some true label is missing, I feel that > 50% correctness finding in top20-30
  
**TO-DO:**
  - Add more data 
  - Try adding penality in L2 logistic
  - Try SVM
  - Use the information of jel to do alternative match 
  - Combine two methods to increase prediction power

## Data Prepare

In [1]:
df = pd.read_csv("/Users/alalalalaki/GitHub/Econ-Paper-Search/Data/papers.csv")
df.shape

(159068, 9)

In [2]:
## borrow from Econ-Paper-Search
# drop book reviews (not perfect)
masks = [~df.title.str.contains(i, case=False, regex=False) for i in ["pp.", " p."]]  # "pages," " pp "
mask = np.vstack(masks).all(axis=0)
df = df.loc[mask]
# drop some duplicates due to weird strings in authors and abstract
df = df[~df.duplicated(['title', 'url']) | df.url.isna()]

In [3]:
mask = ~df.jel.isna()

In [4]:
mask2 = df.year > 2000

In [5]:
mask3 = ~df.abstract.isna()

In [6]:
df = df[mask & mask2 & mask3]
df.shape

(44880, 9)

In [7]:
jel_dummy_matrix = df.jel.str.get_dummies(sep="&")

# use only 1digit
# jel_dummy_matrix = (df.jel.str.replace("(?<=[A-Z]\d)\d","", regex=True)
#                           .str.get_dummies(sep="&"))

In [None]:
# how many times a jel code is used in the dataset
# jel_dummy_matrix.sum(axis=0).plot.hist(bins=1000)
jel_dummy_matrix.sum(axis=0).value_counts().sort_index()

# how many jel codes in each paper
#df.jel.str.count("&").plot.hist(bins=25)

In [11]:
# remove not 2-digit jel
jel_dummy_matrix = jel_dummy_matrix.loc[:,~(jel_dummy_matrix.columns.str.len()<3)]

In [15]:
# remove label with too less entry
label_idx_minor = jel_dummy_matrix.sum(axis=0) <= 1 
label_idx_minor.sum()
jel_dummy_matrix = jel_dummy_matrix.loc[:,~label_idx_minor]

27

In [16]:
jel_labels = jel_dummy_matrix.columns.values

In [17]:
# further remove papers with no jel now
mask = jel_dummy_matrix.sum(axis=1) == 0
jel_dummy_matrix = jel_dummy_matrix.loc[~mask,:]
df = df.loc[~mask,:]

In [18]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


#stop_words = set(stopwords.words('english'))
#stop_words.update(['zero','one','two','three','four','five','six','seven',
# 'eight','nine','ten','may','also','across','among','beside','however','yet','within'])

stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

abstract = df.abstract.apply(stemming)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [20]:
tfidf = TfidfVectorizer(stop_words='english',min_df=2,
                        # strip_accents='unicode',
                        # ngram_range=(1,2), norm='l2',
                       )
token_matrix = tfidf.fit_transform(abstract)
token_matrix

<42052x26278 sparse matrix of type '<class 'numpy.float64'>'
	with 2382098 stored elements in Compressed Sparse Row format>

In [50]:
tokens = tfidf.get_feature_names_out()

In [384]:
from sklearn.model_selection import train_test_split

y_train, y_test, X_train, X_test = train_test_split(jel_dummy_matrix, token_matrix, 
                               random_state=46,
                               test_size=0.005, shuffle=True)

## Try Supervised Algorithms

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [None]:
def check_test_random(classifier, proba=False):
    rand_n = np.random.choice(X_test.shape[0])

    X_test_rand = X_test[rand_n,:]

    y_test_rand = y_test.iloc[rand_n,:] 
    print("Real JELs: ", y_test_rand[y_test_rand > 0])
    
    if not proba:
        predict_rand = classifier.predict(X_test_rand) 
        print("Predict JELs: ", predict_rand[predict_rand > 0])
    else:
        predict_proba_rand = classifier.predict_proba(X_test_rand) 
        print("Predict JELs: ", predict_rand[predict_rand > 0])
    

In [360]:
def custom_metric(classifer, X_test, y_test, top_n=10):
    find_rates = []
    for i in tqdm(range(len(y_test))):
        X_test_ = X_test[i,:]
        y_test_ = y_test.iloc[i,:]
        y_test_jel = y_test_[y_test_ > 0].index.values

        predict_proba_ = classifier.predict_proba(X_test_) 
        predict_proba_ = np.array([a[0][1] for a in predict_proba_])
        jel_rank = np.argsort(predict_proba_)[::-1]
        if top_n <=1:
            top_n = (predict_proba_ > top_n).sum()
        y_predict_jel = jel_labels[jel_rank][:top_n]
        y_predict_jel_proba = predict_proba_[jel_rank][:top_n]

        base = len(y_test_jel)
        correct_find = len(set(y_predict_jel) & set(y_test_jel))
        find_rate = correct_find/base
        find_rates.append(find_rate)
    avg_find_rate = np.array(find_rates).mean()
    return avg_find_rate

### ~~OneVsRest~~

-  An intuitive approach to solving multi-label problem is to decompose it into multiple independent binary classification problems (one per category).
- In an “one-to-rest” strategy, one could build multiple independent classifiers and, for an unseen instance, choose the class for which the confidence is maximized.
- The main assumption here is that the labels are mutually exclusive. You do not consider any underlying correlation between the classes in this method. For instance, it is more like asking simple questions, say, “is the comment toxic or not”, “is the comment threatening or not?”, etc. Also there might be an extensive case of overfitting here, since most of the comments are unlabeled, i,e., most of the comments are clean comments.
- @not work, simply predict all 0

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [None]:
classifer = OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)

In [None]:
classifers = []
# predictions = []
for jel in tqdm(jel_dummy_matrix.columns):    
    # Training logistic regression model on train data
    y_train_ = y_train[jel]
    classifer.fit(X_train, y_train_,)
    classifers.append(LogReg_pipeline)
    # ~~calculating test accuracy~~ this makes no sense for imbalanced data
    # prediction = LogReg_pipeline.predict(X_test)
    # predictions.append(prediction)
    # print('Test accuracy is {}'.format(accuracy_score(y_test[jel], prediction)))

In [None]:
rand_n = np.random.choice(X_test.shape[0])

X_test_rand = X_test[rand_n,:]

y_test_rand = y_test.iloc[rand_n,:] 
y_test_rand[y_test_rand > 0]

In [None]:
preds = [c.predict(X_test_rand) for c in classifers]

In [None]:
np.sum(preds)

### ~~Binary Relevance~~

- In this case an ensemble of single-label binary classifiers is trained, one for each class. Each classifier predicts either the membership or the non-membership of one class. The union of all classes that were predicted is taken as the multi-label output. This approach is popular because it is easy to implement, however it also ignores the possible correlations between class labels.
- In other words, if there’s q labels, the binary relevance method create q new data sets from the images, one for each label and train single-label classifiers on each new data set. One classifier may answer yes/no to the question “does it contain trees?”, thus the “binary” in “binary relevance”. This is a simple approach but does not work well when there’s dependencies between the labels.
- OneVsRest & Binary Relevance seem very much alike. If multiple classifiers in OneVsRest answer “yes” then you are back to the binary relevance scenario.
- @Never ends

In [None]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

In [None]:
classifier = BinaryRelevance(GaussianNB())

In [None]:
classifier.fit(X_train, y_train)

In [None]:
predictions = classifier.predict(X_test)
print("Accuracy = ",accuracy_score(y_test,predictions))

### RandomForest

#### MultiOutputClassifier

- Multilabel classification support can be added to any classifier with MultiOutputClassifier.  This strategy consists of fitting one classifier per target. This allows multiple target variable classifications. 
- The purpose of this class is to extend estimators to be able to estimate a series of target functions (f1,f2,f3…,fn) that are trained on a single X predictor matrix to predict a series of responses (y1,y2,y3…,yn).
- @again can only predict 0 

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
forest = RandomForestClassifier(random_state=46)
classifier = MultiOutputClassifier(forest, n_jobs=-1)

In [None]:
%%time
classifier.fit(X_train, y_train)

In [None]:
predictions = classifier.predict(X_test)
print("Accuracy = ",accuracy_score(y_test,predictions))

#### BinaryRelevance

In [None]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier

In [None]:
classifier = BinaryRelevance(
    classifier = RandomForestClassifier(),
    require_dense = [False, True],
)

In [None]:
%%time
classifier.fit(X_train, y_train)

In [None]:
check_test_random(classifier)

### Classifier Chains

- A chain of binary classifiers C0, C1, . . . , Cn is constructed, where a classifier Ci uses the predictions of all the classifier Cj , where j < i. This way the method, also called classifier chains (CC), can take into account label correlations.
- The total number of classifiers needed for this approach is equal to the number of classes, but the training of the classifiers is more involved.

In [None]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression

In [None]:
classifier = ClassifierChain(LogisticRegression())

In [None]:
%%time
classifier.fit(X_train, y_train)

### ~~Label Powerset~~

- This approach does take possible correlations between class labels into account. It considers each member of the power set of labels in the training set as a single label.
- This method needs worst case (2^|C|) classifiers, and has a high computational complexity.
- However *when the number of classes increases the number of distinct label combinations can grow exponentially. This easily leads to combinatorial explosion and thus computational infeasibility.* Furthermore, some label combinations will have very few positive examples.

In [None]:
from skmultilearn.problem_transform import LabelPowerset

In [None]:
classifier = LabelPowerset(LogisticRegression())
classifier.fit(X_train, y_train)

In [None]:
predictions = classifier.predict(X_test)
print("Accuracy = ",accuracy_score(y_test,predictions))

### ~~Adapted Algorithm~~

- Algorithm adaptation methods for multi-label classification concentrate on adapting single-label classification algorithms to the multi-label case usually by changes in cost/decision functions.
- Here we use a multi-label lazy learning approach named ML-KNN which is derived from the traditional K-nearest neighbor (KNN) algorithm.
- The skmultilearn.adapt module implements algorithm adaptation approaches to multi-label classification, including but not limited to ML-KNN.
- *Both ML-KNN and label-powerset take considerable amount of time* when run on this dataset, so experimentation was done on a random sample of the train data.

In [None]:
from skmultilearn.adapt import MLkNN

# Note that this classifier can throw up errors when handling sparse matrices.

### Solve Imbalanced Data

#### MLSMOTE

https://medium.com/thecyphy/handling-data-imbalance-in-multi-label-classification-mlsmote-531155416b87

- @code only works for pandas df, and it seems does not fit well to very large amount of labels

In [None]:
from sklearn.neighbors import NearestNeighbors
import random

In [None]:
def get_tail_label(df):
    """
    Give tail label colums of the given target dataframe
    
    args
    df: pandas.DataFrame, target label df whose tail label has to identified
    
    return
    tail_label: list, a list containing column name of all the tail label
    """
    columns = df.columns
    n = len(columns)
    irpl = np.zeros(n)
    for column in range(n):
        irpl[column] = df[columns[column]].value_counts()[1]
    irpl = max(irpl)/irpl
    mir = np.average(irpl)
    tail_label = []
    for i in range(n):
        if irpl[i] > mir:
            tail_label.append(columns[i])
    return tail_label

def get_index(df):
  """
  give the index of all tail_label rows
  args
  df: pandas.DataFrame, target label df from which index for tail label has to identified
    
  return
  index: list, a list containing index number of all the tail label
  """
  tail_labels = get_tail_label(df)
  index = set()
  for tail_label in tail_labels:
    sub_index = set(df[df[tail_label]==1].index)
    index = index.union(sub_index)
  return list(index)

def get_minority_instace(X, y):
    """
    Give minority dataframe containing all the tail labels
    
    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe
    
    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    """
    index = get_index(y)
    X_sub = X[X.index.isin(index)].reset_index(drop = True)
    y_sub = y[y.index.isin(index)].reset_index(drop = True)
    return X_sub, y_sub

def nearest_neighbour(X):
    """
    Give index of 5 nearest neighbor of all the instance
    
    args
    X: np.array, array whose nearest neighbor has to find
    
    return
    indices: list of list, index of 5 NN of each element in X
    """
    nbs=NearestNeighbors(n_neighbors=5,metric='euclidean',algorithm='kd_tree').fit(X)
    euclidean,indices= nbs.kneighbors(X)
    return indices

def MLSMOTE(X,y, n_sample):
    """
    Give the augmented data using MLSMOTE algorithm
    
    args
    X: pandas.DataFrame, input vector DataFrame
    y: pandas.DataFrame, feature vector dataframe
    n_sample: int, number of newly generated sample
    
    return
    new_X: pandas.DataFrame, augmented feature vector data
    target: pandas.DataFrame, augmented target vector data
    """
    indices2 = nearest_neighbour(X)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0,n-1)
        neighbour = random.choice(indices2[reference,1:])
        all_point = indices2[reference]
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis = 0, skipna = True)
        target[i] = np.array([1 if val>2 else 0 for val in ser])
        ratio = random.random()
        gap = X.loc[reference,:] - X.loc[neighbour,:]
        new_X[i] = np.array(X.loc[reference,:] + ratio * gap)
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    new_X = pd.concat([X, new_X], axis=0)
    target = pd.concat([y, target], axis=0)
    return new_X, target

In [None]:
index = get_index(y_train)

In [None]:
X_sub, y_sub = get_minority_instace(X_train, y_train)   #Getting minority instance of that datframe

In [None]:
X_res,y_res =MLSMOTE(X_sub, y_sub, 100)     #Applying MLSMOTE to augment the dataframe

#### SMOTE

- @somehow doesnot help at all, I guess this's about overfitting?

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
oversample = SMOTE(k_neighbors=2, random_state=46)
classifer = LogisticRegression(solver='sag', max_iter=500)

In [None]:
classifers = []
# predictions = []
for jel in tqdm(jel_dummy_matrix.columns):    
    # Training logistic regression model on train data
    y_train_ = y_train[jel]
    X_train_, y_train_ = oversample.fit_resample(X_train, y_train_)
    
    t = classifer.fit(X_train_, y_train_,)
    classifers.append(classifer)
    # ~~calculating test accuracy~~ this makes no sense for imbalanced data
    # prediction = LogReg_pipeline.predict(X_test)
    # predictions.append(prediction)
    # print('Test accuracy is {}'.format(accuracy_score(y_test[jel], prediction)))

In [None]:
rand_n = np.random.choice(X_test.shape[0])
X_test_rand = X_test[rand_n,:]
y_test_rand = y_test.iloc[rand_n,:] 
print("Real JELs: ", y_test_rand[y_test_rand > 0])

predict_rand = np.array([c.predict(X_test_rand) for c in classifers]).flatten() 
predict_rand = np.array([c.predict_proba(X_test_rand)[:,1] for c in classifers]).flatten() 
print("Predict JELs: ", predict_rand[predict_rand > 0])

#### class_weight

- @Still not work: predict many 1 and not accurate at all
- @This problem even remains when only checking 1digit (not full 2digit), now it's quite accurate that you can find the true jel code in to top 20 predictions, but ...
- @Using penality="l1" many not-converge and thus not high prediction

In [22]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [23]:
classifiers = {}

In [395]:
lr = LogisticRegression(solver='saga', max_iter=500, C=.1,
                        class_weight="balanced", penalty="l1", # "elasticnet"
                       )
classifier = MultiOutputClassifier(lr, n_jobs=-1)
classifiers["lr_l1_C1"] = classifier

In [386]:
svc = SVC(class_weight='balanced', # probability=True, 
          C=1, kernel='rbf' # 'sigmoid', 'linear', 'poly' 
         )
classifier = MultiOutputClassifier(svc, n_jobs=-1)

In [None]:
%%time
classifier.fit(X_train, y_train)

In [385]:
custom_metric(classifier, X_test, y_test, top_n=10)
custom_metric(classifier, X_test, y_test, top_n=20)
custom_metric(classifier, X_test, y_test, top_n=25)

custom_metric(classifier, X_test, y_test, top_n=.5)
custom_metric(classifier, X_test, y_test, top_n=.3)

100%|█████████████████████████████████████████| 211/211 [00:14<00:00, 14.63it/s]


0.542349356804333

100%|█████████████████████████████████████████| 211/211 [00:16<00:00, 13.17it/s]


0.7126269465132025

100%|█████████████████████████████████████████| 211/211 [00:14<00:00, 14.70it/s]


0.7381855111712932

100%|█████████████████████████████████████████| 211/211 [00:14<00:00, 14.37it/s]


0.626325885804559

100%|█████████████████████████████████████████| 211/211 [00:19<00:00, 11.10it/s]


0.6557887610020311

In [None]:
rand_n = np.random.choice(X_test.shape[0])
X_test_rand = X_test[rand_n,:]
y_test_rand = y_test.iloc[rand_n,:] 
print("Real JELs: ", y_test_rand[y_test_rand > 0])

In [None]:
predict_proba_rand = classifier.predict_proba(X_test_rand) 
predict_proba_rand = np.array([a[0][1] for a in predict_proba_rand])

In [None]:
jel_rank = np.argsort(predict_proba_rand)[::-1]

In [374]:
top_n = (predict_proba_rand > 0.5).sum()
jel_labels[jel_rank][:top_n]
predict_proba_rand[jel_rank][:top_n]

array(['O25', 'D25', 'E21', 'B51', 'E22', 'F40', 'Q22', 'E25', 'F43',
       'E01', 'B41', 'D14', 'H30', 'Q51', 'E24'], dtype=object)

array([1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.99999991,
       0.99999666, 0.99999572, 0.99614458, 0.96969067, 0.81578492])

In [375]:
top_n = 20
jel_labels[jel_rank][:top_n]
predict_proba_rand[jel_rank][:top_n]

array(['O25', 'D25', 'E21', 'B51', 'E22', 'F40', 'Q22', 'E25', 'F43',
       'E01', 'B41', 'D14', 'H30', 'Q51', 'E24', 'O18', 'O41', 'J24',
       'D24', 'H23'], dtype=object)

array([1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.99999991,
       0.99999666, 0.99999572, 0.99614458, 0.96969067, 0.81578492,
       0.38942712, 0.27405326, 0.17228893, 0.04896066, 0.00566725])

### ???

https://keras.io/examples/nlp/multi_label_classification/

In [None]:
from tensorflow.keras import layers
from tensorflow import keras
# import tensorflow as tf

In [None]:
def make_model():
    shallow_mlp_model = keras.Sequential(
        [
            layers.Dense(512, activation="relu"),
            layers.Dense(256, activation="relu"),
            layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
        ]  # More on why "sigmoid" has been used here in a moment.
    )
    return shallow_mlp_model

In [None]:
epochs = 20

shallow_mlp_model = make_model()
shallow_mlp_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["categorical_accuracy"]
)

history = shallow_mlp_model.fit(
    train_dataset, validation_data=validation_dataset, epochs=epochs
)


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(abstract)
sequences = tokenizer.texts_to_sequences(abstract) # transforms the words in numbers
X = pad_sequences(sequences, maxlen=200) # ensures all the vectors have the same length

## Try Unsupervised Algorithms

- @ not extremely bad, but far away from useful

In [44]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [62]:
from joblib import Parallel, delayed

def tokenize_w(s):
    words = word_tokenize(s)
    words = [w for w in words if w in tokens]
    return words
    
def tokenize_s(p):
    sentences = sent_tokenize(p) 
    sentences = [tokenize_w(s) for s in sentences]
    return sentences
    
corpus_ = Parallel(n_jobs=-1)(delayed(tokenize_s)(abs.lower()) for abs in tqdm(df.abstract))
corpus = [s for p in corpus_ for s in p]

100%|█████████████████████████████████████| 42052/42052 [14:04<00:00, 49.82it/s]


In [60]:
from gensim.models import Word2Vec

model = Word2Vec(
    sentences=corpus,
    vector_size=300, 
    window=5,
    min_count=2,
    workers=7,
    sg=0,  # 1 for skip-gram; otherwise CBOW.
    hs=0,  # 1 for hierarchical softmax used for model training; 0 for non-zero, negative sampling
    negative=5,  # specifies how many “noise words” drawn (usually between 5-20) in negative sampling, default 5
    ns_exponent=0.75,  # exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion to the frequencies, 0.0 samples all words equally, popular default value of 0.75
    cbow_mean=1,  # If 0, use the sum of the context word vectors. If 1, use the mean
)

wv = model.wv #.save(f"../Output/word2vec_v{vector_size}_w{window}_occ{i}.wordvectors")

In [155]:
corpus_p = [np.concatenate(p) for p in corpus_]

In [169]:
corpus_p_ = []
for p in tqdm(corpus_p):
    p = [w for w in p if w in wv.index_to_key]
    corpus_p_.append(p)

100%|████████████████████████████████████| 42052/42052 [01:18<00:00, 534.52it/s]


In [311]:
latent_p = [np.mean(wv[p], axis=0) for p in corpus_p_]

In [83]:
import requests
from xml.etree import ElementTree

response = requests.get("https://www.aeaweb.org/econlit/classificationTree.xml")
tree = ElementTree.fromstring(response.content)

In [98]:
from collections import defaultdict

def etree_to_dict(t):
    d = {t.tag: {} if t.attrib else None}
    children = list(t)
    if children:
        dd = defaultdict(list)
        for dc in map(etree_to_dict, children):
            for k, v in dc.items():
                dd[k].append(v)
        d = {t.tag: {k: v[0] if len(v) == 1 else v
                     for k, v in dd.items()}}
    if t.attrib:
        d[t.tag].update(('@' + k, v)
                        for k, v in t.attrib.items())
    if t.text:
        text = t.text.strip()
        if children or t.attrib:
            if text:
                d[t.tag]['#text'] = text
        else:
            d[t.tag] = text
    return d

In [99]:
d = etree_to_dict(tree)

In [142]:
d3 = [[d2['classification'],d2["description"]] for d1 in d['data']['classification'] for d2 in d1['classification']] 

In [145]:
d3 = [[d_["code"],d2des+" @ "+d_["description"]] for d__, d2des in d3 for d_ in d__ if isinstance(d_, dict)]

In [176]:
jel_official = pd.DataFrame(d3, columns=["jel","des"])

In [182]:
des = jel_official.des.apply(stemming)

In [183]:
corpus_o = [tokenize_w(s.lower()) for s in des]

In [185]:
corpus_o_ = []
for p in tqdm(corpus_o):
    p = [w for w in p if w in wv.index_to_key]
    corpus_o_.append(p)

100%|████████████████████████████████████████| 839/839 [00:01<00:00, 545.33it/s]


In [189]:
jel_official["corpus"] = corpus_o_

In [208]:
mask = jel_official.corpus.apply(lambda x: len(x)==0)

In [211]:
jel_official = jel_official[~mask].reset_index(drop=True)

In [310]:
latent_o = [np.mean(wv[p], axis=0) for p in jel_official.corpus]

In [306]:
from scipy import spatial

In [351]:
rand_n = np.random.choice(df.shape[0])

In [352]:
similarities = [1 - spatial.distance.cosine(latent_p[rand_n], lo) for lo in latent_o]
jel_rank = np.argsort(similarities)[::-1]
jel_official.jel[jel_rank].values[:20]

array(['J11', 'C13', 'C14', 'C15', 'C53', 'M52', 'C12', 'C19', 'C02',
       'C43', 'C40', 'C11', 'C49', 'C18', 'C45', 'C46', 'C41', 'C10',
       'C26', 'C36'], dtype=object)

In [353]:
y_jel = jel_dummy_matrix.iloc[rand_n,:] 
y_jel[y_jel > 0]

I20    1
I21    1
J24    1
Name: 153726, dtype: int64