In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Use seaborn's 'whitegrid' plot style.
sns.set_style('whitegrid')
# Download the NLTK data packages 'stopwords', 'wordnet' and 'punkt'
# (NLTK reports them as up-to-date when they are already installed).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/FarberE/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/FarberE/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/FarberE/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load the dataset of abstracts, using the first CSV column as the index.
df = pd.read_csv('../data/df_experiment', index_col=0)
df.head()

Unnamed: 0,entry_id,updated,published,title,summary,primary_category,categories,authors
0,http://arxiv.org/abs/2305.11154v1,2023.42,2023.42,non linear operator valued elliptic flows with...,differential equations on spaces of operators ...,ph,['mp'],"['jean bernard bru', 'nathan metraud']"
1,http://arxiv.org/abs/2305.11103v1,2023.42,2023.42,blockwise inversion and algorithms for inverti...,using the blockwise matrix inversion inversion...,na,"['na', 'na', 'mp']",['r thiru senthil']
2,http://arxiv.org/abs/2305.11054v1,2023.42,2023.42,ising systems measures on the sphere and zonoids,we give an interpretation of a class of discre...,ap,"['ap', 'mp', 'oc']","['andrea braides', 'antonin chambolle']"
3,http://arxiv.org/abs/2210.09458v2,2023.42,2022.83,mobility edge for levy matrices,levy matrices are symmetric random matrices wh...,pr,"['pr', 'mp']","['amol aggarwal', 'charles bordenave', 'patric..."
4,http://arxiv.org/abs/2205.08765v2,2023.42,2022.42,necessary and sufficient conditions for one di...,this paper deals with necessary and sufficient...,ca,"['ca', 'ft', 'mp']",['pavol quittner']


In [5]:
from collections import defaultdict

In [12]:
# Tokenize every abstract ('summary') and store the token list in a new column.
df['tokens'] = df['summary'].apply(nltk.word_tokenize)
# Spot-check a single row; the fixed seed keeps the displayed row the same on every re-run.
df.sample(random_state=0)

Unnamed: 0,entry_id,updated,published,title,summary,primary_category,categories,authors,tokens
902,http://arxiv.org/abs/2303.02294v1,2023.25,2023.25,reverse isoperimetric problems under curvature...,in this paper we solve several reverse isoperi...,mg,"['mg', 'dg']","['kostiantyn drach', 'kateryna tatarko']","[in, this, paper, we, solve, several, reverse,..."


In [13]:
# Preview the token lists of the first few abstracts.
df['tokens'].head()

0    [differential, equations, on, spaces, of, oper...
1    [using, the, blockwise, matrix, inversion, inv...
2    [we, give, an, interpretation, of, a, class, o...
3    [levy, matrices, are, symmetric, random, matri...
4    [this, paper, deals, with, necessary, and, suf...
Name: tokens, dtype: object

In [16]:
def stop_strip(lst, stop_words=None):
    """Return the tokens of ``lst`` that are not stop words, in their original order.

    Parameters
    ----------
    lst : iterable of str
        Tokens to filter.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stop-word list
        (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The tokens of ``lst`` that do not appear in ``stop_words``.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call so each membership test is a constant-time
    # set probe instead of a fresh list fetch plus a linear scan per token.
    stop_set = frozenset(stop_words)
    return [word for word in lst if word not in stop_set]

In [18]:
# Sanity check: token count of the first abstract after stop-word removal.
len(stop_strip(df['tokens'].iloc[0]))

92

In [19]:
# Token count of the first abstract before stop-word removal, for comparison.
len(df['tokens'].iloc[0])

165

In [20]:
# Apply stop-word removal to every token list and keep the result in a new column,
# then show the original and stripped tokens side by side.
df['strip_tokens'] = df['tokens'].apply(stop_strip)
df[['tokens', 'strip_tokens']].head()

Unnamed: 0,tokens,strip_tokens
0,"[differential, equations, on, spaces, of, oper...","[differential, equations, spaces, operators, l..."
1,"[using, the, blockwise, matrix, inversion, inv...","[using, blockwise, matrix, inversion, inversio..."
2,"[we, give, an, interpretation, of, a, class, o...","[give, interpretation, class, discrete, contin..."
3,"[levy, matrices, are, symmetric, random, matri...","[levy, matrices, symmetric, random, matrices, ..."
4,"[this, paper, deals, with, necessary, and, suf...","[paper, deals, necessary, sufficient, conditio..."


In [21]:
from collections import defaultdict

In [22]:
# Count how many times each token occurs across all stop-word-stripped abstracts.
# defaultdict(int) starts unseen tokens at 0, so no existence check is needed.
word_counter = defaultdict(int)

for text in df['strip_tokens']:
    for word in text:
        word_counter[word]+=1

In [26]:
# Vocabulary: the distinct tokens, in the order they were first counted.
keys = list(word_counter)

In [31]:
# Vocabulary size: number of distinct tokens remaining after stop-word removal.
len(keys)

16010

In [35]:
# make function for bagging tokens
def bagger(text, keys):
    """Build a bag-of-words count vector for one tokenized document.

    Parameters
    ----------
    text : iterable of str
        Tokens of the document.
    keys : sequence of str
        Vocabulary; entry ``i`` of the result counts occurrences of ``keys[i]``.
        If a term is repeated in ``keys``, its first position receives the counts.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(keys)`` holding the token counts.

    Raises
    ------
    ValueError
        If a token in ``text`` does not appear in ``keys``.
    """
    # Map each vocabulary term to its first position once, so every token is
    # resolved with a dictionary lookup rather than a linear scan of ``keys``.
    position = {}
    for idx, term in enumerate(keys):
        position.setdefault(term, idx)

    bag = np.zeros(len(keys))
    for word in text:
        try:
            bag[position[word]] += 1
        except KeyError:
            # Same exception type that list.index raises for a missing item.
            raise ValueError(f"{word!r} is not in the vocabulary") from None

    return bag

In [36]:
# Try the bagger on the first stop-word-stripped abstract.
bagger(df['strip_tokens'].iloc[0], keys)

array([3., 3., 1., ..., 0., 0., 0.])

In [37]:
# Sanity check: the counts in the bag should add up to the document's token count.
sum(bagger(df['strip_tokens'].iloc[0], keys=keys))

92.0

In [39]:
# Token count of the first stripped abstract, to compare against the bag total.
len(df['strip_tokens'].iloc[0])

92

In [40]:
# One bag-of-words count vector per abstract; `keys` is forwarded to bagger as its second argument.
df['bag_vec'] = df['strip_tokens'].apply(bagger, args=(keys,))

In [44]:
# Stack the per-document count vectors into a single 2-D array:
# one row per document, one column per vocabulary term.
big_bag = np.stack(df['bag_vec'])
big_bag

array([[3., 3., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 3., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 1., 1.]])

In [46]:
# (number of documents, vocabulary size)
big_bag.shape

(4462, 16010)

## Summary

At this point, we have created our `bag of words`: i.e., an array where each row is a document (an abstract in our case), and each column corresponds to a token that occurs in some document. We can think of this array as our new dataframe on which to apply ML algorithms.

We can compute a TF-IDF score for each document-term pair in this array:

1) TF = term frequency, i.e. the number of appearances of the term in the document divided by the length of the document (its token count after stop-word removal).
2) IDF = inverse document frequency. This is the base-10 log of the ratio (total number of docs) / (number of docs containing the term), i.e. $\mathrm{IDF}(t) = \log_{10}(N / n_t)$.

TF-IDF is the product of these two scores.

In [None]:
# NOTE(review): placeholder allocation with the same shape as big_bag. `tfidf` is
# reassigned to `tfs*idfs` further down, so this array appears to be unused — consider removing.
tfidf = np.zeros(big_bag.shape)

In [47]:
# Term frequency: each row of counts divided by that document's total token count.
doc_lengths = big_bag.sum(axis=1, keepdims=True)
# A document with no tokens left would give 0/0 (NaN plus a RuntimeWarning);
# `where` skips those rows so they stay all-zero via the pre-filled `out` array.
tfs = np.divide(big_bag, doc_lengths, out=np.zeros(big_bag.shape), where=doc_lengths > 0)
tfs

array([[0.0326087 , 0.0326087 , 0.01086957, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.11538462, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00847458, 0.        , ..., 0.00847458, 0.00847458,
        0.00847458]])

In [59]:
# Inverse document frequency per term: -log10(fraction of documents that contain the term).
idfs = -np.log10((big_bag > 0).sum(axis=0) / len(big_bag))

In [60]:
# Inspect the IDF value of each vocabulary term.
idfs

array([1.11805065, 0.92854382, 1.00706505, ..., 3.64952957, 3.64952957,
       3.64952957])

In [64]:
# TF-IDF: scale every term-frequency column by that term's IDF
# (idfs has one value per column and is broadcast across the rows of tfs).
tfidf = np.multiply(tfs, idfs)

In [65]:
# Inspect the resulting document-by-term TF-IDF matrix.
tfidf

array([[0.03645817, 0.0302786 , 0.01094636, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.11619981, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00786902, 0.        , ..., 0.03092822, 0.03092822,
        0.03092822]])

At this point, we have converted our documents (abstracts) to vectors whose entries are TF-IDF scores relative to each term in our vocabulary. We can now apply dimension reduction before clustering, for example.