This notebook is designed to preprocess the text collection of abstracts and the taxonomy of Data Science.

In [241]:
from libs import *
np.set_printoptions(suppress=True)

### Preprocessing papers

In [239]:
abstracts = []
abstracts_path = 'input_data/text_collections/Springer_abstracts'
MIN_SYMBOLS = 100  # the min necessary amount of characters in the abstract
for file_name in os.listdir(abstracts_path):
    with open(os.path.join(abstracts_path, file_name), 'r') as f:
        f_abstract = ' '.join(f.readlines()).replace('Abstract ', '')
        
        if len(f_abstract) > MIN_SYMBOLS:
            abstracts.append(f_abstract)
papers_df = pd.DataFrame(abstracts, columns=['abstract']) 

print(f'N abstracts :: {len(os.listdir(abstracts_path))}')
print(f'N abstracts with more then {MIN_SYMBOLS} symbols :: {len(abstracts)}')

N abstracts :: 17685
N abstracts with more then 100 symbols :: 17668


In [240]:
# saving proccesed papers
papers_df.to_csv('input_data/text_collections/papers_df.csv')

In [4]:
papers_df.head(2)

Unnamed: 0,abstract
0,"In this paper, we propose an innovative end-to..."
1,Some forms of mild cognitive impairment (MCI) ...


#### Enhanced abstracts preprocessing (new scrapped)

In [5]:
abstracts_path = 'input_data/text_collections/springer_papers_ENHANCED.json'
with open(abstracts_path, 'r') as f:
    papers_list_raw = json.load(f)
papers_list = []
for paper in papers_list_raw:
    if len(paper['abstract']) > MIN_SYMBOLS:
        papers_list.append(paper)
papers_df_enh = pd.DataFrame(papers_list)  # enh for enhanced

papers_df_enh = (
    papers_df_enh.assign(
        datetime = lambda x: pd.to_datetime(x['datetime']),
        # to ASCII
        abstract = lambda x: x['abstract']
                    .str.replace(r'[^\x00-\x7F]+', ' ', regex=True)  # to ASCII
                    .str.replace('\\\\\(.*\)', '', regex=True),  # delete LATEX
        title = lambda x: x['title'].str.replace(r'[^\x00-\x7F]+', ' ', regex=True),

        keywords = lambda x: x['keywords']
                    # to ASCII
                    .apply(lambda k_list: [re.sub(r'[^\x00-\x7F]+','', k) for k in k_list])


                    # deleting strange new_line characters
                    .apply(lambda k_list: [re.sub(r'\n','', k) for k in k_list])


                    # dealing with parenthesis exp.
                    .apply(lambda k_list: [re.sub(r'\(,\)','', k) for k in k_list])
                    .apply(lambda k_list: [re.sub(r'\(,*\s*','(', k) for k in k_list])
                    .apply(lambda k_list: [re.sub(r'^\)\s*','', k) for k in k_list])

                    .apply(lambda k_list: [re.sub(r', \)',r')', k) for k in k_list])
                        # deleting latex expressions (or other typesetting)
                    .apply(lambda k_list: [re.sub('\\\\\(.*\)', '', k) for k in k_list])
                        # deleting empty parenthesis 
                    .apply(lambda k_list: [re.sub(r'\(\s*\)','', k) for k in k_list])



                    # deleting strange digit keywords (except for 3D, 5G)
                    .apply(lambda k_list: [re.sub(r'\d[^dDGg].*','', k) for k in k_list])


                    # start
                    .apply(lambda k_list: [re.sub(r'^-','', k) for k in k_list])
                    .apply(lambda k_list: [re.sub(r'^#.*','', k) for k in k_list])
                    .apply(lambda k_list: [re.sub(r'^\s*','', k) for k in k_list])
                    .apply(lambda k_list: [re.sub(r'^\*-','', k) for k in k_list])
                    .apply(lambda k_list: [re.sub(r'^-','', k) for k in k_list])

                    # stripping
                    .apply(lambda k_list: [k.strip() for k in k_list])


                    # deleting short
                    .apply(lambda k_list: [k for k in k_list if len(k) > 3])
    
    )
)

In [6]:
# saving proccesed papers
papers_df_enh.to_csv('input_data/text_collections/papers_df_ENHANCED.csv')

In [7]:
key_words = []
for paper_key_words in papers_df_enh['keywords'].to_list():
    for key_w in paper_key_words:
        key_words.append(key_w)
                        

In [8]:
words = []
for key_w in key_words:
    for w in key_w.split():
        words.append(
            w.strip()
            .strip('-').strip()
            .strip('(').strip(')').strip()
            .lower()
        )

---

### Preprocessing DS taxonomy

In [268]:
taxonomy_raw = pd.read_csv('input_data/taxonomies/acm_ccs_taxonomy_reduced_raw.csv', header=None)

In [269]:
taxonomy_raw.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1,Theory of computation,,,,,,,,,
1,1.1.,,Theory and algorithms for application domains,,,,,,,,


In [270]:
def preprocess_taxonomy(taxonomy_raw):
    taxonomy_df = (
    taxonomy_raw
    .fillna('')
    .assign(
        label = lambda x: x.iloc[:, 1:].sum(1).str.strip()
                                              .str.replace('(', '')
                                              .str.replace(')', '')
                                              .str.replace(',', '')
                                              .str.replace('/', '')
                                              .str.replace('  ', ' ')

        
        ,  # extracting nodes labels
        new=lambda x: x[0].str.contains('\*'),  # indicating new nodes
        level = lambda x: x[0]
                            .str.replace('\s*\+\s*0', '', regex=True)
                            .str.replace('\*+', '', regex=True)  # extracting nodes indexes
                            .str.replace(r'\.$', '', regex=True)                       
        ,
        lvls = lambda x: x['level'].str.split('.'), # nodes indexes as lists
        depth = lambda x: x['lvls'].apply(len)  # extracting depth of nodes
    )
    .loc[:, ['level', 'label', 'depth', 'lvls', 'new']]  # reordering in convenient way
)
    return taxonomy_df

In [285]:
taxonomy_df = preprocess_taxonomy(taxonomy_raw)


# changing wrong idx 2.1.2.5.1 --> 2.1.2.5
taxonomy_df.loc[58, 'level'] = '2.1.2.5'
taxonomy_df.loc[58, 'depth']  = 4

# typos
idx = taxonomy_df[taxonomy_df['label']=='Supervised dimesionality reduction'].index
taxonomy_df.at[idx, 'label'] = 'Supervised dimensionality reduction'
idx = taxonomy_df[taxonomy_df['label']=='Modelling'].index
taxonomy_df.at[idx, 'label'] = 'Modeling'
idx = taxonomy_df[taxonomy_df['label']=='Rule-based netwok archirtecture'].index
taxonomy_df.at[idx, 'label'] = 'Rule-based network architecture'



# printing taxonomy description
print(f'N nodes in taxonomy :: {len(taxonomy_df)}')
for depth in range(taxonomy_df.depth.min(), taxonomy_df.depth.max() + 1 ):
    print(f'{(taxonomy_df.depth==depth).sum() : >5} on level {depth}')

N nodes in taxonomy :: 417
    5 on level 1
    9 on level 2
   39 on level 3
  196 on level 4
  157 on level 5
   11 on level 6


#### Enhanced taxonomy preprocessing (new scrapped)

In [286]:
taxonomy_raw_enh = pd.read_csv('input_data/taxonomies/acm_css_taxonomy_ENHANCED_raw.csv', header=None)
taxonomy_raw_enh.drop(axis=0, index=len(taxonomy_raw_enh)-1, inplace=True)  # dropping last dummy row
taxonomy_raw_enh.dropna(axis=0, how='all', inplace=True)  # dropping empty rows

In [287]:
taxonomy_df_enh = preprocess_taxonomy(taxonomy_raw_enh)

idx = taxonomy_df_enh[taxonomy_df_enh['level']=='s'].index
taxonomy_df_enh.at[idx, 'level'] = '1.1.5.9'

# printing taxonomy description
print(f'N nodes in taxonomy :: {len(taxonomy_df_enh)}')
for depth in range(taxonomy_df_enh.depth.min(), taxonomy_df_enh.depth.max() + 1 ):
    print(f'{(taxonomy_df_enh.depth==depth).sum() : >5} on level {depth}')

N nodes in taxonomy :: 456
    6 on level 1
    9 on level 2
   39 on level 3
  220 on level 4
  171 on level 5
   11 on level 6


### AnyTree

In [288]:
def build_AnyTree_tree(taxonomy_df):
    # building anytree object for taxonomy
    '''
    This processing works under assumption that nodes are given in DFS order
    '''

    root = anytree.Node('root', raw_name='root')
    nodes = [root]
    prev_depth = 0
    parent = root
    for _, s in taxonomy_df.iterrows():
        curr_depth = s['depth']
        if curr_depth < prev_depth:
            # lift parent
            for _ in range(prev_depth - curr_depth):
                parent = parent.parent
        elif curr_depth == prev_depth:
            # parent does not change
            pass
        elif curr_depth == prev_depth + 1:
            # parent is the previous node
            parent = nodes[-1]
        else:  # if curr_depth > prev_depth + 1
            print(s)
            raise RuntimeError('Input nodes are not in DFS order. Please sort them in DFS order.')

        n = anytree.Node(f"{s['level']} -- {s['label']}", raw_name=s['label'], parent=parent)
        nodes.append(n)
        prev_depth = curr_depth
    
    return root

In [None]:
root = build_AnyTree_tree(taxonomy_df)

leaves = [i.raw_name for i in anytree.LevelOrderIter(root, filter_=lambda x: x.is_leaf)]
print(f'Number of leaves in Taxonomy: {len((leaves))}')

taxonomy_df['is_leaf'] = taxonomy_df['label'].isin(leaves) # adding leaf indicator

taxonomy_df.to_csv('input_data/taxonomies/taxonomy_df.csv')  # saving taxonomy file

In [267]:
root_enh = build_AnyTree_tree(taxonomy_df_enh)

leaves_enh = [i.raw_name for i in anytree.LevelOrderIter(root_enh, filter_=lambda x: x.is_leaf)]
print(f'Number of leaves in Taxonomy Enhanced: {len((leaves_enh))}')

taxonomy_df_enh['is_leaf'] = taxonomy_df['label'].isin(leaves_enh) # adding leaf indicator

taxonomy_df_enh.to_csv('input_data/taxonomies/taxonomy_df_ENHANCED.csv')  # saving taxonomy file

Number of leaves in Taxonomy Enhanced: 352


In [19]:
# repeating names
unique_names=set()
duplicate_names=set()
for _, s in taxonomy_df.iterrows():
    name = s['label']
    if name in unique_names:
        duplicate_names.add(name)
    else:
        unique_names.add(name)

print(f'N of duplicate pairs :: {len(duplicate_names)}')
duplicate_mask = taxonomy_df['label'].isin(duplicate_names)  

taxonomy_df[duplicate_mask].sort_values(by='label').head(4)

N of duplicate pairs :: 17


Unnamed: 0,level,label,depth,lvls,new,is_leaf
383,5.2.3.7.3.1,2D PCA,6,"[5, 2, 3, 7, 3, 1]",True,True
308,5.1.3.2.1.1,2D PCA,6,"[5, 1, 3, 2, 1, 1]",True,True
23,1.1.1.13.5,Adversarial learning,5,"[1, 1, 1, 13, 5]",False,True
334,5.2.1.3.5,Adversarial learning,5,"[5, 2, 1, 3, 5]",False,True


In [20]:
# repeating names
unique_names=set()
duplicate_names=set()
for _, s in taxonomy_df_enh.iterrows():
    name = s['label']
    if name in unique_names:
        duplicate_names.add(name)
    else:
        unique_names.add(name)

print(f'N of duplicate pairs :: {len(duplicate_names)}')
duplicate_mask = taxonomy_df_enh['label'].isin(duplicate_names)  

taxonomy_df_enh[duplicate_mask].sort_values(by='label').head(4)

N of duplicate pairs :: 19


Unnamed: 0,level,label,depth,lvls,new,is_leaf
422,5.2.3.7.3.1,2D PCA,6,"[5, 2, 3, 7, 3, 1]",True,
340,5.1.3.2.1.1,2D PCA,6,"[5, 1, 3, 2, 1, 1]",True,True
368,5.2.1.3.5,Adversarial learning,5,"[5, 2, 1, 3, 5]",False,True
23,1.1.1.13.5,Adversarial learning,5,"[1, 1, 1, 13, 5]",False,True


---

### Visualising Taxonomy Part

In [21]:
# for requesting nodes path

def get_node_path(index, root):
    '''
    input: node index as a string 
    output: path from root to node as a string
    '''
    def get_full_name(node):
        '''
        input: node as an anytree object
        output: path from root to node as a string
        '''
        ancestors = node.ancestors
        name = '/'
        for anc in ancestors:
            name+=anc.name+'/'
        return name + node.name + '/'
    
    index = list(map(int, index.split('.')))
    index = np.array(index) - 1  # indexation in anytree starts from 0 not 1

    node = root
    for idx in index:
        node = node.children[idx]
        
    return get_full_name(node)
    

In [22]:
from anytree import Resolver
from anytree.exporter import DotExporter
# node = Resolver().get(root, '/root/1 -- Theory of computation/1.1 -- Theory and algorithms for application domains/1.1.1 -- Machine learning theory')
# node = Resolver().get(root, '/root')
index = '3.2.1.4'  # index of node clustering
node = Resolver().get(root, get_node_path(index, root))


dot = DotExporter(node)
dot.to_dotfile(f'visualization/taxonomies/sub_tax_clustering.dot')
!dot -Tpdf visualization/taxonomies/sub_tax_clustering.dot -o visualization/taxonomies/sub_tax_clustering.pdf  # creating pdf 
!rm visualization/taxonomies/sub_tax_clustering.dot  # deleting dot file from directory

---

**Writing <u>toy taxonomy </u> file for visual convenience.**

In [23]:
def save_taxonomy_txt(root, fpath):
    # print in DFS order
    f = open(fpath, 'w')
    for pre, fill, node in anytree.RenderTree(root):
        f.write(f'{pre}{node.name}\n')
        print(pre, node.name, sep='')
    f.close()

In [24]:
save_taxonomy_txt(root, 'visualization/taxonomies/taxonomy_vis_dfs.txt')

root
├── 1 -- Theory of computation
│   └── 1.1 -- Theory and algorithms for application domains
│       ├── 1.1.1 -- Machine learning theory
│       │   ├── 1.1.1.1 -- Sample complexity and generalization bounds
│       │   ├── 1.1.1.2 -- Boolean function learning
│       │   ├── 1.1.1.3 -- Unsupervised learning and clustering
│       │   ├── 1.1.1.4 -- Kernel methods
│       │   │   ├── 1.1.1.4.1 -- Support vector machines
│       │   │   ├── 1.1.1.4.2 -- Gaussian processes
│       │   │   └── 1.1.1.4.3 -- Modelling
│       │   ├── 1.1.1.5 -- Boosting
│       │   ├── 1.1.1.6 -- Bayesian analysis
│       │   ├── 1.1.1.7 -- Inductive inference
│       │   ├── 1.1.1.8 -- Online learning theory
│       │   ├── 1.1.1.9 -- Multi-agent learning
│       │   ├── 1.1.1.10 -- Models of learning
│       │   ├── 1.1.1.11 -- Query learning
│       │   ├── 1.1.1.12 -- Structured prediction
│       │   ├── 1.1.1.13 -- Reinforcement learning
│       │   │   ├── 1.1.1.13.1 -- Sequential decision makin

In [25]:
save_taxonomy_txt(root_enh, 'visualization/taxonomies/taxonomy_vis_dfs_ENHANCED.txt')

root
├── 1 -- Theory of computation
│   └── 1.1 -- Theory and algorithms for application domains
│       ├── 1.1.1 -- Machine learning theory
│       │   ├── 1.1.1.1 -- Sample complexity and generalization bounds
│       │   ├── 1.1.1.2 -- Boolean function learning
│       │   ├── 1.1.1.3 -- Unsupervised learning and clustering
│       │   ├── 1.1.1.4 -- Kernel methods
│       │   │   ├── 1.1.1.4.1 -- Support vector machines
│       │   │   ├── 1.1.1.4.2 -- Gaussian processes
│       │   │   └── 1.1.1.4.3 -- Modelling
│       │   ├── 1.1.1.5 -- Boosting
│       │   ├── 1.1.1.6 -- Bayesian analysis
│       │   ├── 1.1.1.7 -- Inductive inference
│       │   ├── 1.1.1.8 -- Online learning theory
│       │   ├── 1.1.1.9 -- Multi-agent learning
│       │   ├── 1.1.1.10 -- Models of learning
│       │   ├── 1.1.1.11 -- Query learning
│       │   ├── 1.1.1.12 -- Structured prediction
│       │   ├── 1.1.1.13 -- Reinforcement learning
│       │   │   ├── 1.1.1.13.1 -- Sequential decision makin

In [245]:
# Saving the taxonomy as a json dictionary
from anytree.exporter import JsonExporter
exporter = JsonExporter(indent=2)
with open('input_data/taxonomies/taxonomy_dict.json', 'w') as f:
    f.write(exporter.export(root))