# Classify Publications

This experiment classifies academic publications into research fields such as Machine Learning, Computer Vision, and Natural Language Processing. Each research field has a predefined list of keywords; a publication's title is matched against these lists, and a field's score is its share of all keyword matches found in the title. Every field whose share exceeds 0.33 is included in the multi-label classification, and the highest-scoring of these fields is the single-label classification. The single label, the multi-label list, and the corresponding percentages are appended to the dataset as new columns, and the result is saved as 'ClassifiedPublications.csv'.


In [22]:
import pandas as pd
from tqdm import tqdm

import nltk
import string
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Shared Porter stemmer instance; the preprocessing functions in this notebook
# reference it through the global name `porter`.
porter = PorterStemmer()

# Fetch the NLTK resources used by this notebook.
nltk.download('punkt')  # tokenizer data; presumably what nltk.word_tokenize relies on (verify for the installed NLTK version)
nltk.download('wordnet')  # NOTE(review): no WordNet usage is visible in this section; confirm the download is still needed
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
# List of research fields in Tübingen (Source: CS Rankings)
research_fields = [
    'Computer Vision',
    'Robotics',
    'Bioinformatics and Computational Biology',
    'Human-Computer Interaction',
    'Design Automation of Microelectronic Systems (EDA)',
    'Visualization',
    'Web Information Retrieval',
    'Natural Language Processing',
    'Machine Learning',
]

# List of keywords per research field.
# Every entry of research_fields needs a key here, because the classification
# looks up keywords[research_field] for each field.
# Fixes relative to the earlier version of these lists:
#   * 'eye-based' and 'human-robot collaboration' were missing the separating
#     comma, so Python concatenated them into a single, never-matching keyword.
#   * Entries that were listed twice verbatim ('retrieval', 'data mining',
#     'crowdsourcing', 'web', 'web search') are listed once, so one match is no
#     longer counted twice.
#   * The misspelled 'energey-efficient' was dropped; 'energy-efficient' is
#     already in the list.
# NOTE: 'gene ' deliberately carries a trailing space; preprocess_keyword keeps
# such spaces for single-token keywords.
keywords = {
    'Machine Learning': ['data', 'neural', 'network', 'model', 'algorithm', 'convex', 'deep', 'learning', 'gradient', 'bias', 'regression', 'probabilistic', 'random', 'gaussian', 'probability', 'newton', 'bayes', 'bregman', 'optimization', 'clustering', 'supervised', 'unsupervised', 'zero-shot', 'SGD', 'linear', 'Kullback-Leibler', 'robust', 'transformer', 'parameter', 'causal', 'feature', 'selection', 'kernel', 'distribution', 'support vector machine', 'classif', 'support measure machine', 'decision tree', 'statistical', 'regularization', 'certainty', 'attention', 'encoder', 'decoder', 'graph', 'domain', 'line search', 'generalization', 'posterior', 'likelihood', 'inference', 'adversarial', 'invariance', 'risk', 'fair', 'optimal', 'function', 'dimensionality reduction'],
    'Computer Vision': ['vision', 'image', 'pixel', 'video', '2d', '3d', 'reconstruction', 'segmentation', 'scene', 'GAN', 'VR', 'object', 'detection', 'synthesis', 'optical flow', 'radiance', 'supervised', 'autonomous', 'driving', 'urban', 'occupancy', 'grid', 'stereo', 'camera', 'depth', 'zero-shot', 'pose estimation', 'shape estimation', 'geometric', 'curve', 'point', 'cloud', 'face', 'render', 'texture', 'neural fields', 'gaussian splatting', 'CNN', 'RGB', 'RGBD', 'clothing', 'pose', 'shape', 'people', 'registration', 'occlusion', 'geometry', 'convolution', 'photo', 'recognition', 'human shape', 'human pose', 'LiDAR'],
    'Bioinformatics and Computational Biology': ['bio', 'bacteria', 'gene ', 'protein', 'protein sequence', 'biomolecular', 'protein structure', 'SBML', 'biosynthesis', 'disease', 'diagnosis', 'medical', 'health', 'surgeon', 'surgery', 'medicine', 'biological', 'bacterium', 'glutamicum', 'Homology', 'molecular', 'genotype', 'genom', 'drug', 'neural population', 'biomedical', 'biomedicine', 'antibody', 'cancer'],
    'Human-Computer Interaction': ['eye', 'eye tracking', 'eye movement', 'gaze-based', 'eyelid', 'pupil', 'iris', 'gaze', 'head', 'touch', 'grip', 'haptic', 'tactile', 'contact', 'finger', 'limb', 'hug', 'engagement', 'facial videos', 'classroom', 'hand-raising', 'hand raising', 'classroom videos', 'writing', 'pupil diameter', 'digital classrooms', 'human-robot', 'human-robot-interaction', 'human gaze', 'corneal', 'eye features', 'retina', 'retinal projection', 'human activity', 'stress', 'compulsive', 'children', 'eye-based', 'human-robot collaboration', 'eye opening', 'iris gaze', 'raw eye', 'pupil segmentation', 'eye movement feature', 'eye and head tracking'],
    'Design Automation of Microelectronic Systems (EDA)': ['chip', 'HW', 'hardware', 'hardware accelerator', 'accelerator', 'low-power', 'embedded system', 'embedded software', 'firmware', 'timing simulation', 'scheduling', 'RISC', 'firmware synthesis', 'firmware design', 'time synchronization', 'delay estimation', 'timing estimation', 'semiconductor', 'RTL', 'RTL simulation', 'energy-efficient', 'energy efficiency', 'energy consumption', 'programmable', 'silicon', 'energy saving', 'power optimization', 'processor', 'GPU core', 'multi-processor', 'multi-core', 'multi-processor-SoC', 'multi-core SoC'],
    'Robotics': ['robot', 'SLAM', 'occupancy', 'grid', 'odometry', 'mobile', 'path', 'planning', 'occlusion', 'localization', 'track', 'RFID', 'navigation', 'laser', 'sensor', 'actuator'],
    'Visualization': ['visualization', 'visualisation', 'visual analysis', 'visual analytics', 'interactive'],
    'Web Information Retrieval': ['data mining', 'retrieval', 'web', 'crowdsourcing', 'document', 'query', 'relevance', 'document-level', 'mitigation in ranking', 'dense retrieval', 'retrieval method', 'reranking', 'document embedding', 'search result', 'document reranking', 'document retrieval', 'inconsistent ranking', 'query likelihood', 'retrieval model', 'web search', 'search engine', 'deep retrieval models', 'information retrieval', 'ranking loss', 'ranking balance', 'website', 'web site', 'search system', 'hyperlink', 'bag-of-hyperlinks', 'web retrieval', 'entity linking'],
    'Natural Language Processing': ['text-to', 'corpus', 'language', 'summarization', 'gpt', 'bert', 'entity', 'natural language', 'NLP', 'natural language processing']
}

# Preprocess keywords
# Built once instead of on every call.
_KEYWORD_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_KEYWORD_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_keyword(keyword):
    """Normalize a raw keyword so it can be substring-matched against preprocessed titles.

    The keyword is tokenized, stripped of punctuation, lower-cased, cleared of
    English stop words and stemmed with the shared Porter stemmer, i.e. it goes
    through the same steps as preprocess_title. Without the stop-word step a
    keyword such as 'mitigation in ranking' keeps its 'in' and can never occur
    in a title, because titles have their stop words removed.

    Args:
        keyword: Raw keyword string, possibly consisting of several words.

    Returns:
        The stemmed tokens joined by single spaces. A single-token keyword that
        was written with spaces (e.g. 'gene ') gets as many trailing spaces as
        the raw keyword contained.
    """
    tokens = nltk.word_tokenize(keyword)
    cleaned = [token.translate(_KEYWORD_PUNCTUATION_TABLE).lower() for token in tokens]

    # Drop empty tokens and stop words exactly like preprocess_title does.
    kept = [word for word in cleaned if word and word not in _KEYWORD_STOP_WORDS]
    if not kept:
        # Nothing but stop words/punctuation: keep the unfiltered tokens rather
        # than producing an empty keyword, which would match every title.
        kept = cleaned
    preprocessed_keyword = ' '.join(porter.stem(word) for word in kept)

    # Decided on the raw token count, so that removing stop words from a
    # multi-word keyword never triggers the padding.
    if len(tokens) == 1 and keyword.count(' ') > 0:
        preprocessed_keyword = preprocessed_keyword + ' ' * keyword.count(' ')
    return preprocessed_keyword

# Normalize every keyword once up front. Repeats are dropped (first occurrence
# wins, order preserved) because different raw keywords can normalize to the
# same string, and the classification adds one hit per list entry; a repeated
# entry would therefore count a single match twice.
keywords = {
    research_field: list(dict.fromkeys(preprocess_keyword(word) for word in words))
    for research_field, words in keywords.items()
}

In [42]:
# Preprocess titles
# Built once instead of once per title.
_TITLE_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_TITLE_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_title(title):
    """Normalize a publication title for keyword matching.

    The title is tokenized, every token is stripped of punctuation and
    lower-cased, empty tokens and English stop words are dropped, and the
    remaining tokens are stemmed with the shared Porter stemmer.

    Args:
        title: Raw title string.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    tokens = (
        token.translate(_TITLE_PUNCTUATION_TABLE).lower()
        for token in nltk.word_tokenize(title)
    )
    return ' '.join(
        porter.stem(token)
        for token in tokens
        if token and token not in _TITLE_STOP_WORDS
    )

# A research field is kept in the multi-label result when its share of all
# keyword hits is strictly above this value. Note that an exact three-way tie
# (1/3 each) still passes.
_SCORE_THRESHOLD = 0.33
# Field that loses a draw for the single label against a more specific field.
_GENERIC_FIELD = "Machine Learning"


def _classify_title(raw_title):
    """Score one title against every research field.

    Args:
        raw_title: Value of the "Title" column; it is converted with str() first.

    Returns:
        A tuple (single_label, multi_labels, multi_label_scores):
        single_label is the best research field or '' when no field qualifies,
        multi_labels lists all fields above the threshold (best first) and
        multi_label_scores holds their shares in the same order.
    """
    title = preprocess_title(str(raw_title))

    # Calculate the number of keywords found in the title for each research
    # field (plain substring test on the preprocessed title).
    hits = {
        research_field: sum(keyword in title for keyword in keywords[research_field])
        for research_field in research_fields
    }

    # Convert frequency of keywords to percentages
    total_hits = sum(hits.values())
    if total_hits == 0:
        return '', [], []
    scores = {research_field: count / total_hits for research_field, count in hits.items()}

    # Sort and select research fields with percentage > 0.33. sorted() is
    # stable, so fields with equal scores keep their research_fields order.
    selected = sorted(
        (research_field for research_field, score in scores.items() if score > _SCORE_THRESHOLD),
        key=scores.get,
        reverse=True,
    )
    percentages = [scores[research_field] for research_field in selected]

    # The hits may be spread so evenly that no field exceeds the threshold.
    if not selected:
        return '', selected, percentages

    # For single label classification, select research field with highest score
    # In case of draw, prefer more specific research fields than Machine Learning
    best = selected[0]
    if len(selected) > 1 and best == _GENERIC_FIELD and scores[best] == scores[selected[1]]:
        best = selected[1]
    return best, selected, percentages


# Create a new df (that will contain new columns from the classification)
classified_titles_df = df.copy()

# Classify every title; only the "Title" column is needed, so iterate it
# directly instead of materializing every row.
classifications = [
    _classify_title(title)
    for title in tqdm(classified_titles_df["Title"], total=len(classified_titles_df), desc='Classifying Publications')
]

# Create 3 new lists (columns)
research_field_column = [single for single, _, _ in classifications]  # Single label classification
research_fields_column = [multi for _, multi, _ in classifications]  # Multilabel classification
research_field_percentages_column = [shares for _, _, shares in classifications]  # Scores (percentages) of multilabel classification

# Append columns to df
classified_titles_df["Research Field"] = research_field_column
classified_titles_df["Research Fields"] = research_fields_column
classified_titles_df["Research Field Percentages"] = research_field_percentages_column

# Save new dataset
classified_titles_df.to_csv('./../dat/ClassifiedPublications.csv', index=False)

Classifying Publications: 100%|███████████████████████████████████████████████████| 4066/4066 [00:07<00:00, 539.89it/s]
