# Keyword Extraction from DHQ Articles
### Created by Hoyeol Kim

In [1]:
pip install transformers



In [2]:
from transformers import (TokenClassificationPipeline, AutoModelForTokenClassification, AutoTokenizer)
from transformers.pipelines import AggregationStrategy
import numpy as np
import csv

In [3]:
# Read the tab-separated article table: the first row holds the column
# headings (`fields`), every following row is one article (`data`).
fields = []
data = []
row_ct = 0

# newline="" hands line-ending handling to the csv module, as its
# documentation requires; without it, newlines embedded in quoted fields
# (e.g. multi-line abstracts) may be interpreted incorrectly.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm it matches the encoding of the TSV file.
with open("2022-dhq-articles-with-abstracts.tsv", newline="") as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    rows = list(rd)

# Total number of rows read, header included.
row_ct = len(rows)
if rows:
    # An empty file leaves `fields` and `data` as empty lists.
    fields = rows[0]
    data = rows[1:]

In [4]:
# Show the column headings, followed by the number of article rows loaded.
print(fields)
print(f"Number of papers: {len(data)}")

['Article ID', 'Pub. Year', 'Volume and Issue', 'Authors', 'Affiliations', 'Title', 'Abstract', '# of Cited Works']
Number of papers: 643


In [5]:
# Sort the rows by their numeric paper ID (first column).
#
# sorted() is stable, so rows that share an ID keep their original relative
# order. Unlike a scan over a fixed range of candidate IDs, this runs in
# O(n log n) and never drops a row because its ID falls outside that range.
# A non-numeric ID still raises ValueError.
sorted_data = sorted(data, key=lambda row: int(row[0]))

data = sorted_data

In [6]:
# Define keyphrase extraction pipeline
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that returns the unique keyphrases of a text.

    The constructor takes a model identifier, loads both the
    token-classification model and its tokenizer from it, and forwards any
    remaining positional/keyword arguments to the parent pipeline.
    """

    def __init__(self, model, *args, **kwargs):
        # Resolve the identifier into concrete model and tokenizer objects
        # before handing them to the parent constructor.
        token_classifier = AutoModelForTokenClassification.from_pretrained(model)
        tokenizer = AutoTokenizer.from_pretrained(model)
        super().__init__(*args, model=token_classifier, tokenizer=tokenizer, **kwargs)

    def postprocess(self, all_outputs):
        """Return the stripped, de-duplicated "word" values as a numpy array.

        The parent post-processing is run with the FIRST aggregation
        strategy; np.unique then removes duplicates and returns the
        remaining phrases in sorted order.
        """
        entities = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.FIRST,
        )
        phrases = [entity.get("word").strip() for entity in entities]
        return np.unique(phrases)


In [7]:
# Instantiate the keyphrase extractor from the named pretrained checkpoint.
# The pipeline class loads both the model and its tokenizer from this name.
model_name = "ml6team/keyphrase-extraction-distilbert-inspec"
extractor = KeyphraseExtractionPipeline(
    model=model_name,
)

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


In [8]:
# Predict keywords: run the extractor on each row's abstract (column 6),
# producing one result per paper in the same order as `data`.
keywords = [extractor(row[6]) for row in data]

In [9]:
# Build one dictionary per paper, pairing each row's columns with the
# keywords extracted for the row at the same position.
papers = []
for idx, row in enumerate(data):
    paper = {
        'ID': int(row[0]),
        'Pub. Year': int(row[1]),
        'Volume and Issue': float(row[2]),
        'Authors': row[3],
        'Affiliations': row[4],
        'Title': row[5],
        'Abstract': row[6],
        'Keywords': keywords[idx],
        # Kept as read from the file (a string); not converted to a number.
        '# of Cited Works': row[7],
    }
    papers.append(paper)

In [10]:
# Spot-check the first formatted record.
print(papers[0])

{'ID': 1, 'Pub. Year': 2007, 'Volume and Issue': 1.1, 'Authors': 'Johanna Drucker', 'Affiliations': 'University of Virginia', 'Title': 'Philosophy and Digital Humanities: A review of Willard McCarty, Humanities Computing (London and NY: Palgrave, 2005)', 'Abstract': "A review of Willard McCarty's Humanities Computing.", 'Keywords': array(['humanities computing'], dtype='<U20'), '# of Cited Works': '4'}


In [11]:
# Specify the CSV file path
csv_file_path = 'dhq_keywords.csv'

# Specify column names
columns = ['ID', 'Pub. Year', 'Volume and Issue', 'Authors', 'Affiliations', 'Title', 'Abstract', 'Keywords', '# of Cited Works']

# Open the CSV file in write mode (newline='' as the csv module requires)
with open(csv_file_path, mode='w', newline='') as csv_file:
    # Create a CSV writer object
    csv_writer = csv.DictWriter(csv_file, fieldnames=columns)

    # Write the header row
    csv_writer.writeheader()

    # Write the data rows
    for row in papers:
        # 'Keywords' holds a numpy array. The csv writer would store its
        # str() form (e.g. "['a' 'b']", line-wrapped for long arrays), which
        # is hard to parse back. Write a plain "; "-separated string instead,
        # on a copy so the in-memory `papers` records stay unchanged.
        keyword_text = '; '.join(str(keyword) for keyword in row['Keywords'])
        csv_writer.writerow({**row, 'Keywords': keyword_text})