In [10]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt

## Load abstracts

In [11]:
# Read all columns and concatenate them into a single dataframe
abs_file_names = ['0-1000', '1001-2000', '2001-3000', '3001-4000', '4001-5000', '5001-6000', '6001-7000', '7001-8000', '8001-9000', '9001-9137']

dfs = []
for file_name in abs_file_names:
    dfs.append(pd.read_csv(f"./raw_data/{file_name}.txt", header = 0, sep="\t"))

abstracts_df = pd.concat(dfs, axis = 0)

In [12]:
# Read the field names stored in the file
all_field_names = pd.read_csv("./raw_data/column_names.csv", header = None, sep=",")
column_names = []
for i in range(len(abstracts_df.columns)):
    column_name = all_field_names[all_field_names[0] == abstracts_df.columns[i]][1].to_string().split('  ')[-1]
    column_names.append(column_name)

#assign the proper column names
abstracts_df.columns = column_names


In [13]:
abstracts_df.drop(abstracts_df[abstracts_df['Language'] !='English'].index, inplace=True)
abstracts_df.drop(abstracts_df[abstracts_df['Document Type' ] != 'Article' ].index, inplace=True)

In [14]:
# Find columns that are mostly empty, drop them
nan_count = abstracts_df.isna().sum()/len(abstracts_df)
# Drop those and the columns with repeptitve information
abstracts_df.drop(columns = list(nan_count[nan_count>0.5].index) + ['Publication Type (J=Journal; B=Book; S=Series;...',
                                                                    'Language','Document Type',
                                                                     'Date this report was generated.',
                                                                      'PubMed ID',
                                                                      '29-Character Source Abbreviation',
                                                                      'Document Delivery Number',
                                                                      'Accession Number'
                                                                      ,'Total Times Cited Count (Web of Science Core C...'], inplace=True)

In [15]:
# Drop more columns with repeptitve information
index_of_extraDOI = list(abstracts_df.columns).index('Digital Object Identifier (DOI)') + 1
abstracts_df.drop(columns = abstracts_df.columns[index_of_extraDOI], inplace=True)
index_of_extraCats = list(abstracts_df.columns).index('Web of Science Categories') + 1
abstracts_df.drop(columns = abstracts_df.columns[index_of_extraCats], inplace=True)

In [16]:
abstracts_df.columns

Index(['Authors', 'Author Full Name', 'Document Title', 'Publication Name',
       'Author Keywords', 'Keywords Plus®', 'Abstract', 'Author Address',
       'Reprint Address', 'E-mail Address', 'ResearcherID Number',
       'ORCID Identifier (Open Researcher and Contribu...',
       'Funding Agency and Grant Number', 'Cited Reference Count',
       'Web of Science Core Collection Times Cited Count',
       'Usage Count (Last 180 Days)', 'Usage Count (Since 2013)', 'Publisher',
       'Publisher City', 'Publisher Address',
       'International Standard Serial Number (ISSN)',
       'Electronic International Standard Serial Numbe...',
       'ISO Source Abbreviation', 'Publication Date', 'Year Published',
       'Volume', 'Issue', 'Beginning Page', 'Ending Page',
       'Digital Object Identifier (DOI)', 'Page Count',
       'Web of Science Categories'],
      dtype='object')

In [17]:
abstracts_df = abstracts_df.rename(columns={'International Standard Serial Number (ISSN)':'ISSN',
                                            'Electronic International Standard Serial Numbe...':'eISSN',
                                             'ORCID Identifier (Open Researcher and Contribu...':'ORCIDs'})

In [18]:
len(abstracts_df)

6335

In [19]:
abstracts_df.to_csv('abstracts_df.csv', '\t')