In [1]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import os

## Load Files

In [2]:
# Load the journal impact factor table and the abstracts table.
# Both files are tab-separated, carry a header row, and use their first column as the index.
tsv_options = dict(header=0, sep="\t", index_col=0)
JIF_df = pd.read_csv("IFs_df.csv", **tsv_options)
Abs_df = pd.read_csv("abstracts_df.csv", **tsv_options)
# Report the number of rows in each table
print(f'JIFs: {len(JIF_df)}; Abs: {len(Abs_df)}')

JIFs: 15232; Abs: 6335


In [3]:
# Attach journal impact factors (JIF) to the abstract records.
# Records that have an ISSN are matched on (ISSN, year); all other records are
# matched on (eISSN, year). Right joins keep every abstract, matched or not.
has_issn = Abs_df['ISSN'].notna()

# Join files on ISSN, drop duplicates
new_df = pd.merge(JIF_df, Abs_df[has_issn], how='right', on=['ISSN', 'Year Published'])
new_df = new_df.drop_duplicates(subset=['Document Title'], keep='first', ignore_index=True)

# Join files on eISSN, drop duplicates.
# pandas matches NaN merge keys against each other, so journals without an eISSN
# are excluded from the JIF side; otherwise an abstract lacking both ISSN and
# eISSN would receive the JIF of an unrelated journal that also has no eISSN.
jif_with_eissn = JIF_df[JIF_df['eISSN'].notna()]
new_df_1 = pd.merge(jif_with_eissn, Abs_df[~has_issn], how='right', on=['eISSN', 'Year Published'])
new_df_1 = new_df_1.drop_duplicates(subset=['Document Title'], keep='first', ignore_index=True)

# Concatenate joins, drop the suffixed identifier columns left over from each merge
join_df = pd.concat([new_df, new_df_1], axis=0)
join_df = join_df.drop(columns=['eISSN_x', 'eISSN_y', 'ISSN_x', 'ISSN_y'])
join_df['Year Published'] = join_df['Year Published'].astype(int)
# .str.lower() leaves missing names as NaN instead of raising on a non-string value
join_df['Publication Name'] = join_df['Publication Name'].str.lower()
len(join_df)

6302

In [4]:
# Remove the columns that play no part in the analysis
unused_columns = [
    'Keywords Plus®',
    'E-mail Address',
    'ResearcherID Number',
    'ORCIDs',
    'Publisher',
    'Publisher City',
    'ISO Source Abbreviation',
    'Publication Date',
    'Volume',
    'Issue',
    'Beginning Page',
    'Ending Page',
    'Journal name',
    'Digital Object Identifier (DOI)',
    'Author Full Name',
]
join_df = join_df.drop(columns=unused_columns)

In [5]:
# Display the column labels currently present in join_df
join_df.columns

Index(['ISSN', 'JIF', 'Eigenfactor', 'Year Published', 'Authors',
       'Document Title', 'Publication Name', 'Author Keywords', 'Abstract',
       'Author Address', 'Reprint Address', 'Funding Agency and Grant Number',
       'Cited Reference Count', 'Times cited', 'Usage Count (Last 180 Days)',
       'Usage Count (Since 2013)', 'Publisher Address', 'Page Count',
       'Web of Science Categories', 'eISSN'],
      dtype='object')

In [6]:
# Find journals with missing JIF: count the records without a JIF per journal,
# most affected journals first
missing = join_df[join_df['JIF'].isna()].groupby(['Publication Name'])['Publication Name'].count().sort_values(ascending=False)
# Pass the separator by keyword: giving it positionally is deprecated in pandas
# and rejected once to_csv arguments after the path become keyword-only
missing.to_csv('missing.csv', sep='\t')

# Now go and manually download the data for missing journals

  missing.to_csv('missing.csv', '\t')


In [7]:
# Get a list of files in the missing folder.
# os.listdir returns entries in arbitrary order and also lists sub-directories
# (for example Jupyter's .ipynb_checkpoints), so keep regular files only and
# sort them to make every run process the same files in the same order.
missing_dir = './raw_data/missing'
file_list = sorted(
    entry for entry in os.listdir(missing_dir)
    if os.path.isfile(os.path.join(missing_dir, entry))
)

In [8]:
# Assemble a dataframe with the JIFs for the missing publications.
# Every file is read twice: its first line supplies the journal name, and the
# CSV table whose header is on row 4 (0-based) supplies the per-year values.
new_journals = []
for file_name in file_list:
    file_path = f"./raw_data/missing/{file_name}"
    with open(file_path) as f:
        j_name = f.readline().strip().lower()
    df = pd.read_csv(file_path, header=4, sep=",", index_col=False)
    # Coerce both columns so that non-numeric cells become NaN, then drop every
    # row lacking either value; astype(int) below would raise on a NaN year
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
    df['Journal impact factor'] = pd.to_numeric(df['Journal impact factor'], errors='coerce')
    df = df.dropna(subset=['Year', 'Journal impact factor']).copy()
    df['Year'] = df['Year'].astype(int)
    df['Publication Name'] = j_name
    df = df[['Year', 'Journal impact factor', 'Publication Name']]
    df.columns = ['Year Published', 'JIF_new', 'Publication Name']
    new_journals.append(df)

# Keep one JIF per (year, journal): repeated keys would multiply the matching
# records when this table is used as the left side of a right join
New_IFs_df = (
    pd.concat(new_journals, axis=0, ignore_index=True)
    .drop_duplicates(subset=['Year Published', 'Publication Name'], keep='first')
)

In [9]:
# Match the records that still lack a JIF against the newly assembled JIF table
# on (year, journal name); the right join keeps every such record.
records_without_jif = join_df[join_df['JIF'].isna()]
new_join_df = (
    pd.merge(New_IFs_df, records_without_jif, how='right', on=['Year Published', 'Publication Name'])
    .drop(columns=['JIF'])                 # discard the old JIF column
    .rename(columns={'JIF_new': 'JIF'})    # the looked-up value takes its place
)

In [10]:
# Stack the result of the first join on top of the second join, then keep only
# the records that have both a JIF and an abstract
final_join = pd.concat([join_df, new_join_df], axis=0).dropna(subset=['JIF', 'Abstract'])
len(final_join)

4542

In [12]:
# Save the joined records as a tab-separated file. The separator is passed by
# keyword: giving it positionally is deprecated in pandas and rejected once
# to_csv arguments after the path become keyword-only.
final_join.to_csv('join_df.csv', sep='\t')

  final_join.to_csv('join_df.csv', '\t')


In [13]:
import nltk
# sent_tokenize needs the Punkt sentence models: older NLTK releases load the
# 'punkt' resource while newer ones look for 'punkt_tab', so fetch both
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

# Split every abstract into sentences and keep only those that do not contain
# the phrase 'All rights'; each abstract becomes a list of sentences
final_join['Abstract'] = final_join['Abstract'].apply(
    lambda abstract: [sentence for sentence in nltk.sent_tokenize(abstract) if 'All rights' not in sentence]
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Artem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# Collapse each abstract's list of sentences back into one space-separated string
final_join['Abstract'] = final_join['Abstract'].map(' '.join)
final_join['Abstract']

1       The synthesis of the first high specific activ...
3       We have developed an efficient and selective r...
4       The first selective dopamine D-4 agonist radio...
5       Lu-177-PSMA radioligand therapy is a promising...
6       [F-18]AZD4694 (2-(2-F-18-fluoro-6-(methylamino...
                              ...                        
3774    Purpose: The C-X-C chemokine receptor 4 (CXCR4...
3776    Recently, promising results of the antitumor e...
3793    Introduction Serotonin is involved in a variet...
3797    Background Major depressive disorder (MDD) is ...
3803    Substance P (SP) is a small peptide commonly k...
Name: Abstract, Length: 4542, dtype: object

In [21]:
# Keep only the records whose JIF lies in the half-open range [1, 20)
jif_values = final_join['JIF']
jif_in_range = (jif_values < 20) & (jif_values >= 1)
final_join = final_join[jif_in_range]
len(final_join)

4435

In [22]:
# Save the filtered records as a tab-separated file. The separator is passed by
# keyword: giving it positionally is deprecated in pandas and rejected once
# to_csv arguments after the path become keyword-only.
final_join.to_csv('new_raw_data.csv', sep='\t')

  final_join.to_csv('new_raw_data.csv', '\t')
