In [38]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import os

## Load Files

In [39]:
# Read all columns and concatenate them into a single dataframe
JIF_df = pd.read_csv(f"IFs_df.csv", header = 0, sep="\t", index_col=0)
Abs_df = pd.read_csv(f"abstracts_df.csv", header = 0, sep="\t", index_col=0)
print(f'JIFs: {len(JIF_df)}; Abs: {len(Abs_df)}')

JIFs: 15232; Abs: 6335


In [40]:
# Join files on ISSN, drop duplicates
new_df = pd.merge(JIF_df, Abs_df[Abs_df['ISSN'].notna()], how = 'right', on=['ISSN', 'Year Published'])
new_df.drop_duplicates(subset=['Document Title'], keep='first', inplace=True, ignore_index=True)
# Join files on eISSN, drop duplicates
new_df_1 = pd.merge(JIF_df, Abs_df[Abs_df['ISSN'].isna()], how = 'right', on=['eISSN', 'Year Published'])
new_df_1.drop_duplicates(subset=['Document Title'], keep='first', inplace=True, ignore_index=True)

# Conctenate joins, drop extra columns
join_df = pd.concat([new_df, new_df_1], axis = 0)
join_df.drop(columns = ['eISSN_x', 'eISSN_y', 'ISSN_x', 'ISSN_y'], inplace=True)
join_df['Year Published'] = join_df['Year Published'].astype(int)
join_df['Publication Name'] = join_df['Publication Name'].apply(lambda x: x.lower())
len(join_df)

6302

In [41]:
# Drop columns irrelevant for the analysis
join_df.drop(['Keywords Plus®', 'E-mail Address', 'ResearcherID Number', 'ORCIDs', 'Publisher', 'Publisher City','ISO Source Abbreviation', 'Publication Date',
       'Volume', 'Issue', 'Beginning Page', 'Ending Page', 'Journal name',
       'Digital Object Identifier (DOI)', 'Author Full Name'], axis=1, inplace=True)

In [42]:
join_df.columns

Index(['ISSN', 'JIF', 'Eigenfactor', 'Year Published', 'Authors',
       'Document Title', 'Publication Name', 'Author Keywords', 'Abstract',
       'Author Address', 'Reprint Address', 'Funding Agency and Grant Number',
       'Cited Reference Count', 'Times cited', 'Usage Count (Last 180 Days)',
       'Usage Count (Since 2013)', 'Publisher Address', 'Page Count',
       'Web of Science Categories', 'eISSN'],
      dtype='object')

In [43]:
#Find journals with missing JIF
missing  = join_df[join_df['JIF'].isna()].groupby(['Publication Name'])['Publication Name'].count().sort_values(ascending=False)
missing.to_csv('missing.csv', '\t')

# Now go and manually download the data for missing journals

  missing.to_csv('missing.csv', '\t')


In [44]:
# Get a list of files in the missing folder
file_list=os.listdir('./raw_data/missing')

In [45]:
# Assemble a dataframe with the JIFs fo rthe missing publications
new_journals = []
for file_name in file_list:
    file_path= f"./raw_data/missing/{file_name}"
    with open(file_path) as f:
        j_name= f.readline().strip().lower()
    df = pd.read_csv(file_path, header = 4, sep=",", index_col=False)
    df['Publication Name'] = j_name
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
    df.drop(df[df['Journal impact factor'].isna()].index, inplace=True)
    df['Year'] = df['Year'].astype(int)
    df = df[['Year', 'Journal impact factor', 'Publication Name']]
    df.columns = ['Year Published', 'JIF_new', 'Publication Name']
    new_journals.append(df)

New_IFs_df = pd.concat(new_journals, axis = 0)

In [46]:
#Join the records with missing JIF with the dataframe of the new JIFS
new_join_df = pd.merge(New_IFs_df, join_df[join_df['JIF'].isna()], how = 'right', on=['Year Published', 'Publication Name'])
new_join_df.drop(['JIF'], axis=1, inplace=True)
new_join_df.rename(columns={'JIF_new':'JIF'}, inplace=True)

In [50]:
#Join the result of the first join and the second join
final_join = pd.concat([join_df, new_join_df], axis = 0)
final_join.dropna(subset = ['JIF'], inplace=True)
final_join.dropna(subset = ['Abstract'], inplace=True)
len(final_join)

4542

In [51]:
final_join.head()

Unnamed: 0,ISSN,JIF,Eigenfactor,Year Published,Authors,Document Title,Publication Name,Author Keywords,Abstract,Author Address,Reprint Address,Funding Agency and Grant Number,Cited Reference Count,Times cited,Usage Count (Last 180 Days),Usage Count (Since 2013),Publisher Address,Page Count,Web of Science Categories,eISSN
1,0960-894X,2.538,,2006,"Raab, CE; Butcher, JW; Connolly, TM; Karczewsk...",Synthesis of the first sulfur-35-labeled hERG ...,bioorganic & medicinal chemistry letters,S-35; hERG; radioligand; synthesis,The synthesis of the first high specific activ...,"Merck Res Labs, Dept Drug Metab, Rahway, NJ 07...","Raab, CE (corresponding author), Merck Res Lab...",,18,24,0,0,"THE BOULEVARD, LANGFORD LANE, KIDLINGTON, OXFO...",4,"Chemistry, Medicinal; Chemistry, Organic",
3,0960-894X,2.333,,2004,"Su, DS; Markowitz, MK; Murphy, KL; Wan, BL; Zr...",Development of an efficient and selective radi...,bioorganic & medicinal chemistry letters,Bradykinin B-1; receptor; antagonist; radiolig...,We have developed an efficient and selective r...,"Merck Res Labs, Dept Med Chem, W Point, PA 194...","Su, DS (corresponding author), Merck Res Labs,...",,17,21,0,3,"THE BOULEVARD, LANGFORD LANE, KIDLINGTON, OXFO...",4,"Chemistry, Medicinal; Chemistry, Organic",
4,0960-894X,2.333,,2004,"Matulenko, MA; Surber, B; Fan, LM; Kolasa, T; ...",Synthesis and activity of 2-[4-(4-[<SUP>3</SUP...,bioorganic & medicinal chemistry letters,dopamine; agonist; radioligand,The first selective dopamine D-4 agonist radio...,"Abbott Labs, Global Pharmaceut Res & Dev, Neur...","Matulenko, MA (corresponding author), Abbott L...",,15,12,0,2,"THE BOULEVARD, LANGFORD LANE, KIDLINGTON, OXFO...",4,"Chemistry, Medicinal; Chemistry, Organic",
5,0161-5505,11.082,0.029,2021,"Treiber, H; König, A; Neesse, A; Richter, A; S...",Liver Enzyme Elevation After <SUP>177</SUP>Lu-...,journal of nuclear medicine,genitourinary oncology; radionuclide therapy; ...,Lu-177-PSMA radioligand therapy is a promising...,"[Treiber, Hannes] Univ Med Ctr Gottingen, Dept...","Treiber, H (corresponding author), Univ Med Ct...",,7,3,1,5,"1850 SAMUEL MORSE DR, RESTON, VA 20190-5316 USA",4,"Radiology, Nuclear Medicine & Medical Imaging",
6,1536-1632,2.774,0.006,2014,"Schou, M; Varnäs, K; Sandell, J; Johnström, P;...","Synthesis, Radiolabeling, and <i>In Vivo</i> P...",molecular imaging and biology,PET; Radioligand; Amyloid; AZD4694; Carbon-11,[F-18]AZD4694 (2-(2-F-18-fluoro-6-(methylamino...,"[Schou, Magnus; Johnstrom, Peter; Cselenyi, Zs...","Schou, M (corresponding author), Karolinska Un...",,25,9,0,14,"233 SPRING ST, NEW YORK, NY 10013 USA",7,"Radiology, Nuclear Medicine & Medical Imaging",


In [52]:
final_join.to_csv('join_df.csv', '\t')

  final_join.to_csv('join_df.csv', '\t')
