# Papers
Find papers given a gene set


In [1]:
!pip install --upgrade pub_worm

Collecting pub_worm
  Downloading pub_worm-0.2.5.tar.gz (387 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m387.5/387.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pub_worm
  Building wheel for pub_worm (setup.py) ... [?25ldone
[?25h  Created wheel for pub_worm: filename=pub_worm-0.2.5-py3-none-any.whl size=391877 sha256=42085d8802a815e7193f9fcdae5049d44b10186c8c001b90edc8c4bd950ae2be
  Stored in directory: /home/dan/.cache/pip/wheels/5d/18/28/4b7727b0595cb9f34d17c73d29f49483553aea878a1f8c040a
Successfully built pub_worm
Installing collected packages: pub_worm
  Attempting uninstall: pub_worm
    Found existing installation: pub-worm 0.2.4
    Uninstalling pub-worm-0.2.4:
      Successfully uninstalled pub-worm-0.2.4
Successfully installed pub_worm-0.2.5


In [2]:
import os
import numpy as np
import pandas as pd
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import seaborn as sns
import umap
import matplotlib.pyplot as plt

# Get the API
from pub_worm.wormbase.wormbase_api import WormbaseAPI
from pub_worm.impact_factor.impact_factor_lookup import get_impact_factor
from pub_worm.ncbi.entreze_api import EntrezAPI

Full path to the file: /home/dan/miniconda3/envs/dan-dev-sc/lib/python3.9/site-packages/pub_worm/impact_factor/data/2022_JCR_IF.csv


In [3]:
# Categories for well-annotated ("core") genes.
category_names_core = [
    'Cell cycle',
    'Chaperone',
    'Cilia',
    'Cytoskeleton',
    'Development',
    'DNA',
    'Extracellular material',
    'Globin',
    'Lysosome',
    'Major sperm protein',
    'Metabolism',
    'mRNA functions',
    'Muscle function',
    'Neuronal function',
    'Nuclear pore',
    'Nucleic acid',
    'Peroxisome',
    'Protein modification',
    'Proteolysis general',
    'Proteolysis proteasome',
    'Ribosome',
    'Signaling',
    'Stress response',
    'Trafficking',
    'Transcription factor',
    'Transcription: chromatin',
    'Transcription: dosage compensation',
    'Transcription: general machinery',
]

# Categories for poorly annotated genes.
category_names_pag = [
    'Transcription: unassigned',
    'Transmembrane protein',
    'Transmembrane transport',
    'Unassigned',
]

# Remaining annotated categories.
category_names_other = [
    'Non-coding RNA',
    'Pseudogene',
]

# Every category, in the order core -> poorly annotated -> other.
category_names_lst = [*category_names_core, *category_names_pag, *category_names_other]


In [4]:
# Load the primary data for analysis: one "wc_<category>.<ext>" file per gene category.

# Map each file-name root (category name lower-cased, spaces -> underscores)
# back to the category's display name.
category_names_dict = {
    category_name.lower().replace(' ', '_'): category_name
    for category_name in category_names_lst
}

directory = './input_data/references'
category_dfs = {}

# os.listdir returns entries in arbitrary order; sorting keeps the load reproducible.
for filename in sorted(os.listdir(directory)):
    if not filename.startswith("wc_"):
        continue
    # Strip the "wc_" prefix and the file extension to recover the category root.
    file_name_root = os.path.splitext(filename[len("wc_"):])[0]
    wc_category = category_names_dict.get(file_name_root)
    if wc_category is None:
        # Still a KeyError, but one that names the offending file.
        raise KeyError(f"{filename}: '{file_name_root}' does not match any known category name")
    category_dfs[wc_category] = pd.read_csv(os.path.join(directory, filename))

# Per-category row counts, listed alphabetically.
total = 0
count = 0
for wc_category in sorted(category_dfs):
    length = len(category_dfs[wc_category])
    total += length
    count += 1
    print(f"{wc_category:<35} {length:>8,}")

# Concatenate once (in the same alphabetical order) instead of growing a frame inside
# the loop, which re-copies every previously appended row on each iteration.
# pd.concat raises on an empty list, so fall back to an empty frame when nothing was loaded.
if category_dfs:
    reference_data_full_df = pd.concat(
        [category_dfs[wc_category] for wc_category in sorted(category_dfs)],
        ignore_index=True,
    )
else:
    reference_data_full_df = pd.DataFrame()

print("="*45)
print(f"Count:{count:>4} {total:>33,}")


Cell cycle                             4,576
Chaperone                                820
Cilia                                  1,851
Cytoskeleton                           7,176
DNA                                    3,248
Development                           11,360
Extracellular material                 6,582
Globin                                   125
Lysosome                               1,527
Major sperm protein                      147
Metabolism                            12,635
Muscle function                        2,438
Neuronal function                     11,956
Nuclear pore                             528
Nucleic acid                           1,735
Peroxisome                               115
Protein modification                   1,967
Proteolysis general                    2,390
Proteolysis proteasome                 3,931
Ribosome                               1,646
Signaling                             26,543
Stress response                       14,596
Traffickin

In [5]:
# Keep only the rows whose journal name does not contain "meeting" (case-insensitive).
papers_only_df = reference_data_full_df.loc[
    lambda refs: ~refs['journal'].astype(str).str.lower().str.contains("meeting")
]
print(f"papers_only_df {len(papers_only_df):,}")

papers_only_df 121,922


In [6]:
# Read the up/down-regulated gene table and keep its 'ID' column as a one-column frame.
up_down_genes = "./input_data/acb16_abc291_up_down.csv"
up_down_genes_df = pd.read_csv(up_down_genes)

up_down_gene_ids_df = up_down_genes_df['ID'].to_frame()
print(f"Unique Genes {up_down_gene_ids_df['ID'].nunique(dropna=False):,}")

Unique Genes 2,052


In [7]:
# Left-join so every gene ID is kept, including those with no matching paper rows.
merged_df = up_down_gene_ids_df.merge(
    papers_only_df,
    how='left',
    left_on='ID',
    right_on='wormbase_id',
)
merged_df.shape[0]

15643

In [8]:
# Number of distinct wormbase_id values; a missing value counts as one distinct entry.
merged_df['wormbase_id'].nunique(dropna=False)

1264

In [9]:
# Preview the first five merged rows.
merged_df.iloc[:5]


Unnamed: 0,ID,id,title,journal,year,author,wormbase_id,category
0,WBGene00000216,WBPaper00030985,Genome-wide investigation reveals pathogen-spe...,Genome Biol,2007.0,Wong D|Bazopoulou D|Pujol N|Tavernarakis N|Ewb...,WBGene00000216,Proteolysis general
1,WBGene00000216,WBPaper00004299,Aspartic proteases from the nematode Caenorhab...,J Biol Chem,2000.0,Tcherepanova IY|Bhattacharyya L|Rubin CS|Freed...,WBGene00000216,Proteolysis general
2,WBGene00000216,WBPaper00062473,Non-canonical necrosis in two different cell t...,G3 (Bethesda),2022.0,Reza RN|Serra ND|Detwiler AC|Hanna-Rose W|Crook M,WBGene00000216,Proteolysis general
3,WBGene00000216,WBPaper00025088,Analysis of long-lived C. elegans daf-2 mutant...,Genome Res,2005.0,Halaschek-Wiener J|Khattra JS|McKay S|Pouzyrev...,WBGene00000216,Proteolysis general
4,WBGene00000216,WBPaper00005753,Dying for a cause: invertebrate genetics takes...,Nat Rev Genet,2003.0,Driscoll M|Gerstbrein B,WBGene00000216,Proteolysis general


In [10]:
# One row per distinct paper ID ('id'), considering only rows that matched a gene.
merged_not_null_df = merged_df[merged_df['wormbase_id'].notnull()]
merged_unique_df = merged_not_null_df.drop_duplicates(subset='id', keep='first')
merged_unique_df.shape[0]


6446

In [11]:
def get_pmid(wormbase_id):
    """Look up the 'pmid' field of a WormBase paper.

    Args:
        wormbase_id: Identifier passed to WormbaseAPI.get_wormbase_data
            (used in this notebook with WBPaper IDs).

    Returns:
        The value stored under 'pmid' in the API response, or None when the
        response has no 'pmid' entry.
    """
    paper_fields = WormbaseAPI("field", "paper", "pmid").get_wormbase_data(wormbase_id)
    return paper_fields['pmid'] if 'pmid' in paper_fields else None

In [50]:
def check_issn_essn(json_data):
    """Pick the journal identifier to use from a paper-summary mapping.

    Args:
        json_data: Mapping that may contain 'issn' and/or 'essn' entries.
            May also be None or empty (e.g. when the summary lookup returned nothing).

    Returns:
        The non-empty 'issn' value if there is one, otherwise the non-empty
        'essn' value, otherwise None.
    """
    # A missing/empty summary has no identifier; avoids `"issn" in None` raising TypeError.
    if not json_data:
        return None
    # `or` skips an absent or empty 'issn' so that a usable 'essn' is not masked by it.
    return json_data.get("issn") or json_data.get("essn") or None

def get_issn_essn(pmid, wb_paper_id):
    """Fetch the NCBI paper summary for a PMID and annotate it with IDs and impact factor.

    Args:
        pmid: PubMed ID, interpolated into the search term as "<pmid>[UID]".
        wb_paper_id: WormBase paper ID recorded on the result.

    Returns:
        The object returned by EntrezAPI.get_ncbi_data. When it is truthy it is
        updated in place with 'pmid' and 'wb_paper_id', and, if it carries a
        non-empty 'issn' or 'essn', with 'impact_factor' (None when no
        identifier produced a match). A falsy result is returned unchanged.
    """
    search_params = {'term': f"{pmid}[UID]"}
    summary_ret_data = EntrezAPI.get_ncbi_data(search_params, "paper_summary")
    if not summary_ret_data:
        # Nothing to annotate; returning here also avoids probing a None summary for keys.
        return summary_ret_data

    summary_ret_data['pmid'] = pmid
    summary_ret_data['wb_paper_id'] = wb_paper_id

    # Try the print ISSN first, then the electronic ISSN: a lookup by one can
    # return None even though the record carries both identifiers.
    journal_ids = [
        summary_ret_data[key] for key in ("issn", "essn") if summary_ret_data.get(key)
    ]
    if journal_ids:
        impact_factor = None
        for journal_id in journal_ids:
            impact_factor = get_impact_factor(journal_id)
            if impact_factor is not None:
                break
        summary_ret_data['impact_factor'] = impact_factor
    return summary_ret_data
   

    
def collect_ncbi_data(pmid_df):
    """Run get_issn_essn for every row of a frame and gather the results.

    Args:
        pmid_df: DataFrame with a numeric 'pmid' column (formatted without
            decimals before the lookup) and an 'id' column.

    Returns:
        List with one get_issn_essn result per row, in row order.
    """
    total_rows = len(pmid_df)
    collected = []
    processed = 0  # stays 0 when the frame has no rows

    for processed, (_, paper) in enumerate(pmid_df.iterrows(), start=1):
        print(".", end='')
        pmid_text = f"{paper['pmid']:.0f}"
        print(f"{pmid_text}, {paper['id']}")
        summary = get_issn_essn(pmid_text, paper['id'])
        print(f"issn_essn={summary!r}")
        collected.append(summary)

        # Progress line after every 100 rows
        if processed % 100 == 0:
            print(f"{processed:>4} of {total_rows:>6,}")

    # Final progress line
    print(f"{processed:>4} of {total_rows:>6,}")
    return collected


In [51]:
# Spot-check get_issn_essn on a single PMID / WormBase paper ID pair.
# NOTE(review): a later cell's output pairs WBPaper00030985 with PMID 17875205, not
# 16291722, so these two arguments look mismatched. Confirm which pair was intended.
print(get_issn_essn('16291722','WBPaper00030985'))

{'source': 'J Cell Sci', 'lastauthor': 'Yoder BK', 'issn': '0021-9533', 'pmid': '16291722', 'wb_paper_id': 'WBPaper00030985', 'impact_factor': 5.235}


In [52]:

# The PMID lookup calls get_pmid once per paper, so its result is cached on disk:
# reuse the cache when it exists, otherwise build it so a fresh run still works.
PMID_CACHE_CSV = 'merged_unique_df.csv'
if os.path.exists(PMID_CACHE_CSV):
    merged_unique_df = pd.read_csv(PMID_CACHE_CSV)
else:
    print(f"{PMID_CACHE_CSV} not found; looking up PMIDs for {len(merged_unique_df):,} papers")
    # .assign returns a new frame instead of writing a column into a filtered slice.
    merged_unique_df = merged_unique_df.assign(pmid=merged_unique_df['id'].apply(get_pmid))
    # index=False keeps a spurious unnamed index column out of the cache file.
    merged_unique_df.to_csv(PMID_CACHE_CSV, index=False)

# SECURITY: never commit API keys. An NCBI API key used to be hardcoded here; treat it
# as leaked and revoke it. Export NCBI_API_KEY in the environment before starting Jupyter.
# NOTE(review): presumably pub_worm reads NCBI_API_KEY from the environment. Confirm.
if not os.environ.get('NCBI_API_KEY'):
    print("NCBI_API_KEY is not set; export it before running the NCBI lookups below.")

In [53]:
#merged_unique_df.to_csv('merged_unique_df.csv')

In [56]:
# Keep only the papers that have a PMID, then peek at the first ten PMIDs.
pmid_df = merged_unique_df.dropna(subset=['pmid'])
pmid_df['pmid'].iloc[:10]

0    17875205.0
1    10854422.0
2    35143646.0
3    15837805.0
4    12610523.0
5    17889653.0
6    15943973.0
7    21647448.0
8    18028414.0
9    23091037.0
Name: pmid, dtype: float64

In [57]:
# Trial run of the NCBI collection on the first ten papers only.
top_10 = pmid_df.iloc[:10]
ncbi_data_lst = collect_ncbi_data(pmid_df=top_10)


.17875205, WBPaper00030985
issn_essn={'source': 'Genome Biol', 'lastauthor': 'Ewbank JJ', 'issn': '1465-6906', 'essn': '1474-760X', 'pmid': '17875205', 'wb_paper_id': 'WBPaper00030985', 'impact_factor': None}
.10854422, WBPaper00004299
issn_essn={'source': 'J Biol Chem', 'lastauthor': 'Freedman JH', 'issn': '0021-9258', 'pmid': '10854422', 'wb_paper_id': 'WBPaper00004299', 'impact_factor': None}
.35143646, WBPaper00062473
issn_essn={'source': 'G3 (Bethesda)', 'lastauthor': 'Crook M', 'essn': '2160-1836', 'pmid': '35143646', 'wb_paper_id': 'WBPaper00062473', 'impact_factor': 3.542}
.15837805, WBPaper00025088
issn_essn={'source': 'Genome Res', 'lastauthor': 'Riddle DL', 'issn': '1088-9051', 'pmid': '15837805', 'wb_paper_id': 'WBPaper00025088', 'impact_factor': 9.438}
.12610523, WBPaper00005753
issn_essn={'source': 'Nat Rev Genet', 'lastauthor': 'Gerstbrein B', 'issn': '1471-0056', 'pmid': '12610523', 'wb_paper_id': 'WBPaper00005753', 'impact_factor': 59.581}
.17889653, WBPaper00031025
is

In [None]:

# Build one row per collected summary, skipping papers for which nothing was returned
# (get_issn_essn can hand back a falsy summary).
ncbi_data_df = pd.DataFrame([summary for summary in ncbi_data_lst if summary])
# Display the frame just built rather than dumping the raw list of dicts.
ncbi_data_df
