# Rewrite PhenoDigm parser

In [1]:
import pandas as pd
import requests
from retry import retry
import logging
import sys
from datetime import datetime


# class IPMC_solr_parser(object):
#     '''
#     This class retrieves data from the IPMC solr API.
    
#     * Returns array of documents.
#     * If target folder is specified, json files are saved into that folder.
#     '''
    
#     ipmc_solr_host = 'http://www.ebi.ac.uk/mi/impc/solr/phenodigm/select'

    
#     def __init__(self, rows=20000, limit=None, target_folder=None):

#         start = 0
#         rows = 20000
#         total = 0
#         numFound = 1
#         chunk = 0
        
#         timing = []

#         logging.info(f'Retrieving data from IPMC solr: {self.ipmc_solr_host}')
#         logging.info(f'Retrieving {rows} documents at a time.')
#         self.data = []
            
#         # Retrieving all data from IPMC:
#         while True:
#             start_time = datetime.now()
            
#             # Retrieve data:
#             rsp = self.query_solr(start, rows, mode)
            
#             # Increment chunk:
       
#             # We can manually limit the number of documents:
#             if not limit:
#                 limit = rsp['response']['numFound']
            
#             # If we don't find any items, we break:
#             if rsp['response']['numFound'] == 0:
#                 break
                
#             # Store data of write to file:
#             if target_folder:
#                 with open(f'{target_folder}/IMPC_solr_dump.{chunk:03}.json', 'w') as f:
#                     for doc in rsp['response']['docs']:
#                         json.dump(doc, f)
#                         f.write('\n')
#             else:
#                 self.data += rsp['response']['docs']

#             # Incrementing starting position
#             start += rows
#             chunk += 1
#             total += len(rsp['response']['docs'])
            
#             # If the length of the dataframe reaches the limit, we exit:
#             if limit <= total:
#                 break
            
#             # Log progress
#             second_last = (datetime.now() - start_time).total_seconds()
#             logging.debug(f'Chunk {chunk} done. Number of retrieved documents: {total}, last step took: {second_last} seconds.')
            
#             timing.append({'docs': total, 'time': second_last})
         
#         self.timing = pd.DataFrame(timing)
#         logging.info(f'Retrieval finished. Number of documents: {len(self.data)}')
            

#     # Use @retry decorator to ensure that errors like the query failing because server was overloaded, are handled correctly and the request is retried
#     @retry(tries=3, delay=5, backoff=1.2, jitter=(1, 3))
#     def query_solr(self, start, rows, mode):

#         # Building request:
#         params = {'q': '*:*','start': start,'rows': rows}
  
#         params = dict(q="*", start=start, rows=rows)
#         if mode == 'update_cache':
#             params['fq'] = 'type:gene'
            
#         # Query
#         r = requests.get(self.ipmc_solr_host, params=params, timeout=30)

#         # Check for erroneous HTTP response statuses
#         r.raise_for_status()
#         rsp = r.json()
#         return rsp

#     def get_data(self):
#         return self.data.copy()
    
#     def __len__(self):
#         return len(self.data)
    



In [2]:
from importlib import reload
# Shut down and reload the logging module before configuring it
# (presumably so that basicConfig below takes effect even if logging was already configured in this session).
logging.shutdown()
reload(logging)

# Initialize logger:
# Only messages of level ERROR and above pass this configuration, so the info/debug
# messages emitted through `logging.info` / `logging.debug` are not shown.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# NOTE(review): this handler is created but never attached to any logger; as the last
# expression of the cell only its repr is displayed. If logging to stdout is intended,
# it has to be registered (e.g. via basicConfig or addHandler) - confirm.
logging.StreamHandler(sys.stdout)




<StreamHandler stdout (NOTSET)>

### Saving the data into a gzip-compressed TSV file

In [127]:
# Write `data` (defined outside this view, presumably a pandas DataFrame) as a tab-separated file without the index.
# NOTE(review): the output is gzip-compressed although the file name has no '.gz' suffix -
# confirm that readers of this file pass the compression explicitly.
data.to_csv('impc_full_dataset.tsv', sep='\t', index=False, compression='gzip')

In [130]:
# List the distinct values found in the `type` field of `data`:
data.type.unique()

array(['gene_gene', 'gene', 'disease_search', 'ontology',
       'disease_gene_summary', 'ontology_ontology', 'disease',
       'disease_model_summary', 'mouse_model'], dtype=object)

In [3]:
import gzip
import json
import os

class IPMC_solr_parser(object):
    '''
    This class retrieves data from the IMPC solr API (PhenoDigm core).

    * Documents are requested in chunks of at most `rows` documents.
    * Every chunk is saved into the target folder as a gzip-compressed file
      containing one json document per line.
    '''

    # Select endpoint of the PhenoDigm solr core:
    ipmc_solr_host = 'http://www.ebi.ac.uk/mi/impc/solr/phenodigm/select'

    def __init__(self, target_folder, rows=20000, limit=None):
        """
        Storing basic values when initializing object

        Args:
        target_folder (string): Folder into which the data is saved.
        rows (int): Number of solr documents returned in a single query
        limit (int): Maximum number of returned documents. If None (or 0), all documents are returned

        Returns:
        None

        Raises:
        ValueError: if rows is smaller than 1 or limit is negative.
        """

        # A chunk size below one could never make progress, a negative limit is meaningless:
        if rows < 1:
            raise ValueError(f'rows must be a positive integer, got: {rows}')
        if limit is not None and limit < 0:
            raise ValueError(f'limit must not be negative, got: {limit}')

        self.rows = rows
        self.limit = limit
        self.target_folder = target_folder

    def fetch_data(self, data_type=None):
        """
        Fetching data to the specified location. If data type is not specified,
        all types are retrieved and the files are saved directly to the root folder.
        Otherwise the files are saved into the 'type.<data_type>' sub-folder.

        At most `limit` documents are saved (all of them if no limit was given).

        Args:
        data_type (string): data type to return match .type == 'data_type'

        Returns:
        None
        """

        # Based on the data type, we update the output folder:
        data_folder = f'{self.target_folder}/type.{data_type}' if data_type else self.target_folder

        # Create folder:
        os.makedirs(data_folder, exist_ok=True)

        # Initialize counters. `total` is the number of saved documents,
        # which is also the solr offset of the next document to request:
        total = 0
        chunk = 0

        # A falsy limit (None or 0) means no limit: it is replaced by numFound after the first query.
        limit = self.limit or None

        logging.info(f'Retrieving data from IPMC solr: {self.ipmc_solr_host}')
        logging.info(f'Retrieving {self.rows} documents at a time.')
        logging.info(f'Specified data type: {data_type}')

        # Retrieving data from IMPC:
        while True:

            # Never request more documents than the limit still allows:
            rows = self.rows if limit is None else min(self.rows, limit - total)

            # Retrieve data:
            solr_data = self.query_solr(total, data_type, rows=rows)
            docs = solr_data['response']['docs']
            num_found = solr_data['response']['numFound']

            # If limit is not set, return all data:
            if limit is None:
                limit = num_found

            # If there is nothing (more) to save, we break. The check on `docs` guards
            # against looping forever when an empty page is returned:
            if num_found == 0 or not docs:
                break

            # Write data to file, one json document per line:
            with gzip.open(f'{data_folder}/IMPC_solr_dump.{chunk:03}.json.gz', 'wt') as f:
                for doc in docs:
                    json.dump(doc, f)
                    f.write('\n')

            # Incrementing counters by what was actually received:
            chunk += 1
            total += len(docs)

            # Log progress
            logging.debug(f'Chunk {chunk} done. Number of retrieved documents: {total}.')

            # If the limit is reached or every document is retrieved, we exit:
            if total >= limit or total >= num_found:
                break

        logging.info(f'Retrieval finished. Number of documents: {total}')

    # Use @retry decorator to ensure that errors like the query failing because server was overloaded, are handled correctly and the request is retried
    @retry(tries=3, delay=5, backoff=1.2, jitter=(1, 3))
    def query_solr(self, start, data_type=None, rows=None):
        """
        Running a single query against the solr select endpoint.

        Args:
        start (int): Offset of the first document to return
        data_type (string): If given, only documents with this type are returned (fq=type:<data_type>)
        rows (int): Number of documents to request. If None, the `rows` value of the object is used

        Returns:
        dict: the parsed json response of the server

        Raises:
        requests.HTTPError: if the server answers with an erroneous HTTP status
        """

        # Building request:
        params = {
            'q': '*:*',
            'start': start,
            'rows': self.rows if rows is None else rows,
        }

        if data_type:
            params['fq'] = f'type:{data_type}'

        # Query
        r = requests.get(self.ipmc_solr_host, params=params, timeout=30)

        # Check for erroneous HTTP response statuses
        r.raise_for_status()
        return r.json()


# target_folder = '/Users/dsuveges/project/random_notebooks/issue-1284-Phenodigm_rewrite/cicaful'
# impc_solr_retriever = IPMC_solr_parser(rows=20000, limit=50000, target_folder=target_folder)
# impc_solr_retriever.fetch_data()


In [5]:
import json 

# Retrieving data for every listed data type (no limit is passed to the parser below):
# limit = 100000
target_folder = '/Users/dsuveges/project/random_notebooks/issue-1284-Phenodigm_rewrite/cicaful'

# Data types to fetch; each value is passed to fetch_data as `data_type`.
# NOTE(review): 'disease_search' is among the type values listed earlier in this notebook,
# but it is not part of this list - confirm the omission is intentional.
data_types = ['gene','gene_gene','mouse_model','disease_model_summary',
              'disease_gene_summary','disease','ontology_ontology','ontology']


# Initialize impc solr object:
impc_solr_retriever = IPMC_solr_parser(rows=20000, target_folder=target_folder)

# Fetch the data types one after the other, printing the type being processed:
for data_type in data_types:
    print(f'fetching: {data_type}')
    impc_solr_retriever.fetch_data(data_type=data_type)

fetching: gene
fetching: gene_gene
fetching: mouse_model
fetching: disease_model_summary
fetching: disease_gene_summary
fetching: disease
fetching: ontology_ontology
fetching: ontology


In [66]:
# Initialize impc solr object:
# This time with an explicit limit of 1,000,000 documents; `target_folder` comes from another cell.
impc_solr_retriever = IPMC_solr_parser(rows=20000, limit=1000000, target_folder=target_folder)
# Fetch only the documents of type 'mouse_model':
impc_solr_retriever.fetch_data(data_type='mouse_model')

In [6]:
%%bash

# Print the current working directory:
pwd

/Users/dsuveges/project/random_notebooks/issue-1284-Phenodigm_rewrite
