# Wormbase Expression Cluster Data
Collect Data from Wormbase about Papers written for each `Live`, `Protein Coding Gene` Wormbase Id

---
Wormbase has the gene types listed below; we only look at `protein_coding_gene`.
##### Summary of Gene Types for all Wormbase IDs
```
protein_coding_gene      19,985
piRNA_gene               15,363
ncRNA_gene                8,487
pseudogene                2,131
gene                      1,525
tRNA_gene                   634
snoRNA_gene                 346
miRNA_gene                  261
lincRNA_gene                193
snRNA_gene                  129
antisense_lncRNA_gene       100
rRNA_gene                    22
scRNA_gene                    1
```

---
### Findings
* 4,028 Papers are sourced
* 106 protein coding genes do not have any references
* Wormbase ID Summary of Expression Cluster
```
count    19879.000000
mean       102.675084
std         59.409525
min          1.000000
25%         60.000000
50%         99.000000
75%        136.500000
max        451.000000
```

#### Notes



In [None]:
import numpy as np
import pandas as pd
import math
import requests
import json
import csv
import time
import os
from datetime import datetime

# Base output location.
# NOTE(review): not referenced by the cells visible in this notebook, which
# hard-code './output_data/output_expression_cluster/...' instead -- confirm
# whether this constant is still needed.
OUTPUT_DATA='./output_data/expression_output'

In [None]:
# Layout of one output record (one CSV row)
class expression_cluster_record:
    """Column layout shared by every row written to the output CSV.

    ``header`` holds the column names in file order, the four integer
    attributes are the matching positions inside a row list, and ``empty``
    is an all-``None`` template row (copy it before filling it in).
    """
    header = ['wormbase_id', 'cluster_id', 'cluster_label', 'description']
    # Positional index of each column, in the same order as ``header``.
    wormbase_id, cluster_id, cluster_label, description = range(4)
    empty = [None] * 4


# Short alias used by the processing functions.
output_record = expression_cluster_record

In [None]:
# Flatten the JSON response into a set of records
def wormbase_json_to_dataframe(json_data):
    """Flatten one decoded ``expression_cluster`` response into output rows.

    Despite the name, the result is a plain list of rows (each row a list
    laid out according to ``output_record``), not a pandas DataFrame.

    Args:
        json_data: Decoded JSON response. Must provide
            ``json_data["expression_cluster"]["data"]`` and
            ``json_data["name"]`` (written to the ``wormbase_id`` column).

    Returns:
        A list of rows. When ``data`` is ``None`` the list holds a single
        row with only ``wormbase_id`` filled in (and a message is printed);
        otherwise it holds one row per entry of ``data``.

    Raises:
        KeyError: If ``"expression_cluster"``, ``"data"`` or ``"name"`` is
            missing from ``json_data``.
    """
    cluster_entries = json_data["expression_cluster"]['data']
    wormbase_id = json_data["name"]

    if cluster_entries is None:
        # Keep a placeholder row so the gene still appears in the output.
        record = output_record.empty.copy()
        record[output_record.wormbase_id] = wormbase_id
        print(f'No expression_cluster data for {wormbase_id}')
        return [record]

    records = []
    for activity in cluster_entries:
        record = output_record.empty.copy()
        record[output_record.wormbase_id] = wormbase_id

        # A missing, null or empty value leaves the column as None instead of
        # raising and losing every record of this gene.
        cluster = activity.get('expression_cluster')
        if cluster:
            record[output_record.cluster_id] = cluster['id']
            record[output_record.cluster_label] = cluster['label']

        description = activity.get('description')
        if description:
            # NOTE(review): element 0 is taken, so 'description' is presumably
            # a list of strings; if the API returns a plain string this keeps
            # only its first character -- confirm against a real response.
            record[output_record.description] = description[0]

        records.append(record)

    return records


In [None]:
# Very simple HTTP call
def call_wormbase(wormbase_id, timeout=30):
    """Fetch the ``expression_cluster`` field of one gene from the WormBase REST API.

    Args:
        wormbase_id: Gene identifier inserted into the request URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the whole
            batch run indefinitely.

    Returns:
        A ``(status_code, json_data)`` tuple. ``json_data`` is the decoded
        response body; for a non-200 response whose body is not valid JSON
        it is an empty dict, so the caller can still report the status code.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
        ValueError: If a 200 response does not contain valid JSON.
    """
    api_url = f'http://rest.wormbase.org/rest/field/gene/{wormbase_id}/expression_cluster'
    response = requests.get(api_url, timeout=timeout)
    try:
        json_data = json.loads(response.text)
    except ValueError:
        # json.JSONDecodeError is a ValueError. A successful response that is
        # not JSON is a genuine failure; an error response with a non-JSON
        # body is still described well enough by its status code.
        if response.status_code == 200:
            raise
        json_data = {}
    return response.status_code, json_data

# Process the results
def process_response(status_code, json_data):
    """Turn one API response into output records, or report the failure.

    Args:
        status_code: HTTP status code of the response.
        json_data: Decoded JSON body of the response.

    Returns:
        The list of records produced by ``wormbase_json_to_dataframe`` when
        ``status_code`` is 200; otherwise ``None`` after printing the status
        code and, when the body carries one, its ``'reason'`` entry.
    """
    if status_code == 200:
        return wormbase_json_to_dataframe(json_data)

    print(f"Error code {status_code}")
    # The key must be the string 'reason'; the bare name `reason` is undefined
    # and raised NameError on every failed request.
    if isinstance(json_data, dict) and 'reason' in json_data:
        print(json_data['reason'])
    return None

# write the records to a file
def write_records(filename, records):
    """Append ``records`` to the CSV file ``filename``.

    The header row (``output_record.header``) is written first when the file
    does not exist yet or is empty. The parent directory is created if
    needed. The file is written as UTF-8, which is the default encoding
    ``pandas.read_csv`` uses when the file is read back.

    Args:
        filename: Path of the CSV file to create or append to.
        records: Iterable of rows (sequences of column values).
    """
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Append mode also creates the file; newline='' lets the csv module
    # control line endings itself.
    with open(filename, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(output_record.header)
        writer.writerows(records)



In [None]:
# iterate a collection of wormbase ids
def process_all(genes_to_evaluate_df, filename, delay_seconds=.4):
    """Query WormBase for every gene in a DataFrame and write the results to CSV.

    Any existing file at ``filename`` is deleted first. For each row, the
    value in the first column is used as the Wormbase ID; the response is
    converted to records and appended to ``filename``. A failure on one gene
    is printed and the loop continues with the next gene.

    Args:
        genes_to_evaluate_df: DataFrame whose first column holds Wormbase IDs.
        filename: Path of the output CSV file.
        delay_seconds: Pause after every request, successful or not, so a
            failing server is not hit again immediately.
    """
    if os.path.isfile(filename):
        os.remove(filename)

    df_len = len(genes_to_evaluate_df)
    for position, (_, row) in enumerate(genes_to_evaluate_df.iterrows(), start=1):
        try:
            # .iloc makes the positional lookup explicit; integer keys passed
            # to Series[...] are treated as labels by newer pandas releases.
            wormbase_id = row.iloc[0]
            print(f"processing {wormbase_id} {position:04} of {df_len}")
            status_code, json_data = call_wormbase(wormbase_id)
            records = process_response(status_code, json_data)
            if records:
                write_records(filename, records)
        except Exception as e:
            # Deliberately broad: this is the batch boundary, and one bad gene
            # must not abort a run that takes hours.
            print(f"An error occurred: {e}")
        finally:
            time.sleep(delay_seconds)
        


In [None]:
# Load the list of Wormbase IDs that will be queried.
# process_all() reads the ID from the first column of each row.
wormbase_ids_df = pd.read_csv('./input_data/WB_protein_coding_genes.csv') 
# Display the column names of the loaded file.
wormbase_ids_df.columns

In [None]:
# Output CSV path; process_all() deletes any existing file at this path before it starts.
filename = './output_data/output_expression_cluster/expression_cluster.csv'
### Uncomment to run (This will take approximately 3 hours)
### process_all(wormbase_ids_df,filename)

## Evaluate results
---
* 106 of 19,985 records did not have an associated expression_cluster returned (NA)
* These rows were dropped

---
* `cluster_id` and `cluster_label` had the same value for all rows; `cluster_id` was therefore dropped

In [None]:
# Load the saved expression cluster CSV and display it.
filename = './output_data/output_expression_cluster/expression_cluster.csv'
expression_cluster = pd.read_csv(filename)
expression_cluster

### Evaluate and remove rows

* The below code no longer executes as the data was removed
* Saving 86 MB on the disk

In [None]:
# result = expression_cluster['cluster_id'] == expression_cluster['cluster_label']
# false_values = result[result == False]
# false_rows = expression_cluster.loc[false_values.index]
# false_rows

In [None]:
# Expect 2041184 - 106 = 2041078 Rows after drop of NA

# expression_cluster = expression_cluster.dropna(subset=['cluster_id'])
# expression_cluster

In [None]:
# expression_cluster = expression_cluster.drop('cluster_id', axis=1)
# expression_cluster

In [None]:
# filename = './output_expression_cluster/expression_cluster.csv'
# expression_cluster.to_csv(filename,index=False)

In [None]:
# Count how many rows carry each cluster_label and show the first 50 counts.
expression_cluster_labels =expression_cluster['cluster_label'].value_counts()
expression_cluster_labels.head(50)

In [None]:
# Count the rows per wormbase_id, then summarise the distribution of those counts.
wormbase_id_values_counts =expression_cluster['wormbase_id'].value_counts()
wormbase_id_values_counts.describe()