# Data Parsing

This code builds some helpful objects for the analysis out of the raw data files. It is provided mostly for reference.


In [None]:
import json, pickle
import pandas as pd

## 1. Parse Relevant Info From the `soft` File

To save space, data files are often stored compressed, and that is the case for most of the data included here. All of it needs to be extracted before the cells below are run.


In [None]:
# Parse the SOFT family file into plain dictionaries.
#
# Objects this cell leaves in the notebook namespace:
#   sample_key        sample name -> value of its "!Sample_title" line
#   probe_key         platform-table ID -> list of gene IDs (column 10 split on " /// "),
#                     or the raw column-5 string when the row has no column 10
#   expression_table  sample -> gene ID -> list of values from every probe mapped to it
#   transcript_table  sample -> probe ID -> column 1 of the sample table as a float
#   abs_call_table    sample -> probe ID -> 1 if column 2 is "P", otherwise 0
#   p_val_table       sample -> probe ID -> column 3 of the sample table as a float
with open("../../data/GSE8685_family.soft") as f:
    lines = f.readlines()

# Everything is in memory now, so parsing happens after the file is closed.
on_expression_table = False
on_platform_table = False
on_sample = False  # never read in this cell; kept so the notebook namespace is unchanged

sample = ""
sample_key = {}

probe_key = {}
expression_table = {}
p_val_table = {}
transcript_table = {}
abs_call_table = {}

for line in lines:

    line = line.strip()

    # Table delimiters only toggle which section the following rows belong to.
    if line == "!platform_table_begin":
        on_platform_table = True
        continue

    if line == "!platform_table_end":
        on_platform_table = False
        continue

    if line == "!sample_table_begin":
        on_expression_table = True
        continue

    if line == "!sample_table_end":
        on_expression_table = False
        continue

    # "KEY = VALUE" lines. Split only once so a value that itself contains
    # " = " (e.g. a sample title) is kept whole instead of being truncated.
    header = line.split(" = ", 1)

    if header[0] == "^SAMPLE":
        # A new sample starts: every per-sample table gets a fresh entry.
        sample = header[1]
        expression_table[sample] = {}
        transcript_table[sample] = {}
        p_val_table[sample] = {}
        abs_call_table[sample] = {}
        continue

    if header[0] == "!Sample_title":
        sample_key[sample] = header[1]
        continue

    fields = line.split("\t")

    if on_platform_table:
        # Column 10 exists only when the row has at least 11 fields. The
        # earlier strip() drops trailing empty columns, so shorter rows occur.
        # `<= 10` (rather than `< 10`) keeps a row with exactly 10 fields from
        # raising IndexError on fields[10].
        if len(fields) <= 10:
            if fields[5] == "":
                print(fields)
            probe_key[fields[0]] = fields[5]
        else:
            probe_key[fields[0]] = fields[10].split(" /// ")
        continue

    elif on_expression_table:
        if sample == "":
            # A data row was seen before any ^SAMPLE line; show it for debugging.
            print(fields)
        if fields[0] == "ID_REF":
            # Column-header row of the sample table.
            continue

        probe_id = fields[0]
        value = fields[1]
        abs_call = fields[2]
        p_val = fields[3]

        # A probe maps either to a list of gene IDs or to a single string.
        gene_names = probe_key[probe_id]
        if not isinstance(gene_names, list):
            gene_names = [gene_names]

        expression = float(value)
        for gene_name in gene_names:
            expression_table[sample].setdefault(gene_name, []).append(expression)

        p_val_table[sample][probe_id] = float(p_val)
        abs_call_table[sample][probe_id] = 1 if abs_call == "P" else 0
        transcript_table[sample][probe_id] = expression

## 2. Save the Transcripts Table


In [None]:
# Samples become columns and probe IDs become rows; any row with a missing
# value is dropped before the table is written out.
transcript_df = (
    pd.DataFrame(data=transcript_table)
    .dropna(axis=0, how="any")
)
transcript_df.to_csv("../../data/Microarray_Data.csv")

## 3. Find Which Transcripts Are Present and Save Them


In [None]:
# Bring in the defined experimental groups.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("../../data/experimental_groups.json") as groups_file:
    experimental_groups = json.load(groups_file)

In [None]:
# Turn the nested presence-call dict into a table: one column per sample,
# one row per probe ID, each cell a 0/1 call.
abs_call_df = pd.DataFrame(data=abs_call_table)
abs_call_df

Go through every transcript and keep the ones that have three "present" calls within at least one experimental group.

In [None]:
# A transcript is kept when its presence calls sum to 3 within at least one
# experimental group.
# NOTE(review): the literal 3 presumably equals the number of samples per
# group - confirm against experimental_groups.json.
#
# One row-wise sum per group replaces a per-transcript .loc lookup; the
# resulting set is the same.
present_mask = pd.Series(False, index=abs_call_df.index)
for condition in experimental_groups:
    group_calls = abs_call_df[experimental_groups[condition]]
    present_mask |= group_calls.sum(axis=1) == 3

present_transcripts = set(abs_call_df.index[present_mask])
len(present_transcripts)

Save them to a table after filtering

In [None]:
# Select rows with a boolean mask instead of passing the set to .loc:
#   * pandas >= 2.0 rejects a set as an indexer;
#   * probes removed from transcript_df by dropna() no longer raise KeyError;
#   * rows keep transcript_df's order, so the saved CSV is the same on every run.
present_transcripts_df = transcript_df.loc[transcript_df.index.isin(present_transcripts)]
present_transcripts_df.to_csv("../../data/expression_by_probe.csv")
present_transcripts_df

## 4. Save the Probe Key

In [None]:
# Make every probe_key value a list: entries that hold a single bare value
# are wrapped in a one-element list, entries that are already lists are kept.
for probe, mapped_genes in probe_key.items():
    if not isinstance(mapped_genes, list):
        probe_key[probe] = [mapped_genes]

In [None]:
# Write the probe key with a context manager so the file is flushed and
# closed as soon as the dump finishes.
with open("../../data/probe_key.p", "wb") as probe_key_file:
    pickle.dump(probe_key, probe_key_file)