# Data 
Input: 
gene_symbol_expression_data.txt (output of 'gene_expression_processing_pipeline.R'), vmh_gene_convert.txt (downloaded from the VMH website), 
recon3_genes.txt (gene list of the model file used) 

Output: 
combined_recon3_ID_expression_W5D2T0_W5D2T120.txt,
filtered_expression_data_W5D3T0.txt, 
filtered_expression_data_W5D3T120.txt, 
filtered_expression_data_W5D3T240.txt

In [1]:
#run this section to implement the three functions below: 
#note we will only be using the third function Recon2_GeneExpression_File

def Make_Recon2_List(file_name):
    """Read a model gene-list file and return its gene names as a list.

    Despite the "Recon2" in the name, nothing here is model-specific: the
    file is treated as plain text with one gene identifier per line.

    Parameters
    ----------
    file_name : str
        Path to a text file holding one gene identifier per line. The
        identifiers may be wrapped in single quotes.

    Returns
    -------
    list of str
        One entry per line, in file order, with the trailing newline and any
        leading/trailing single quotes removed. Blank lines yield ''.
    """
    # The context manager closes the handle even if reading fails; the handle
    # used to be left open for the garbage collector to clean up.
    with open(file_name, 'r') as model_file:
        # Strip the newline first so the closing quote is exposed to the
        # second strip.
        return [line.strip("\n").strip("'") for line in model_file]

def Make_Gene_Conversion_Dic(vmh_file,model_file):
    """Build a gene-symbol -> gene-ID lookup restricted to the model's genes.

    Parameters
    ----------
    vmh_file : str
        Path to a tab-separated conversion file whose first column is the
        gene ID and whose second column is the gene symbol.
    model_file : str
        Path to the model gene list; it is read with Make_Recon2_List.

    Returns
    -------
    dict
        Maps gene symbol -> gene ID for every conversion row whose ID occurs
        in the model gene list. If a symbol appears on several matching rows,
        the last row wins.
    """
    # A set makes each membership test constant-time instead of scanning the
    # whole gene list for every row of the conversion file.
    retain_genes = set(Make_Recon2_List(model_file))

    gene_conversion = {}
    # The context manager closes the conversion file when the loop is done.
    with open(vmh_file, 'r') as vmh_genelist:
        for entry in vmh_genelist:
            gene_info = entry.split("\t")
            # A row without a second column carries no symbol; skip it rather
            # than fail with IndexError.
            if len(gene_info) < 2:
                continue
            # Only IDs present in the model are kept.
            if gene_info[0] in retain_genes:
                gene_symbol = gene_info[1].strip("\n")
                gene_conversion[gene_symbol] = gene_info[0]
    return gene_conversion

def Recon2_GeneExpression_File(expression_data_file,vmh_gene_conversion,recon2_gene_list,outfile):
    """Write the expression rows whose gene symbol maps to a model gene ID.

    Parameters
    ----------
    expression_data_file : str
        Path to a tab-separated expression table with a header line; the
        first field of every following row is a gene symbol.
    vmh_gene_conversion : str
        Path to the tab-separated ID/symbol conversion file (see
        Make_Gene_Conversion_Dic).
    recon2_gene_list : str
        Path to the gene list of the model in use.
    outfile : str
        Path of the file to write. The header line is copied unchanged (so
        the first column keeps its original label), and each matching row is
        written with its symbol replaced by the gene ID.

    Returns
    -------
    list of str
        The gene symbols that were matched, one entry per matching row (a
        symbol that occurs on several rows is listed several times).
    """
    # Symbol -> ID lookup limited to the genes present in the model.
    conversion_dic = Make_Gene_Conversion_Dic(vmh_gene_conversion, recon2_gene_list)

    # Read the table once and close it; it used to be opened twice and never
    # closed.
    with open(expression_data_file, 'r') as exprs_file:
        gene_exprs = exprs_file.readlines()

    # Slicing (rather than indexing [0]) keeps an empty input file from
    # raising IndexError; the output is then empty as well.
    out_list = gene_exprs[:1]
    gene_match = []
    for gene in gene_exprs[1:]:
        # Split off the first field only; the rest of the row is copied as is.
        symbol_exprs = gene.split("\t", 1)
        # A row without a tab has no expression values to carry over.
        if len(symbol_exprs) < 2:
            continue
        symbol = symbol_exprs[0]
        if symbol in conversion_dic:
            gene_match.append(symbol)
            out_list.append(conversion_dic[symbol] + "\t" + symbol_exprs[1])

    # Closing the output guarantees the data is flushed to disk before any
    # later cell reads the file.
    with open(outfile, 'w') as o_file:
        o_file.writelines(out_list)
    return gene_match

In [2]:
matched_genes = Recon2_GeneExpression_File(
    'gene_symbol_expression_data.txt',  # expression table keyed by gene symbol
    'vmh_gene_convert.txt',             # gene ID / gene symbol conversion table
    'recon3_genes.txt',                 # genes present in the model
    'recon3_ID_expression.txt',         # output: expression table keyed by gene ID
)







In [3]:
# Show the gene symbols that were mapped to a model gene ID
# (one entry per matching expression row, so a symbol can repeat).
matched_genes

['B3GALT6',
 'PLCH2',
 'CA6',
 'H6PD',
 'PIK3CD',
 'NMNAT1',
 'PGD',
 'PLA2G5',
 'PLA2G2F',
 'CDA',
 'HS6ST1',
 'ALPL',
 'SDHD',
 'EXTL1',
 'PIGV',
 'AZIN2',
 'NDUFS5',
 'OXCT2',
 'CTPS1',
 'PPCS',
 'ST3GAL3',
 'ATP6V0B',
 'B4GALT2',
 'UROD',
 'AKR1A1',
 'UQCRH',
 'CYP4B1',
 'CYP4X1',
 'CYP4Z1',
 'CMPK1',
 'SLC5A9',
 'GPX7',
 'SCP2',
 'CPT2',
 'DIO1',
 'ACOT11',
 'FGGY',
 'PGM1',
 'AK4',
 'PDE4B',
 'CTH',
 'FPGT',
 'ACADM',
 'ST6GALNAC3',
 'ST6GALNAC5',
 'AK5',
 'HS2ST1',
 'ABCD3',
 'AGL',
 'SLC35A3',
 'AMY2B',
 'AMY2A',
 'AMY2A',
 'AMY2A',
 'AMPD2',
 'GSTM2',
 'GSTM1',
 'AHCYL1',
 'CEPT1',
 'CHIA',
 'ATP5F1',
 'ATP1A1',
 'HAO2',
 'HSD3B2',
 'HSD3B1',
 'PHGDH',
 'CA14',
 'SETDB1',
 'PIP5K1A',
 'NPR1',
 'SLC27A3',
 'AQP10',
 'ATP8B2',
 'FLAD1',
 'FDPS',
 'RHBG',
 'ATP1A2',
 'ATP1A4',
 'PPOX',
 'NDUFS2',
 'SDHC',
 'UAP1',
 'HSD17B7P2',
 'MGST3',
 'UCK2',
 'ATP1B1',
 'PRDX6',
 'FAM20B',
 'SOAT1',
 'PLA2G4A',
 'ATP2B4',
 'PFKFB2',
 'HSD11B1',
 'FLVCR1',
 'MIA3',
 'DEGS1',
 'GUK1',
 'GALNT2

In [5]:
import pandas as pd

# Location of the ID-converted expression table (tab-separated text).
expression_data_path = (
    '/Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/python_/week 6_/Convert_GeneSymbol_HUGO/recon3_ID_expression.txt'
)

# Parse the table into a DataFrame and preview the first five rows.
expression_data = pd.read_csv(filepath_or_buffer=expression_data_path, delimiter="\t")
print(expression_data.head(5))

     SYMBOL  GSM2346493_A01_NT001.W5D2T0.CEL.gz  \
0  126792.1                            5.525737   
1    9651.1                            5.486661   
2     765.1                            3.689322   
3    9563.1                            6.581042   
4    5293.1                            8.336759   

   GSM2346494_A01_NT010.W5D2T0.CEL.gz  GSM2346495_A01_NT018.W5D2T0.CEL.gz  \
0                            5.647605                            5.454347   
1                            5.720119                            5.812810   
2                            4.363882                            3.467474   
3                            6.339777                            6.572965   
4                            8.310441                            8.338452   

   GSM2346496_A01_NT025.W5D2T0.CEL.gz  GSM2346497_A01_NT034.W5D2T0.CEL.gz  \
0                            5.732654                            5.681445   
1                            5.688417                            5.857181   

In [4]:
import pandas as pd

# Path of the ID-converted expression table produced earlier in the notebook.
expression_data_path = (
    '/Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/python_/week 6_/Convert_GeneSymbol_HUGO/recon3_ID_expression.txt'
)
# The file is tab-delimited.
expression_data = pd.read_csv(filepath_or_buffer=expression_data_path, delimiter="\t")

# Preview the first five rows.
print(expression_data.head(5))



     SYMBOL  GSM2346493_A01_NT001.W5D2T0.CEL.gz  \
0  126792.1                            5.525737   
1    9651.1                            5.486661   
2     765.1                            3.689322   
3    9563.1                            6.581042   
4    5293.1                            8.336759   

   GSM2346494_A01_NT010.W5D2T0.CEL.gz  GSM2346495_A01_NT018.W5D2T0.CEL.gz  \
0                            5.647605                            5.454347   
1                            5.720119                            5.812810   
2                            4.363882                            3.467474   
3                            6.339777                            6.572965   
4                            8.310441                            8.338452   

   GSM2346496_A01_NT025.W5D2T0.CEL.gz  GSM2346497_A01_NT034.W5D2T0.CEL.gz  \
0                            5.732654                            5.681445   
1                            5.688417                            5.857181   

In [8]:
# List every column label of the loaded expression table under a heading.
print("Column Names:", expression_data.columns, sep="\n")

Column Names:
Index(['SYMBOL', 'GSM2346493_A01_NT001.W5D2T0.CEL.gz',
       'GSM2346494_A01_NT010.W5D2T0.CEL.gz',
       'GSM2346495_A01_NT018.W5D2T0.CEL.gz',
       'GSM2346496_A01_NT025.W5D2T0.CEL.gz',
       'GSM2346497_A01_NT034.W5D2T0.CEL.gz',
       'GSM2346498_A01_NT045.W5D2T0.CEL.gz',
       'GSM2346499_A01_NT056.W5D2T0.CEL.gz',
       'GSM2346500_A01_NT066.W5D2T0.CEL.gz',
       'GSM2346501_A01_NT077.W5D2T0.CEL.gz',
       ...
       'GSM2347705_H08_NT041.W5D2T120.CEL.gz',
       'GSM2347706_H08_NT141.W5D2T120.CEL.gz',
       'GSM2347707_H08_NT142.W5D2T120.CEL.gz',
       'GSM2347709_H09_NT041.W18D2T0.CEL.gz',
       'GSM2347710_H09_NT141.W18D2T0.CEL.gz',
       'GSM2347711_H09_NT142.W18D2T0.CEL.gz',
       'GSM2347712_H10_NT022.W18D3T120.CEL.gz',
       'GSM2347721_H12_NT041.W18D2T120.CEL.gz',
       'GSM2347722_H12_NT141.W18D2T120.CEL.gz',
       'GSM2347723_H12_NT142.W18D2T120.CEL.gz'],
      dtype='object', length=547)


In [6]:
# Keep the identifier column plus every sample column whose name ends with
# one of the two W5D2 time-point suffixes.
columns_to_keep = ['SYMBOL'] + [
    column
    for column in expression_data.columns
    if column.endswith(("W5D2T0.CEL.gz", "W5D2T120.CEL.gz"))
]

# Restrict the table to those columns.
filtered_expression_data = expression_data.loc[:, columns_to_keep]

# Write the reduced table as tab-separated text without the row index.
output_path = '/Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/python_/week 8/filtered_recon3_ID_expression.txt'
filtered_expression_data.to_csv(output_path, sep="\t", index=False)

print(f"Filtered data saved to {output_path}")
print(filtered_expression_data.head(5))

Filtered data saved to /Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/python_/week 8/filtered_recon3_ID_expression.txt
     SYMBOL  GSM2346493_A01_NT001.W5D2T0.CEL.gz  \
0  126792.1                            5.525737   
1    9651.1                            5.486661   
2     765.1                            3.689322   
3    9563.1                            6.581042   
4    5293.1                            8.336759   

   GSM2346494_A01_NT010.W5D2T0.CEL.gz  GSM2346495_A01_NT018.W5D2T0.CEL.gz  \
0                            5.647605                            5.454347   
1                            5.720119                            5.812810   
2                            4.363882                            3.467474   
3                            6.339777                            6.572965   
4                            8.310441            

In [8]:
import pandas as pd

# Read the ID-converted expression table (tab-separated).
expression_data_path = '/Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/python_/week 6_/Convert_GeneSymbol_HUGO/recon3_ID_expression.txt'
expression_data = pd.read_csv(filepath_or_buffer=expression_data_path, delimiter="\t")

# Show the available column labels.
print("Column Names:", expression_data.columns, sep="\n")

# --- W5D2T0: identifier column plus all columns ending in "W5D2T0.CEL.gz" ---
columns_to_keep_W5D2T0 = ['SYMBOL'] + [
    column for column in expression_data.columns if column.endswith("W5D2T0.CEL.gz")
]
filtered_expression_data_W5D2T0 = expression_data.loc[:, columns_to_keep_W5D2T0]

# Write the W5D2T0 subset as tab-separated text without the row index.
output_path_W5D2T0 = '/Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/python_/week 8/filtered_recon3_ID_expression_W5D2T0.txt'
filtered_expression_data_W5D2T0.to_csv(output_path_W5D2T0, sep="\t", index=False)

print(f"Filtered data for W5D2T0 saved to {output_path_W5D2T0}")
print(filtered_expression_data_W5D2T0.head(5))

# --- W5D2T120: identifier column plus all columns ending in "W5D2T120.CEL.gz" ---
columns_to_keep_W5D2T120 = ['SYMBOL'] + [
    column for column in expression_data.columns if column.endswith("W5D2T120.CEL.gz")
]
filtered_expression_data_W5D2T120 = expression_data.loc[:, columns_to_keep_W5D2T120]

# Write the W5D2T120 subset as tab-separated text without the row index.
output_path_W5D2T120 = '/Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/python_/week 8/filtered_recon3_ID_expression_W5D2T120.txt'
filtered_expression_data_W5D2T120.to_csv(output_path_W5D2T120, sep="\t", index=False)

print(f"Filtered data for W5D2T120 saved to {output_path_W5D2T120}")
print(filtered_expression_data_W5D2T120.head(5))


Column Names:
Index(['SYMBOL', 'GSM2346493_A01_NT001.W5D2T0.CEL.gz',
       'GSM2346494_A01_NT010.W5D2T0.CEL.gz',
       'GSM2346495_A01_NT018.W5D2T0.CEL.gz',
       'GSM2346496_A01_NT025.W5D2T0.CEL.gz',
       'GSM2346497_A01_NT034.W5D2T0.CEL.gz',
       'GSM2346498_A01_NT045.W5D2T0.CEL.gz',
       'GSM2346499_A01_NT056.W5D2T0.CEL.gz',
       'GSM2346500_A01_NT066.W5D2T0.CEL.gz',
       'GSM2346501_A01_NT077.W5D2T0.CEL.gz',
       ...
       'GSM2347705_H08_NT041.W5D2T120.CEL.gz',
       'GSM2347706_H08_NT141.W5D2T120.CEL.gz',
       'GSM2347707_H08_NT142.W5D2T120.CEL.gz',
       'GSM2347709_H09_NT041.W18D2T0.CEL.gz',
       'GSM2347710_H09_NT141.W18D2T0.CEL.gz',
       'GSM2347711_H09_NT142.W18D2T0.CEL.gz',
       'GSM2347712_H10_NT022.W18D3T120.CEL.gz',
       'GSM2347721_H12_NT041.W18D2T120.CEL.gz',
       'GSM2347722_H12_NT141.W18D2T120.CEL.gz',
       'GSM2347723_H12_NT142.W18D2T120.CEL.gz'],
      dtype='object', length=547)
Filtered data for W5D2T0 saved to /Users/douglas/Libra

In [9]:
# Build one table holding the W5D2T0 and W5D2T120 samples side by side (for MATLAB).
import pandas as pd

# Read the ID-converted expression table (tab-separated).
expression_data_path = '/Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/python_/week 6_/Convert_GeneSymbol_HUGO/recon3_ID_expression.txt'
expression_data = pd.read_csv(filepath_or_buffer=expression_data_path, delimiter="\t")

# Show the available column labels.
print("Column Names:", expression_data.columns, sep="\n")

# --- W5D2T0: identifier column plus all columns ending in "W5D2T0.CEL.gz" ---
columns_to_keep_W5D2T0 = ['SYMBOL'] + [
    column for column in expression_data.columns if column.endswith("W5D2T0.CEL.gz")
]
filtered_expression_data_W5D2T0 = expression_data.loc[:, columns_to_keep_W5D2T0]

output_path_W5D2T0 = '/Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/python_/week 8/filtered_recon3_ID_expression_W5D2T0.txt'
filtered_expression_data_W5D2T0.to_csv(output_path_W5D2T0, sep="\t", index=False)

print(f"Filtered data for W5D2T0 saved to {output_path_W5D2T0}")
print(filtered_expression_data_W5D2T0.head(5))

# --- W5D2T120: identifier column plus all columns ending in "W5D2T120.CEL.gz" ---
columns_to_keep_W5D2T120 = ['SYMBOL'] + [
    column for column in expression_data.columns if column.endswith("W5D2T120.CEL.gz")
]
filtered_expression_data_W5D2T120 = expression_data.loc[:, columns_to_keep_W5D2T120]

output_path_W5D2T120 = '/Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/python_/week 8/filtered_recon3_ID_expression_W5D2T120.txt'
filtered_expression_data_W5D2T120.to_csv(output_path_W5D2T120, sep="\t", index=False)

print(f"Filtered data for W5D2T120 saved to {output_path_W5D2T120}")
print(filtered_expression_data_W5D2T120.head(5))

# Read both subsets back from the files just written.
filtered_W5D2T0 = pd.read_csv(filepath_or_buffer=output_path_W5D2T0, delimiter="\t")
filtered_W5D2T120 = pd.read_csv(filepath_or_buffer=output_path_W5D2T120, delimiter="\t")

# Drop the first (identifier) column of the W5D2T120 table so it is not
# duplicated when the two tables are joined.
filtered_W5D2T120 = filtered_W5D2T120.iloc[:, 1:]

# Place the W5D2T120 sample columns to the right of the W5D2T0 table.
combined_data = pd.concat([filtered_W5D2T0, filtered_W5D2T120], axis="columns")

# Write the joined table as tab-separated text without the row index.
output_path_combined = '/Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/python_/week 8/combined_recon3_ID_expression_W5D2T0_W5D2T120.txt'
combined_data.to_csv(output_path_combined, sep="\t", index=False)

print(f"Combined data saved to {output_path_combined}")
print(combined_data.head(5))





Column Names:
Index(['SYMBOL', 'GSM2346493_A01_NT001.W5D2T0.CEL.gz',
       'GSM2346494_A01_NT010.W5D2T0.CEL.gz',
       'GSM2346495_A01_NT018.W5D2T0.CEL.gz',
       'GSM2346496_A01_NT025.W5D2T0.CEL.gz',
       'GSM2346497_A01_NT034.W5D2T0.CEL.gz',
       'GSM2346498_A01_NT045.W5D2T0.CEL.gz',
       'GSM2346499_A01_NT056.W5D2T0.CEL.gz',
       'GSM2346500_A01_NT066.W5D2T0.CEL.gz',
       'GSM2346501_A01_NT077.W5D2T0.CEL.gz',
       ...
       'GSM2347705_H08_NT041.W5D2T120.CEL.gz',
       'GSM2347706_H08_NT141.W5D2T120.CEL.gz',
       'GSM2347707_H08_NT142.W5D2T120.CEL.gz',
       'GSM2347709_H09_NT041.W18D2T0.CEL.gz',
       'GSM2347710_H09_NT141.W18D2T0.CEL.gz',
       'GSM2347711_H09_NT142.W18D2T0.CEL.gz',
       'GSM2347712_H10_NT022.W18D3T120.CEL.gz',
       'GSM2347721_H12_NT041.W18D2T120.CEL.gz',
       'GSM2347722_H12_NT141.W18D2T120.CEL.gz',
       'GSM2347723_H12_NT142.W18D2T120.CEL.gz'],
      dtype='object', length=547)
Filtered data for W5D2T0 saved to /Users/douglas/Libra

In [2]:
import pandas as pd

# Location of the gene-symbol expression table (tab-separated text).
expression_data_path = (
    '/Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/Week 9 GEO TAR for D3T240 /gene_symbol_expression_data.txt'
)

# Parse the table into a DataFrame and preview the first five rows.
expression_data = pd.read_csv(filepath_or_buffer=expression_data_path, delimiter="\t")
print(expression_data.head(5))

   PROBEID     SYMBOL                                           GENENAME  \
0  7896738     OR4G2P  olfactory receptor, family 4, subfamily G, mem...   
1  7896740      OR4F4  olfactory receptor, family 4, subfamily F, mem...   
2  7896742  LOC728323                          uncharacterized LOC728323   
3  7896744     OR4F29  olfactory receptor, family 4, subfamily F, mem...   
4  7896746      MT-TM            mitochondrially encoded tRNA methionine   

   GSM2346604_A09_NT001.W5D3T0.CEL.gz  GSM2346605_A09_NT010.W5D3T0.CEL.gz  \
0                            2.137871                            2.050746   
1                            2.694708                            2.601921   
2                            8.702197                            8.110280   
3                            3.783955                            3.454431   
4                            9.230565                            8.885978   

   GSM2346606_A09_NT018.W5D3T0.CEL.gz  GSM2346607_A09_NT025.W5D3T0.CEL.gz  \
0  

In [5]:
# Drop the annotation columns 'PROBEID' and 'GENENAME'.
# errors='ignore' keeps this cell re-runnable: it writes a file with the same
# name as the input table, so a table reloaded from that file may already lack
# these columns, and a plain drop would then raise KeyError.
df = expression_data.drop(columns=['PROBEID', 'GENENAME'], errors='ignore')

print(df.head())
# Save the reduced table as tab-separated text, without the row index, to
# 'gene_symbol_expression_data.txt' in the current working directory
# (an existing file of that name is overwritten).
df.to_csv('gene_symbol_expression_data.txt', sep='\t', index=False)

      SYMBOL  GSM2346604_A09_NT001.W5D3T0.CEL.gz  \
0     OR4G2P                            2.137871   
1      OR4F4                            2.694708   
2  LOC728323                            8.702197   
3     OR4F29                            3.783955   
4      MT-TM                            9.230565   

   GSM2346605_A09_NT010.W5D3T0.CEL.gz  GSM2346606_A09_NT018.W5D3T0.CEL.gz  \
0                            2.050746                            2.050834   
1                            2.601921                            2.613024   
2                            8.110280                            8.615337   
3                            3.454431                            2.857110   
4                            8.885978                            9.322546   

   GSM2346607_A09_NT025.W5D3T0.CEL.gz  GSM2346608_A09_NT034.W5D3T0.CEL.gz  \
0                            2.147566                            2.114126   
1                            2.285178                            2.653

In [6]:
#run this section to implement the three functions below: 
#note we will only be using the third function Recon2_GeneExpression_File

def Make_Recon2_List(file_name):
    """Read a model gene-list file and return its gene names as a list.

    Despite the "Recon2" in the name, nothing here is model-specific: the
    file is treated as plain text with one gene identifier per line.

    Parameters
    ----------
    file_name : str
        Path to a text file holding one gene identifier per line. The
        identifiers may be wrapped in single quotes.

    Returns
    -------
    list of str
        One entry per line, in file order, with the trailing newline and any
        leading/trailing single quotes removed. Blank lines yield ''.
    """
    # The context manager closes the handle even if reading fails; the handle
    # used to be left open for the garbage collector to clean up.
    with open(file_name, 'r') as model_file:
        # Strip the newline first so the closing quote is exposed to the
        # second strip.
        return [line.strip("\n").strip("'") for line in model_file]

def Make_Gene_Conversion_Dic(vmh_file,model_file):
    """Build a gene-symbol -> gene-ID lookup restricted to the model's genes.

    Parameters
    ----------
    vmh_file : str
        Path to a tab-separated conversion file whose first column is the
        gene ID and whose second column is the gene symbol.
    model_file : str
        Path to the model gene list; it is read with Make_Recon2_List.

    Returns
    -------
    dict
        Maps gene symbol -> gene ID for every conversion row whose ID occurs
        in the model gene list. If a symbol appears on several matching rows,
        the last row wins.
    """
    # A set makes each membership test constant-time instead of scanning the
    # whole gene list for every row of the conversion file.
    retain_genes = set(Make_Recon2_List(model_file))

    gene_conversion = {}
    # The context manager closes the conversion file when the loop is done.
    with open(vmh_file, 'r') as vmh_genelist:
        for entry in vmh_genelist:
            gene_info = entry.split("\t")
            # A row without a second column carries no symbol; skip it rather
            # than fail with IndexError.
            if len(gene_info) < 2:
                continue
            # Only IDs present in the model are kept.
            if gene_info[0] in retain_genes:
                gene_symbol = gene_info[1].strip("\n")
                gene_conversion[gene_symbol] = gene_info[0]
    return gene_conversion

def Recon2_GeneExpression_File(expression_data_file,vmh_gene_conversion,recon2_gene_list,outfile):
    """Write the expression rows whose gene symbol maps to a model gene ID.

    Parameters
    ----------
    expression_data_file : str
        Path to a tab-separated expression table with a header line; the
        first field of every following row is a gene symbol.
    vmh_gene_conversion : str
        Path to the tab-separated ID/symbol conversion file (see
        Make_Gene_Conversion_Dic).
    recon2_gene_list : str
        Path to the gene list of the model in use.
    outfile : str
        Path of the file to write. The header line is copied unchanged (so
        the first column keeps its original label), and each matching row is
        written with its symbol replaced by the gene ID.

    Returns
    -------
    list of str
        The gene symbols that were matched, one entry per matching row (a
        symbol that occurs on several rows is listed several times).
    """
    # Symbol -> ID lookup limited to the genes present in the model.
    conversion_dic = Make_Gene_Conversion_Dic(vmh_gene_conversion, recon2_gene_list)

    # Read the table once and close it; it used to be opened twice and never
    # closed.
    with open(expression_data_file, 'r') as exprs_file:
        gene_exprs = exprs_file.readlines()

    # Slicing (rather than indexing [0]) keeps an empty input file from
    # raising IndexError; the output is then empty as well.
    out_list = gene_exprs[:1]
    gene_match = []
    for gene in gene_exprs[1:]:
        # Split off the first field only; the rest of the row is copied as is.
        symbol_exprs = gene.split("\t", 1)
        # A row without a tab has no expression values to carry over.
        if len(symbol_exprs) < 2:
            continue
        symbol = symbol_exprs[0]
        if symbol in conversion_dic:
            gene_match.append(symbol)
            out_list.append(conversion_dic[symbol] + "\t" + symbol_exprs[1])

    # Closing the output guarantees the data is flushed to disk before any
    # later cell reads the file.
    with open(outfile, 'w') as o_file:
        o_file.writelines(out_list)
    return gene_match

In [7]:
matched_genes = Recon2_GeneExpression_File(
    'gene_symbol_expression_data.txt',  # expression table keyed by gene symbol
    'vmh_gene_convert.txt',             # gene ID / gene symbol conversion table
    'recon3_genes.txt',                 # genes present in the model
    'recon3_ID_expression.txt',         # output: expression table keyed by gene ID
)







In [8]:
# Show the gene symbols that were mapped to a model gene ID
# (one entry per matching expression row, so a symbol can repeat).
matched_genes

['B3GALT6',
 'PLCH2',
 'CA6',
 'H6PD',
 'PIK3CD',
 'NMNAT1',
 'PGD',
 'PLA2G5',
 'PLA2G2F',
 'CDA',
 'HS6ST1',
 'ALPL',
 'SDHD',
 'EXTL1',
 'PIGV',
 'AZIN2',
 'NDUFS5',
 'OXCT2',
 'CTPS1',
 'PPCS',
 'ST3GAL3',
 'ATP6V0B',
 'B4GALT2',
 'UROD',
 'AKR1A1',
 'UQCRH',
 'CYP4B1',
 'CYP4X1',
 'CYP4Z1',
 'CMPK1',
 'SLC5A9',
 'GPX7',
 'SCP2',
 'CPT2',
 'DIO1',
 'ACOT11',
 'FGGY',
 'PGM1',
 'AK4',
 'PDE4B',
 'CTH',
 'FPGT',
 'ACADM',
 'ST6GALNAC3',
 'ST6GALNAC5',
 'AK5',
 'HS2ST1',
 'ABCD3',
 'AGL',
 'SLC35A3',
 'AMY2B',
 'AMY2A',
 'AMY2A',
 'AMY2A',
 'AMPD2',
 'GSTM2',
 'GSTM1',
 'AHCYL1',
 'CEPT1',
 'CHIA',
 'ATP5F1',
 'ATP1A1',
 'HAO2',
 'HSD3B2',
 'HSD3B1',
 'PHGDH',
 'CA14',
 'SETDB1',
 'PIP5K1A',
 'NPR1',
 'SLC27A3',
 'AQP10',
 'ATP8B2',
 'FLAD1',
 'FDPS',
 'RHBG',
 'ATP1A2',
 'ATP1A4',
 'PPOX',
 'NDUFS2',
 'SDHC',
 'UAP1',
 'HSD17B7P2',
 'MGST3',
 'UCK2',
 'ATP1B1',
 'PRDX6',
 'FAM20B',
 'SOAT1',
 'PLA2G4A',
 'ATP2B4',
 'PFKFB2',
 'HSD11B1',
 'FLVCR1',
 'MIA3',
 'DEGS1',
 'GUK1',
 'GALNT2

In [9]:
import pandas as pd

# The ID-converted expression table is read from the working directory.
expression_data_path = 'recon3_ID_expression.txt'

# Parse the tab-separated table and preview the first five rows.
expression_data = pd.read_csv(filepath_or_buffer=expression_data_path, delimiter="\t")
print(expression_data.head(5))

     SYMBOL  GSM2346604_A09_NT001.W5D3T0.CEL.gz  \
0  126792.1                            5.845286   
1    9651.1                            5.924237   
2     765.1                            3.652925   
3    9563.1                            6.811685   
4    5293.1                            8.363492   

   GSM2346605_A09_NT010.W5D3T0.CEL.gz  GSM2346606_A09_NT018.W5D3T0.CEL.gz  \
0                            5.753453                            5.491933   
1                            5.701172                            5.754117   
2                            4.654936                            3.579182   
3                            6.572158                            6.789779   
4                            8.395187                            8.223681   

   GSM2346607_A09_NT025.W5D3T0.CEL.gz  GSM2346608_A09_NT034.W5D3T0.CEL.gz  \
0                            5.811422                            5.524017   
1                            5.616320                            5.911715   

In [11]:
# List every column label of the loaded expression table under a heading.
print("Column Names:", expression_data.columns, sep="\n")

Column Names:
Index(['SYMBOL', 'GSM2346604_A09_NT001.W5D3T0.CEL.gz',
       'GSM2346605_A09_NT010.W5D3T0.CEL.gz',
       'GSM2346606_A09_NT018.W5D3T0.CEL.gz',
       'GSM2346607_A09_NT025.W5D3T0.CEL.gz',
       'GSM2346608_A09_NT034.W5D3T0.CEL.gz',
       'GSM2346609_A09_NT045.W5D3T0.CEL.gz',
       'GSM2346610_A09_NT056.W5D3T0.CEL.gz',
       'GSM2346611_A09_NT066.W5D3T0.CEL.gz',
       'GSM2346612_A09_NT077.W5D3T0.CEL.gz',
       ...
       'GSM2347614_G12_NT064.W5D3T240.CEL.gz',
       'GSM2347615_G12_NT076.W5D3T240.CEL.gz',
       'GSM2347616_G12_NT084.W5D3T240.CEL.gz',
       'GSM2347617_G12_NT106.W5D3T240.CEL.gz',
       'GSM2347618_G12_NT125.W5D3T240.CEL.gz',
       'GSM2347619_G12_NT133.W5D3T240.CEL.gz',
       'GSM2347621_G12_NT148.W5D3T240.CEL.gz',
       'GSM2347648_H03_NT022.W5D3T0.CEL.gz',
       'GSM2347674_H05_NT022.W5D3T120.CEL.gz',
       'GSM2347687_H06_NT022.W5D3T240.CEL.gz'],
      dtype='object', length=204)


In [13]:
# Split the expression table into one file per sample type.
import pandas as pd

# The ID-converted expression table is read from the working directory.
expression_data_path = 'recon3_ID_expression.txt'
expression_data = pd.read_csv(filepath_or_buffer=expression_data_path, delimiter="\t")

# Show the available column labels.
print("Column Names:", expression_data.columns, sep="\n")

# Function to filter and save data for a specific sample type
def filter_and_save_data(sample_type):
    """Save the columns of one sample type to their own tab-separated file.

    Reads the module-level ``expression_data`` DataFrame, keeps the 'SYMBOL'
    column plus every column whose name ends with ``"<sample_type>.CEL.gz"``,
    writes the result (without the row index) to
    ``filtered_expression_data_<sample_type>.txt`` in the hardcoded output
    folder, and prints the output path and the first rows.

    Parameters
    ----------
    sample_type : str
        Sample label used as the column-name suffix, e.g. "W5D3T0".

    Returns
    -------
    None
    """
    suffix = f"{sample_type}.CEL.gz"
    columns_to_keep = ['SYMBOL'] + [
        column for column in expression_data.columns if column.endswith(suffix)
    ]

    # Only 'SYMBOL' in the list means no sample column matched. The file is
    # still written, as before, but the user is told it holds no expression
    # values instead of this passing silently.
    if len(columns_to_keep) == 1:
        print(f"Warning: no columns end with '{suffix}'; the saved file will contain only the SYMBOL column")

    filtered_expression_data = expression_data[columns_to_keep]

    output_path = f'/Users/douglas/Library/CloudStorage/OneDrive-UniversityCollegeCork/Msc Bioinformatics and Computational Biology/MB6303 Dissertion in Bioinformatics & Computational Biology/Practical Work/Week 9 GEO TAR for D3T240 /filtered_expression_data_{sample_type}.txt'
    filtered_expression_data.to_csv(output_path, sep="\t", index=False)

    print(f"Filtered data for {sample_type} saved to {output_path}")
    print(filtered_expression_data.head())

# Filter and save data for each sample type
# Sample labels to export, one output file per label.
# NOTE(review): "W5D2T240" is a D2 label while the visible column names of
# this table are all W5D3*; confirm it is intended and not a typo.
sample_types = [
    "W5D3T0",
    "W5D3T120",
    "W5D3T240",
    "W5D2T240",
]
for sample_type in sample_types:
    filter_and_save_data(sample_type)


Column Names:
Index(['SYMBOL', 'GSM2346604_A09_NT001.W5D3T0.CEL.gz',
       'GSM2346605_A09_NT010.W5D3T0.CEL.gz',
       'GSM2346606_A09_NT018.W5D3T0.CEL.gz',
       'GSM2346607_A09_NT025.W5D3T0.CEL.gz',
       'GSM2346608_A09_NT034.W5D3T0.CEL.gz',
       'GSM2346609_A09_NT045.W5D3T0.CEL.gz',
       'GSM2346610_A09_NT056.W5D3T0.CEL.gz',
       'GSM2346611_A09_NT066.W5D3T0.CEL.gz',
       'GSM2346612_A09_NT077.W5D3T0.CEL.gz',
       ...
       'GSM2347614_G12_NT064.W5D3T240.CEL.gz',
       'GSM2347615_G12_NT076.W5D3T240.CEL.gz',
       'GSM2347616_G12_NT084.W5D3T240.CEL.gz',
       'GSM2347617_G12_NT106.W5D3T240.CEL.gz',
       'GSM2347618_G12_NT125.W5D3T240.CEL.gz',
       'GSM2347619_G12_NT133.W5D3T240.CEL.gz',
       'GSM2347621_G12_NT148.W5D3T240.CEL.gz',
       'GSM2347648_H03_NT022.W5D3T0.CEL.gz',
       'GSM2347674_H05_NT022.W5D3T120.CEL.gz',
       'GSM2347687_H06_NT022.W5D3T240.CEL.gz'],
      dtype='object', length=204)
Filtered data for W5D3T0 saved to /Users/douglas/Library/