In [1]:
# WEEK 1: GENE ANALYZER (ALBERTO WICKER VERA)

In [2]:
# Importing Necesary Libraries
import os
import hashlib
import pandas as pd

In [13]:
# Loading gene_info file
file_path = os.path.expanduser('/Users/albertwv/Downloads/gene_info')

In [18]:
def compute_md5(file_path):
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(9606), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

In [19]:
def process_gene_info(file_path, chunk_size=500000):
    gene_counter = set()
    homo_sapiens_genes = set()
    gene_types = set()
    gene_type_counts = {}

    for chunk in pd.read_csv(file_path, sep='\t', chunksize=chunk_size, low_memory=False):
        # Update gene counters
        gene_counter.update(chunk['GeneID'])

        # Handling NaN values in 'description'
        chunk_filtered = chunk.dropna(subset=['description'])
        #homo_sapiens_genes.update(chunk_filtered[chunk_filtered['description'].str.contains('Homo Sapiens')]['GeneID'])
        homo_sapiens_genes.update(chunk_filtered[chunk_filtered['description'].str.contains('Homo Sapiens', case=False, na=False)]['GeneID'])

        # Updatinng gene types
        gene_types.update(chunk['type_of_gene'].dropna())

        # Counting gene types, handling NaNs
        for gene_type in chunk['type_of_gene'].dropna():
            gene_type_counts[gene_type] = gene_type_counts.get(gene_type, 0) + 1

    # Finding the most common gene type
    most_common_gene_type = max(gene_type_counts, key=gene_type_counts.get)

    return len(gene_counter), len(homo_sapiens_genes), gene_types, most_common_gene_type

In [20]:
# Computing MD5
md5_value = compute_md5(file_path)
print("MD5 value:", md5_value)

MD5 value: d09fea4c6462a4970a07343dc5496bf1


In [21]:
# Process gene info
total_genes, homo_sapiens_genes, gene_types, most_common_gene_type = process_gene_info(file_path)

print("Number of Listed Genes:", format(total_genes, ','))
print("Number of Listed Genes for Homo Sapiens:", format(homo_sapiens_genes, ','))
print("List of Gene Types:", ', '.join(gene_types))
print("Most Frequent Gene:", most_common_gene_type)

Number of Listed Genes: 52,966,526
Number of Listed Genes for Homo Sapiens: 8
List of Gene Types: rRNA, protein-coding, unknown, snoRNA, snRNA, tRNA, biological-region, ncRNA, pseudo, miscRNA, other, scRNA
Most Frequent Gene: protein-coding
