# Compute common genes

_Miglioramenti_ (tempo computazionale):
 * In Gene_Importances, un solo file per ogni strain contenente tutte le importance di ciascun test.
 * CommonGenes calcola una sola volta i geni comuni e prende volta per volta la giusta importance dell'attuale test.

In [101]:
import os, re, sys, math
import pandas as pd
import numpy as np
from collections import Counter


# Import custom libraries.
sys.path.append('Code/')
from dataset import load_and_preprocess
from utility import clean_ds_store

clean_ds_store()

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# nltk.download('punkt')
# nltk.download('stopwords')

In [99]:
def simplify_product_name(product_name, common_words):
    """
        Reduce a gene product description to its distinctive keywords.

        The name is lower-cased, every character that is neither a word
        character nor whitespace is replaced by a space, and the result is
        tokenized. Each distinct token is kept once, in order of first
        appearance, unless it starts with 'fig' or belongs to `common_words`.

        Returns the surviving tokens joined by single spaces.
    """
    normalised = re.sub(r'[^\w\s]', ' ', product_name.lower())

    kept = []
    already_seen = set()

    for token in word_tokenize(normalised):
        # Only the first occurrence of a token is considered.
        if token in already_seen:
            continue
        already_seen.add(token)

        if token.startswith('fig') or token in common_words:
            continue

        kept.append(token)

    return ' '.join(kept)


def common_genes(path):
    """
        Group the genes of every strain workbook found in `path` by simplified product name.

        Each '.xlsx' file in `path` is read as one strain; the strain name is the
        file name without its extension. Rows whose 'Type' is 'N', or whose
        'Product' is not a string or is 'N/A', are skipped.

        Parameters:
            path: directory holding one Excel workbook per strain.

        Returns:
            dict mapping each simplified product name to a DataFrame with the columns
            'Strain', 'Locus tag', 'Helix', 'Start', 'End' and 'Importance'.
            An empty dict is returned when `path` is not a directory.
    """
    columns = ['Strain', 'Locus tag', 'Helix', 'Start', 'End', 'Importance']

    if not os.path.isdir(path):  # Path must be the folder of a specific carbohydrate
        return {}

    # NOTE(review): `stop_words` is read from the enclosing (notebook) scope and is not
    # defined in the visible cells - presumably set(stopwords.words('english')); confirm.
    common_words = stop_words.union({'1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '2c'})

    # Rows are gathered in plain lists and turned into DataFrames once at the end:
    # concatenating a one-row DataFrame per gene is quadratic in the number of rows.
    records = {}

    for file in os.listdir(path):
        if not file.endswith('.xlsx'):
            continue

        strain = file[:-5]  # Drop the '.xlsx' extension
        strain_genes = pd.read_excel(os.path.join(path, file))

        for _, row in strain_genes.iterrows():
            if row['Type'] == 'N':
                continue

            raw_product = row['Product']
            # Rows without a usable product are skipped entirely, so they can never
            # be filed under the product of the previously processed row.
            if not isinstance(raw_product, str) or raw_product == 'N/A':
                continue

            product = re.sub(r'\([^)]*\)', '', raw_product)
            # Products such as 'spermidine/putrescine' contain a '/', which could
            # cause problems when the name is used for saving.
            product = re.sub(r'/', '-', product)
            product = simplify_product_name(product, common_words)

            records.setdefault(product, []).append(
                [strain, row['Locus tag'], row['Helix'], row['Start'], row['End'], row['Importance']]
            )

    return {product: pd.DataFrame(rows, columns=columns) for product, rows in records.items()}

In [102]:
# _, y = load_and_preprocess()
# NOTE(review): `y` is not assigned in this cell; it is presumably left over from an
# earlier execution of the line above - confirm before running on a fresh kernel.

output = pd.DataFrame(columns=['Gene family', 'Positive importance', 'Negative importance', 'Ratio'])
path = 'Result/Coverages/Gene_Importances/'
output_path = 'Result/Coverages/Common_Genes/'

# Make sure the destination folder exists before any workbook is written.
os.makedirs(output_path, exist_ok=True)

y.columns = y.columns.str.strip() # Strip stray whitespace from the column names

# Cosine of the angle between the vector (1, 1) and the positive-importance axis.
# It does not depend on the gene family, so it is computed once.
B = 1 / math.sqrt(1**2 + 1**2)

result_columns = ['Gene family', 'Positive importance', 'Negative importance', 'Ratio', 'A', 'B', 'B-A']

for carbohydrate in os.listdir(path):

    print(f'Calculating common genes for carbohydrate {carbohydrate}.', end=' ')

    test_path = os.path.join(path, carbohydrate)
    rows = []

    # Strains labelled 1 / 0 for the current carbohydrate.
    positive_strain = y[y[carbohydrate] == 1].index
    negative_strain = y[y[carbohydrate] == 0].index

    genes = common_genes(test_path)
    print(f"Genes foundend: {len(genes)}")

    # save_common_genes(genes, carbohydrate)

    for gene_family, dataframe in genes.items():
        positive_rows = dataframe[dataframe['Strain'].isin(positive_strain)]
        negative_rows = dataframe[dataframe['Strain'].isin(negative_strain)]

        # Mean importance of the family among positive and negative strains
        # (NaN when the family has no row in that group).
        p = positive_rows['Importance'].mean()
        n = negative_rows['Importance'].mean()

        ratio = p/n
        A = p / math.sqrt(p**2 + n**2) # Cosine of the angle between the vector (p, n) and the positive-importance axis
        diff_AB = B - A

        row = {
            'Gene family': gene_family,
            'Positive importance': p,
            'Negative importance': n,
            'Ratio': ratio,
            'A': A,
            'B': B,
            'B-A': diff_AB
        }

        # display(pd.DataFrame([row]))

        rows.append(row)

    # Passing the column list keeps the sort keys available even when no gene
    # family was found, so an empty result is written instead of raising KeyError.
    output = pd.DataFrame(rows, columns=result_columns).sort_values(by=['Positive importance','Negative importance'], ascending=[False,False])

    with pd.ExcelWriter(f'{os.path.join(output_path, carbohydrate)}.xlsx') as writer:
        output.to_excel(writer, index=False)

print(f'\nCommon genes importances saved in {output_path}')

Calculating common genes for carbohydrate D-MELibiose. Genes foundend: 16803
Calculating common genes for carbohydrate D-MANnitol. Genes foundend: 16803
Calculating common genes for carbohydrate D-MaNnosE. Genes foundend: 16069
Calculating common genes for carbohydrate ARButin. Genes foundend: 16803
Calculating common genes for carbohydrate D-RAFfinose. Genes foundend: 16803
Calculating common genes for carbohydrate D-CELlobiose. Genes foundend: 16803
Calculating common genes for carbohydrate D-RIBose. Genes foundend: 16783
Calculating common genes for carbohydrate L-ARAbinose. Genes foundend: 16803
Calculating common genes for carbohydrate N-AcetylGlucosamine. Genes foundend: 16803
Calculating common genes for carbohydrate D-LACtose (bovine origin). Genes foundend: 16766
Calculating common genes for carbohydrate D-XYLose. Genes foundend: 16803
Calculating common genes for carbohydrate D-SACcharose (sucrose). Genes foundend: 16803
Calculating common genes for carbohydrate D-GALactose. 

## Esempio stringhe da preprocessare

In [None]:
# Sample gene product strings to preprocess. They exercise the awkward cases:
# leading '(...)' / '[...]' groups, nested brackets, percent-encoded commas ('%2C'),
# names starting with digits, a '/' inside the name, and exact duplicates.
products = [
    '(2E%2C6E)-farnesyl diphosphate synthase',
    '(2E%2C6E)-farnesyl- diphosphate-specific ditrans%2Cpolycis-undecaprenyl-diphosphate synthase',
    '(2Fe-2S)-binding protein',
    '(2Fe-2S)-binding protein',
    '(S)-ureidoglycine aminohydrolase',
    '[acyl-carrier-protein] S-malonyltransferase',
    '[protein-PII] uridylyltransferase',
    '1-(5-phosphoribosyl)-5-[(5-phosphoribosylamino)methylideneamino]imidazole-4-carboxamide isomerase',
    '1-acyl-sn-glycerol-3-phosphate acyltransferase',
    '1-acyl-sn-glycerol-3-phosphate acyltransferase',
    '1-deoxy-D-xylulose-5-phosphate reductoisomerase',
    '1-deoxy-D-xylulose-5-phosphate synthase',
    '1-phosphofructokinase',
    '1%2C4-alpha-glucan branching enzyme',
    '16S rRNA (adenine(1518)-N(6)/adenine(1519)-N(6))-dimethyltransferase'
]

# Old version

In [None]:
def process_string(stringa):
    """
        Shorten a product name by removing its bracketed parts.

        Leading groups that start with '(' or '[' are dropped one after the other,
        each up to and including the first ')' or ']' that follows. The remainder
        is then cut just before its first '(' or '['.

        Parameters:
            stringa: the product name to shorten.

        Returns:
            The shortened name. Surrounding whitespace is not stripped. A string
            that starts with an opening bracket which is never closed is returned
            as it is at that point.
    """
    # Drop every leading bracketed group.
    while stringa.startswith(('[', '(')):
        closings = [i for i in (stringa.find(')'), stringa.find(']')) if i != -1]

        if not closings:
            # Unbalanced opening bracket: there is nothing that can be removed safely
            # (and retrying on the same string would never terminate).
            return stringa

        # '+ 1' also skips the closing bracket itself, whichever kind comes first.
        stringa = stringa[min(closings) + 1:]

    # Cut the remainder right before the first opening bracket, if there is one.
    openings = [i for i in (stringa.find('('), stringa.find('[')) if i != -1]

    return stringa[:min(openings)] if openings else stringa
               

def save_common_genes(genes, carbohydrate_name):
    """
        Write one Excel workbook per gene family for the given carbohydrate.

        Parameters:
            genes: dict mapping a gene family name to the DataFrame of its genes.
            carbohydrate_name: name of the sub-folder of 'Result/Coverages/Common_Genes/'
                in which the workbooks are written; it is created when missing.
    """
    path = f'Result/Coverages/Common_Genes/{carbohydrate_name}/'

    # exist_ok avoids the check-then-create race and makes repeated runs harmless.
    os.makedirs(path, exist_ok=True)

    for gene_family, dataframe in genes.items():

        # Very long names are shortened before being used as a file name
        # (presumably to stay within the file-name length limit - confirm).
        if len(gene_family) >= 250:
            gene_family = process_string(gene_family)

        with pd.ExcelWriter(f'{os.path.join(path, gene_family)}.xlsx') as writer:
            dataframe.to_excel(writer, index=False)
        

        
def common_genes(path):
    """
        Group the genes of every strain workbook found in `path` by product name (old version).

        Each '.xlsx' file in `path` is read as one strain; the strain name is the
        file name without its extension. Only rows with an 'Importance' greater
        than 0 and a string 'Product' different from 'N/A' are kept.

        Parameters:
            path: directory holding one Excel workbook per strain.

        Returns:
            dict mapping each lower-cased product name to a DataFrame with the columns
            'Strain', 'Locus tag', 'Helix', 'Start', 'End' and 'Importance'.
            An empty dict is returned when `path` is not a directory.
    """
    columns = ['Strain', 'Locus tag', 'Helix', 'Start', 'End', 'Importance']

    if not os.path.isdir(path):
        return {}

    # Rows are gathered in plain lists and turned into DataFrames once at the end:
    # concatenating a one-row DataFrame per gene is quadratic in the number of rows.
    records = {}

    for file in os.listdir(path):
        if not file.endswith('.xlsx'):
            continue

        strain = file[:-5]  # Drop the '.xlsx' extension
        strain_genes = pd.read_excel(os.path.join(path, file)).set_index('Sequence')

        for _, row in strain_genes.iterrows():
            # Written as a negated '>' so that a missing (NaN) importance is skipped too.
            if not row['Importance'] > 0:
                continue

            raw_product = row['Product']
            # Rows without a usable product are skipped entirely, so they can never
            # be filed under the product of the previously processed row.
            if not isinstance(raw_product, str) or raw_product == 'N/A':
                continue

            product = re.sub(r'\([^)]*\)', '', raw_product)
            # Products such as 'spermidine/putrescine' contain a '/', which could
            # cause problems when the name is used for saving.
            product = re.sub(r'/', '-', product)
            product = product.lower()

            records.setdefault(product, []).append(
                [strain, row['Locus tag'], row['Helix'], row['Start'], row['End'], row['Importance']]
            )

    return {product: pd.DataFrame(rows, columns=columns) for product, rows in records.items()}





_, y = load_and_preprocess()

output = pd.DataFrame(columns=['Gene family', 'Positive importance', 'Negative importance', 'Ratio'])
path = 'Result/Coverages/Gene_Importances/'
output_path = 'Result/Coverages/Common_Genes/'

# Make sure the destination folder exists before any workbook is written.
os.makedirs(output_path, exist_ok=True)

y.columns = y.columns.str.strip() # Strip stray whitespace from the column names

# Cosine of the angle between the vector (1, 1) and the positive-importance axis.
# It does not depend on the gene family, so it is computed once.
B = 1 / math.sqrt(1**2 + 1**2)

result_columns = ['Gene family', 'Positive importance', 'Negative importance', 'Ratio', 'A', 'B', 'B-A']

for carbohydrate in os.listdir(path):

    test_path = os.path.join(path, carbohydrate)
    rows = []

    # Strains labelled 1 / 0 for the current carbohydrate.
    positive_strain = y[y[carbohydrate] == 1].index
    negative_strain = y[y[carbohydrate] == 0].index

    print(f'Calculating common genes for carbohydrate {carbohydrate}...', end='\t')
    genes = common_genes(test_path)
    print('common genes founded.')

    # save_common_genes(genes, carbohydrate)

    for gene_family, dataframe in genes.items():
        positive_rows = dataframe[dataframe['Strain'].isin(positive_strain)]
        negative_rows = dataframe[dataframe['Strain'].isin(negative_strain)]

        p = positive_rows['Importance'].mean()
        n = negative_rows['Importance'].mean()

        # Reset for every family: without this, a family that fails the check below
        # would silently inherit the A / B-A values of the previous family.
        ratio = None
        A = None
        diff_AB = None

        # Only gene families with a positive mean importance among negative strains get a score.
        if n > 0:
            ratio = p/n
            A = p / math.sqrt(p**2 + n**2) # Cosine of the angle between the vector (p, n) and the positive-importance axis
            diff_AB = B - A

        row = {
            'Gene family': gene_family,
            'Positive importance': p,
            'Negative importance': n,
            'Ratio': ratio,
            'A': A,
            'B': B,
            'B-A': diff_AB
        }

        rows.append(row)

    # Passing the column list keeps the sort keys available even when no gene
    # family was found, so an empty result is written instead of raising KeyError.
    output = pd.DataFrame(rows, columns=result_columns).sort_values(by=['B-A','Positive importance'], ascending=[False,False])

    with pd.ExcelWriter(f'{os.path.join(output_path, carbohydrate)}.xlsx') as writer:
        output.to_excel(writer, index=False)

print(f'Common genes importances saved in {output_path}')