# PROTEIN CLASSIFICATION ANALYSIS

This notebook analyzes the ChEMBL database schema in order to classify the targets into protein families.
We focus on the **SINGLE PROTEIN** targets.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import plotly.express as px
import json
import ipywidgets as widgets
from IPython.display import display
import plotly.graph_objects as go
import matplotlib.colors as mcolors
import re

%matplotlib inline

In [4]:
# Protein families of interest. The keys are the lowercase search terms that the
# classification code looks for in the target annotations. The values appear to
# be display labels; only the keys are used by the code visible in this notebook.
target_classes = {
    'cytochrome p450': 'Cytochrome P450',
    'ion channel': 'Ion Channel',
    'g protein-coupled': 'G Protein-Coupled',
    'kinase': 'Kinase',
    'toll-like': 'Toll-Like',
    'protease': 'Protease',
    'nuclear receptor': 'Nuclear Receptor',
    'methyltransferase': 'Methyltransferase',
    'demethylase': 'Demethylase',
    'histone deacetylase': 'Histone Deacetylase',
    'histone acetyltransferase': 'Histone Acetyltransferase',
    'bromodomain': 'Bromodomain',
    'protein tau': 'Protein Tau',
    'heat shock': 'Heat Shock',
    'epigenetic regulator': 'Epigenetic Regulator',
    'voltage-gated': 'Voltage-gated'  # relabelled to 'ion channel' after classification
}

# Fallback pattern: one or more digits, the literal "tm", one or more digits.
# assign_family returns 'g protein-coupled' for text that matches it when no
# key of target_classes matched first.
regex_pattern = r'\d+tm\d+'

# Target annotation table (presumably exported from ChEMBL - confirm). The cells
# below read its columns protein_class_name, protein_class_description,
# pref_name and chembl_id.
df = pd.read_csv('/home/federica/LAB2/protein_class.csv')

## File saving 

Two separate files are generated: the first, **'family_protein_with.csv'**, contains only the targets belonging to the selected protein families; the second, **'family_protein_without.csv'**, contains the targets of all the other families found in ChEMBL.

In [5]:
def assign_family(row, target_classes, regex_pattern):
    """Return the protein-family search term that matches a target row.

    The class name, preferred name and class description of ``row`` are
    lower-cased (missing fields count as empty text) and joined with spaces.
    The keys of ``target_classes`` are then tried in iteration order as
    whole-word matches, and the first hit is returned unchanged.

    Parameters
    ----------
    row : mapping or pandas.Series
        May contain 'protein_class_name', 'pref_name' and
        'protein_class_description'.
    target_classes : iterable of str
        Search terms; iterating a dict yields its keys.
    regex_pattern : str
        Fallback regular expression tried when no search term matched.

    Returns
    -------
    str or None
        The matching search term, 'g protein-coupled' when only the fallback
        pattern matched, otherwise None.
    """
    text_fields = ('protein_class_name', 'pref_name', 'protein_class_description')
    combined_text = ' '.join(
        str(row[field]).lower() if field in row else ''
        for field in text_fields
    )

    for target_class in target_classes:
        # Word boundaries keep e.g. 'kinase' from matching inside a longer word.
        whole_word = r'\b' + re.escape(target_class.lower()) + r'\b'
        if re.search(whole_word, combined_text):
            return target_class

    return 'g protein-coupled' if re.search(regex_pattern, combined_text) else None

def _activity_type(description):
    """Return 'writer', 'eraser' or 'reader' (first keyword found, in that order), else 'other'."""
    # Missing descriptions arrive as NaN (a float); `'writer' in nan` would raise
    # TypeError, so anything that is not text is classified as 'other'.
    if not isinstance(description, str):
        return 'other'
    for keyword in ('writer', 'eraser', 'reader'):
        if keyword in description:
            return keyword
    return 'other'


# Classify every target: 'family' holds the matched search term or None.
df['family'] = df.apply(lambda row: assign_family(row, target_classes, regex_pattern), axis=1)
df['type_activity'] = df['protein_class_description'].apply(_activity_type)

# Keep only the targets that matched one of the selected families.
df_filtered = df[df['family'].notna()].copy()
# 'voltage-gated' hits are reported under the 'ion channel' family.
df_filtered.loc[df_filtered['family'] == 'voltage-gated', 'family'] = 'ion channel'

# type_activity only drives the sort order; it is not part of the exported columns.
df_sorted = df_filtered.sort_values(
    by=['family', 'type_activity', 'protein_class_description', 'pref_name'],
    ascending=[True, True, True, True],
)
columns_order = ['family', 'protein_class_description', 'protein_class_name', 'pref_name', 'chembl_id']
df_final = df_sorted[columns_order]
df_final.to_csv('/home/federica/LAB2/family_protein_with.csv', index=False)


In [6]:
# Targets whose ChEMBL ID is not among the selected-family targets.
in_selected_family = df['chembl_id'].isin(df_filtered['chembl_id'])
df_cleaned = (
    df.loc[
        ~in_selected_family,
        ['protein_class_description', 'protein_class_name', 'pref_name', 'chembl_id'],
    ]
    .sort_values(by='protein_class_description', ascending=True)
)
df_cleaned.to_csv('/home/federica/LAB2/family_protein_without.csv', index=False)

## Creating statistics for classification into protein families

Once the mutations have been studied and the data have been divided according to quality, statistical analyses are performed on the membership of the molecules in the protein families.

In [7]:
def load_and_concat(folder_path):
    """Read every ``.csv`` file in a folder and stack them into one DataFrame.

    Files are read in sorted file-name order so that the row order of the
    result is reproducible (``os.listdir`` returns entries in arbitrary order).

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan; sub-directories are not descended into.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all CSV files, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` file.
    """
    csv_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.csv')
    )
    if not csv_names:
        # pd.concat([]) would also raise ValueError, but without naming the folder.
        raise ValueError(f"No .csv files found in {folder_path!r}")

    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]
    return pd.concat(frames, ignore_index=True)

In [8]:
# Root folder shared by all the per-group input directories below.
_protein_data_root = '/home/federica/LAB2/data_202410161138_protein'

# wild type files
wild_type_1 = f'{_protein_data_root}/wild_type/wild_type_1'
wild_type_2 = f'{_protein_data_root}/wild_type/wild_type_2'
wild_type_3 = f'{_protein_data_root}/wild_type/wild_type_3'

# mutation target files
mutation_target_1 = f'{_protein_data_root}/mutation_target/mutation_target_1'
mutation_target_2 = f'{_protein_data_root}/mutation_target/mutation_target_2'
mutation_target_3 = f'{_protein_data_root}/mutation_target/mutation_target_3'

# mixed files
mixed_1 = f'{_protein_data_root}/mixed/mixed_1'
mixed_2 = f'{_protein_data_root}/mixed/mixed_2'
mixed_3 = f'{_protein_data_root}/mixed/mixed_3'

In [9]:
# Load every group folder into a DataFrame.
# NOTE: each name is rebound from its directory path (str) to the loaded
# DataFrame, so the path-definition cell must be re-run before this cell can be
# executed a second time.
wild_type_1, wild_type_2, wild_type_3 = map(
    load_and_concat, (wild_type_1, wild_type_2, wild_type_3)
)

mutation_target_1, mutation_target_2, mutation_target_3 = map(
    load_and_concat, (mutation_target_1, mutation_target_2, mutation_target_3)
)

mixed_1, mixed_2, mixed_3 = map(
    load_and_concat, (mixed_1, mixed_2, mixed_3)
)


In [10]:
def calculate_protein_stats(df, target_classes, molecule_column='Molecule ChEMBL ID'):
    """Count records, targets and molecules for each protein class found in ``df``.

    A row belongs to a class when the class search term occurs
    (case-insensitively, as a literal substring) in its 'pref_name',
    'protein_class_name' or 'protein_class_description'. Each class is
    evaluated independently, so one row can be counted in several classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'pref_name', 'protein_class_name',
        'protein_class_description' and 'Target ChEMBL ID'.
    target_classes : iterable of str
        Search terms; iterating a dict yields its keys.
    molecule_column : str, default 'Molecule ChEMBL ID'
        Column holding molecule identifiers. When it is absent from ``df``,
        'molecule_count' is None.

    Returns
    -------
    pandas.DataFrame
        One row per class with at least one matching record, with columns
        'protein_class_name' (the search term), 'count', 'target_count',
        'molecule_count' and 'targets' (list of unique target IDs). The columns
        are present even when no class matched.
    """
    text_columns = ('pref_name', 'protein_class_name', 'protein_class_description')
    output_columns = ['protein_class_name', 'count', 'target_count', 'molecule_count', 'targets']
    has_molecule_column = molecule_column in df.columns

    rows = []
    for protein_class in target_classes:
        # str.contains treats its pattern as a regular expression; escaping keeps
        # search terms containing characters such as '(' or '+' literal.
        pattern = re.escape(protein_class)
        in_pref_name, in_class_name, in_description = (
            df[column].str.contains(pattern, case=False, na=False)
            for column in text_columns
        )
        class_df = df[in_pref_name | in_class_name | in_description]
        if class_df.empty:
            continue

        target_ids = class_df['Target ChEMBL ID']
        rows.append({
            'protein_class_name': protein_class,
            'count': class_df.shape[0],
            'target_count': target_ids.nunique(),
            'molecule_count': class_df[molecule_column].nunique() if has_molecule_column else None,
            'targets': target_ids.unique().tolist(),
        })

    # Passing the columns explicitly keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=output_columns)

In [11]:

# Per-family statistics for every group, computed with the same search terms.
wild_type_1_stats, wild_type_2_stats, wild_type_3_stats = (
    calculate_protein_stats(group_frame, target_classes)
    for group_frame in (wild_type_1, wild_type_2, wild_type_3)
)

mutation_target_1_stats, mutation_target_2_stats, mutation_target_3_stats = (
    calculate_protein_stats(group_frame, target_classes)
    for group_frame in (mutation_target_1, mutation_target_2, mutation_target_3)
)

mixed_1_stats, mixed_2_stats, mixed_3_stats = (
    calculate_protein_stats(group_frame, target_classes)
    for group_frame in (mixed_1, mixed_2, mixed_3)
)


In [12]:
def create_interactive_table(df, title="Interactive Table"):
    """Show ``df`` as a Plotly table, leaving out the 'targets' and 'count' columns.

    The header labels and the cell values are both derived from the same list
    of columns, so a header can never end up above the wrong data when ``df``
    has extra or reordered columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to display.
    title : str, default "Interactive Table"
        Figure title, centred above the table.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    hidden_columns = ('targets', 'count')
    shown_columns = [col for col in df.columns if col not in hidden_columns]

    fig = go.Figure(data=[go.Table(
        header=dict(values=shown_columns,
                    fill_color='paleturquoise',
                    align='left'),
        cells=dict(values=[df[col] for col in shown_columns],
                   fill_color='lavender',
                   align='left'))
    ])

    fig.update_layout(
        title=title,
        title_x=0.5
    )

    fig.show()

In [13]:
# Tag every statistics table with the name of the group it describes.
labelled_stats = {
    'Wild Type 1': wild_type_1_stats,
    'Wild Type 2': wild_type_2_stats,
    'Wild Type 3': wild_type_3_stats,
    'Mutation Target 1': mutation_target_1_stats,
    'Mutation Target 2': mutation_target_2_stats,
    'Mutation Target 3': mutation_target_3_stats,
    'Mixed 1': mixed_1_stats,
    'Mixed 2': mixed_2_stats,
    'Mixed 3': mixed_3_stats,
}
for group_label, group_stats in labelled_stats.items():
    group_stats['Group'] = group_label

combined_df = pd.concat(list(labelled_stats.values()), ignore_index=True)

# Move 'Group' to the front, keeping the other columns in their current order.
cols = ['Group', *(col for col in combined_df.columns if col != 'Group')]
combined_df = combined_df[cols]

# Keep the label only on the first row of each run of identical groups.
starts_new_group = combined_df['Group'].ne(combined_df['Group'].shift())
combined_df['Group'] = combined_df['Group'].where(starts_new_group, "")

In [None]:
# Display the per-group, per-family statistics assembled in the previous cell.
create_interactive_table(combined_df, title="Combined Protein Class Statistics")

## Table with activity data and mutation information

In [16]:
# Target annotation table (the same file that was read into `df` in the first
# data cell) and the filtered activity dataset that is merged with it below.
protein = pd.read_csv('/home/federica/LAB2/protein_class.csv')
whole_data = pd.read_csv('/home/federica/LAB2/data_202410251558/filtered/whole_dataset_out.csv')

In [17]:
# Attach the activity records to their target annotation, then drop the
# annotated targets that have no activity record at all.
protein_columns = ['protein_class_name', 'protein_class_description', 'pref_name', 'chembl_id']
activity_columns = ['Molecule ChEMBL ID', 'Target ChEMBL ID', 'mutant', 'Quality', 'Class']

merged_data = (
    protein[protein_columns]
    .merge(
        whole_data[activity_columns],
        left_on='chembl_id',
        right_on='Target ChEMBL ID',
        how='left',
    )
    .dropna(subset=['Target ChEMBL ID'])
)

In [None]:
# Inspect the column names of the merged table.
merged_data.columns

In [19]:
# Empty frame whose only purpose is to define the column order of the
# statistics table built further down.
_identity_columns = [
    'Protein class description', 'Target ChEMBL ID', 'Protein class name',
    'Preferred target name', 'Mutation',
]
_class_columns = ['Inactive', 'Active', 'Mod active']
# Per-quality counters, grouped by class: First/Second/Third quality inactive,
# then active, then mod active.
_quality_columns = [
    f'{rank} quality {label.lower()}'
    for label in _class_columns
    for rank in ('First', 'Second', 'Third')
]

new_dataframe = pd.DataFrame(columns=_identity_columns + _class_columns + _quality_columns)

In [20]:
import math
def generate_statistics_with_splitting(merged_data):
    """Summarise activity classes per target and mutation.

    Records are grouped by ('Target ChEMBL ID', 'mutant'). Within a group every
    record whose 'Class' is a non-missing int/float adds one to the class at
    that index of (Inactive, Active, Mod active) and one to that class's
    counter for its 'Quality' (1, 2 or 3; a missing quality counts as 1).

    When the group's ``mutant`` string contains ';', it is split on ';' and one
    output row is produced per mutation, each receiving
    ``ceil(count / number_of_mutations)`` for every counter. Because of the
    rounding up, the split rows can sum to more than the group's original count.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Needs the columns 'Target ChEMBL ID', 'mutant', 'Class', 'Quality',
        'protein_class_description', 'protein_class_name' and 'pref_name'.

    Returns
    -------
    pandas.DataFrame
        One row per (target, mutation) with the descriptive fields of the
        group's first record, the 12 counters, and 'Mutation' as last column.

    Notes
    -----
    ``groupby`` is called with its defaults; with pandas' default
    ``dropna=True`` rows whose 'mutant' is NaN do not form a group.
    NOTE(review): confirm that wild-type rows carry a non-null 'mutant' value.
    """
    class_labels = ('Inactive', 'Active', 'Mod active')
    quality_labels = ('First', 'Second', 'Third')

    records = []
    for (target_id, mutant), group in merged_data.groupby(['Target ChEMBL ID', 'mutant']):
        totals = dict.fromkeys(class_labels, 0)
        per_quality = {label: [0, 0, 0] for label in class_labels}

        for _, record in group.iterrows():
            raw_class = record['Class']
            if pd.isna(raw_class) or not isinstance(raw_class, (int, float)):
                continue
            class_index = int(raw_class)
            raw_quality = record['Quality']
            quality = int(raw_quality) if pd.notna(raw_quality) else 1
            label = class_labels[class_index]
            totals[label] += 1
            per_quality[label][quality - 1] += 1

        # Descriptive fields first, then zeroed counters: this fixes the column
        # order of the resulting DataFrame ('Mutation' is appended afterwards).
        base_entry = {
            'Protein class description': group['protein_class_description'].iloc[0],
            'Target ChEMBL ID': target_id,
            'Protein class name': group['protein_class_name'].iloc[0],
            'Preferred target name': group['pref_name'].iloc[0],
        }
        base_entry.update(dict.fromkeys(class_labels, 0))
        base_entry.update({
            f"{rank} quality {label.lower()}": 0
            for label in class_labels
            for rank in quality_labels
        })

        # A mutant without ';' is handled as a single mutation: dividing by one
        # and rounding up leaves every counter unchanged.
        mutations = mutant.split(';') if ';' in mutant else [mutant]
        n_mutations = len(mutations)

        for mutation in mutations:
            entry = dict(base_entry)
            entry['Mutation'] = mutation
            for label in class_labels:
                entry[label] = math.ceil(totals[label] / n_mutations)
                for rank, quality_count in zip(quality_labels, per_quality[label]):
                    entry[f"{rank} quality {label.lower()}"] = math.ceil(quality_count / n_mutations)
            records.append(entry)

    return pd.DataFrame(records)

In [21]:
# Build the per-target, per-mutation statistics table and save it to disk.
final_df = generate_statistics_with_splitting(merged_data)
final_df.to_csv('/home/federica/LAB2/final_results.csv', index=False)

In [22]:
def assign_family(row, target_classes):
    """Return the first search term found in a row of the statistics table.

    The terms of ``target_classes`` are tried in iteration order. A term matches
    when it occurs, ignoring case, as a substring of the row's
    'Protein class name', 'Preferred target name' or
    'Protein class description'. The comparison is case-insensitive because the
    search terms are lowercase while the annotations may be capitalised.

    NOTE: this definition replaces the three-argument ``assign_family`` defined
    earlier in the notebook. Unlike that version it applies no word-boundary
    check and no regex fallback, and re-running the earlier classification cell
    after this one fails because of the different signature.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the three fields listed above.
    target_classes : iterable of str
        Search terms; iterating a dict yields its keys.

    Returns
    -------
    str or None
        The matching search term exactly as given, or None when nothing matches.
    """
    searched_fields = ('Protein class name', 'Preferred target name', 'Protein class description')
    for target_class in target_classes:
        needle = target_class.lower()
        if any(needle in str(row[field]).lower() for field in searched_fields):
            return target_class
    return None

# Force the column order of the final DataFrame to match `new_dataframe`.
final_df = final_df[new_dataframe.columns]

# Label every row with its protein family (None when no search term matches).
family_labels = final_df.apply(lambda row: assign_family(row, target_classes), axis=1)
final_df = final_df.assign(Family=family_labels)

# Put 'Family' first, keeping the remaining columns in their current order.
cols = ['Family', *(col for col in final_df.columns if col != 'Family')]
final_df = final_df[cols]

sorted_dataframe = final_df.sort_values(
    by=['Family', 'Protein class name', 'Preferred target name'],
    ascending=True,
)

sorted_dataframe.to_csv('/home/federica/LAB2/sorted_new.csv', index=False)

In [23]:
def create_table(df, title="Interactive Table"):
    """Show every column of ``df`` as a Plotly table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to display; all of its columns are shown, in order.
    title : str, default "Interactive Table"
        Figure title, centred above the table.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    column_names = list(df.columns)
    table = go.Table(
        header={'values': column_names, 'fill_color': 'paleturquoise', 'align': 'left'},
        cells={
            'values': [df[name] for name in column_names],
            'fill_color': 'lavender',
            'align': 'left',
        },
    )

    fig = go.Figure(data=[table])
    fig.update_layout(title=title, title_x=0.5)
    fig.show()

In [None]:
# Preview: reload the exported table and display its first 10 rows.
# NOTE: this rebinds `df`, which until now held the protein_class.csv table.
df=pd.read_csv('/home/federica/LAB2/sorted_new.csv').head(10)
create_table(df,'activity files and protein families')

In [39]:
# Accession table (rebinds `df`) and the UniProt export it is joined with.
df = pd.read_csv('/home/federica/LAB2/prova_accession.csv')
df1 = pd.read_csv(
    '/home/federica/LAB2/uniprotkb_reviewed_true_AND_model_organ_2024_10_26.tsv.gz',
    sep='\t',
    compression='gzip',
    low_memory=False,
).rename(columns={'Entry': 'accession_code'})

# Normalise the join key on both sides: text, no surrounding blanks, lowercase.
for accession_frame in (df, df1):
    accession_frame['accession_code'] = (
        accession_frame['accession_code'].astype(str).str.strip().str.lower()
    )

# Keep every row of the accession table, adding UniProt fields where available.
df2 = df.merge(df1, how='left', on='accession_code')

selected_columns = [
    'tid', 'chembl_id', 'ChEMBL', 'accession_code',
    'target_name', 'protein_class_name',
    'protein_class_description', 'Protein names', 'Protein families',
]
df3 = df2[selected_columns]

In [40]:
# Save the merged accession table. index=False is not passed here, so (unlike
# the other exports in this notebook) the row index is written as the first
# CSV column.
df3.to_csv('/home/federica/LAB2/prova11.csv')