In [12]:
!pip install pandas numpy bio networkx scipy

Collecting scipy
  Downloading scipy-1.16.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Downloading scipy-1.16.3-cp313-cp313-win_amd64.whl (38.5 MB)
   ---------------------------------------- 0.0/38.5 MB ? eta -:--:--
    --------------------------------------- 0.5/38.5 MB 3.9 MB/s eta 0:00:10
   - -------------------------------------- 1.0/38.5 MB 3.8 MB/s eta 0:00:10
   - -------------------------------------- 1.8/38.5 MB 3.1 MB/s eta 0:00:12
   -- ------------------------------------- 2.6/38.5 MB 3.3 MB/s eta 0:00:11
   --- ------------------------------------ 3.7/38.5 MB 3.5 MB/s eta 0:00:11
   ---- ----------------------------------- 4.5/38.5 MB 3.7 MB/s eta 0:00:10
   ----- ---------------------------------- 5.5/38.5 MB 3.8 MB/s eta 0:00:09
   ------- -------------------------------- 6.8/38.5 MB 4.0 MB/s eta 0:00:08
   -------- ------------------------------- 7.9/38.5 MB 4.2 MB/s eta 0:00:08
   --------- ------------------------------ 8.9/38.5 MB 4.2 MB/s eta 0:00:07
   ---------


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import numpy as np
import networkx as nx
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from collections import Counter

## PPI Interaction Calculation

In [5]:
# Import the dataset
data_ppi=pd.read_csv("D:\\python_progs\\Final_year_proj\\Datasets\\ppi_initial.csv")

# Filter out rows that have no interaction data
data_interactions = data_ppi.dropna(subset=['Interacts with']).copy()

In [6]:
#Defining the Edge Explosion Function
def explode_interactions(row):
    protein_a = row['Entry']
    interactors = row['Interacts with'].split('; ')
    
    edges = []
    for interactor in interactors:
        clean_interactor = interactor.strip()        
        if clean_interactor:
            edges.append((protein_a, clean_interactor))            
    return edges

# Apply the function to all rows to generate the list of all edges
all_edges = data_interactions.apply(explode_interactions, axis=1).sum()

In [8]:
#Creating the Edge List DataFrame
ppi_df = pd.DataFrame(all_edges, columns=['Protein_A_Entry', 'Protein_B_Entry'])

#Removing Self-Loops
ppi_df = ppi_df[ppi_df['Protein_A_Entry'] != ppi_df['Protein_B_Entry']]

#Remove Duplicate/Reciprocal Edges
ppi_df['Interaction'] = np.minimum(ppi_df['Protein_A_Entry'], ppi_df['Protein_B_Entry']) + \
                       '_' + np.maximum(ppi_df['Protein_A_Entry'], ppi_df['Protein_B_Entry'])

#Droping duplicates based on the unique ID, keeping only one entry per unique interaction
ppi_df = ppi_df.drop_duplicates(subset=['Interaction']).drop(columns=['Interaction'])

In [10]:
#Saving the Final Network Edge List
output_file = 'D:\python_progs\Final_year_proj\Datasets\ppi_network_edges.csv'
ppi_df.to_csv(output_file, index=False)

  output_file = 'D:\python_progs\Final_year_proj\Datasets\ppi_network_edges.csv'


In [13]:
def read_edge_list(path, src_col="Protein_A_Entry", dst_col="Protein_B_Entry"):
    df = pd.read_csv(path)
    if src_col not in df.columns or dst_col not in df.columns:
        raise ValueError(f"Input CSV must contain '{src_col}' and '{dst_col}' columns.")
    
    df = df[[src_col, dst_col]].dropna()
    df = df[df[src_col] != df[dst_col]]  # remove self-loops
    print(f"Loaded {len(df)} interactions from {path}")
    return df

#Building the PPI graph
def build_graph(edges_df, undirected=True):
    if undirected:
        G = nx.from_pandas_edgelist(edges_df, source=edges_df.columns[0], target=edges_df.columns[1])
    else:
        G = nx.from_pandas_edgelist(edges_df, source=edges_df.columns[0], target=edges_df.columns[1], create_using=nx.DiGraph)
    
    print(f"Graph built: {G.number_of_nodes()} proteins, {G.number_of_edges()} interactions")
    return G

#Computing centrality measures
def compute_centralities(G):
    print("Computing centrality measures...")

    degree = dict(G.degree())
    degree_centrality = nx.degree_centrality(G)
    betweenness = nx.betweenness_centrality(G)
    closeness = nx.closeness_centrality(G)
    eigenvector = nx.eigenvector_centrality(G, max_iter=1000)
    pagerank = nx.pagerank(G)
    clustering = nx.clustering(G)

    df = pd.DataFrame({
        "Protein": list(G.nodes()),
        "Degree": pd.Series(degree),
        "Degree_Centrality": pd.Series(degree_centrality),
        "Betweenness_Centrality": pd.Series(betweenness),
        "Closeness_Centrality": pd.Series(closeness),
        "Eigenvector_Centrality": pd.Series(eigenvector)
    })

    print("Centrality measures computed successfully.")
    return df

#Saving results to CSV
def save_results(df, output_path="./ppi_centrality_results.csv"):
    df.to_csv(output_path, index=False)
    print(f"Results saved to: {output_path}")

if __name__ == "__main__":
    
    input_path = r"D:\\python_progs\\Final_year_proj\Datasets\\ppi_network_edges.csv"  
    output_path = r"D:\\python_progs\\Final_year_proj\Datasets\\ppi_centrality.csv"

    edges_df = read_edge_list(input_path)
    G = build_graph(edges_df)
    centrality_df = compute_centralities(G)
    save_results(centrality_df, output_path)

Loaded 109217 interactions from D:\\python_progs\\Final_year_proj\Datasets\\ppi_network_edges.csv
Graph built: 18227 proteins, 109217 interactions
Computing centrality measures...
Centrality measures computed successfully.
Results saved to: D:\\python_progs\\Final_year_proj\Datasets\\ppi_centrality.csv


## Physicochemical Features and Shanon Entropy

In [1]:
def calculate_aliphatic_index(sequence):
    amino_acids = Counter(sequence)
    A = amino_acids.get('A', 0)
    V = amino_acids.get('V', 0)
    I = amino_acids.get('I', 0)
    L = amino_acids.get('L', 0)
    total_residues = sum(amino_acids.values())

    if total_residues == 0:
        return 0.0

    index = 100 * (A/total_residues + 2.9 * (V/total_residues) + 3.9 * ((I + L)/total_residues))
    return index

In [4]:
data_features = pd.read_csv("D:\python_progs\Final_year_proj\Datasets\seq.csv")

features_to_calculate = [
    "Molecular_Weight", "Isoelectric_Point", "Aromaticity",
    "Instability_Index", "Aliphatic_Index", "Net_Charge_pH7",
    "Hydrophobicity_GRAVY", "Positive_Residues", "Negative_Residues"
]

  data_features = pd.read_csv("D:\python_progs\Final_year_proj\Datasets\seq.csv")


In [5]:

for col in features_to_calculate:
    data_features[col] = None

valid_amino_acids = set("ARNDCQEGHILKMFPSTWYV")
skipped_rows = 0

for i, row in data_features.iterrows():
    raw_seq = str(row["Sequence"])
    
    try:
        sequence = raw_seq.upper().replace(" ", "")
        sequence = sequence.replace("B", "N").replace("Z", "Q").replace("X", "")
        sequence = "".join([aa for aa in sequence if aa in valid_amino_acids])

        if len(sequence) == 0:
            print(f"Warning: Row {i} has an empty or invalid sequence after filtering. Skipping calculation.")
            data_features.loc[i, "Molecular_Weight"] = 'N/A'
            skipped_rows += 1
            continue

        analysis = ProteinAnalysis(sequence)

        data_features.loc[i, "Molecular_Weight"] = analysis.molecular_weight()
        data_features.loc[i, "Isoelectric_Point"] = analysis.isoelectric_point()
        data_features.loc[i, "Aromaticity"] = analysis.aromaticity()
        data_features.loc[i, "Instability_Index"] = analysis.instability_index()
        data_features.loc[i, "Aliphatic_Index"] = calculate_aliphatic_index(sequence)
        data_features.loc[i, "Net_Charge_pH7"] = analysis.charge_at_pH(7.0)
        data_features.loc[i, "Hydrophobicity_GRAVY"] = analysis.gravy()

        amino_acid_counts = Counter(sequence)
        data_features.loc[i, "Positive_Residues"] = amino_acid_counts.get("K", 0) + amino_acid_counts.get("R", 0) + amino_acid_counts.get("H", 0)
        data_features.loc[i, "Negative_Residues"] = amino_acid_counts.get("D", 0) + amino_acid_counts.get("E", 0)

    except Exception as e:
        print(f"Processing Error at row {i} (Sequence: {raw_seq[:30]}...): {e}")
        data_features.loc[i, "Molecular_Weight"] = 'ERROR'
        skipped_rows += 1



In [7]:
output_file = "D:\python_progs\Final_year_proj\Datasets\protein_physicochemical_features_report.csv"
data_features.to_csv(output_file, index=False)

rows_processed = len(data_features) - skipped_rows

print("\n--- Feature Calculation Summary ---")
print(f"Feature calculation completed.")
print(f"Total input rows: {len(data_features)}")
print(f"Rows successfully processed: {rows_processed}")
print(f"Rows skipped or with errors: {skipped_rows}")

  output_file = "D:\python_progs\Final_year_proj\Datasets\protein_physicochemical_features_report.csv"



--- Feature Calculation Summary ---
Feature calculation completed.
Total input rows: 20420
Rows successfully processed: 20420
Rows skipped or with errors: 0


## Combining Datasets

In [5]:
# Load your two feature files
df_features = pd.read_csv("D:\\python_progs\\Final_year_proj\\Datasets\\protein_physicochemical_features_report.csv") 
df_ppi = pd.read_csv("D:\\python_progs\\Final_year_proj\\Datasets\\ppi_centrality.csv")

# Mapping the key
df_features.rename(columns={'Entry': 'Protein'}, inplace=True)


In [6]:
# Perform a LEFT merge
df_combined = pd.merge(df_features, df_ppi, on='Protein', how='left')

ppi_column_names = [col for col in df_ppi.columns if col != 'Protein']

# Fill all 'NaN' values in the PPI columns with 0.
df_combined[ppi_column_names] = df_combined[ppi_column_names].fillna(0)


print(f"Original feature rows: {len(df_features)}")
print(f"Final combined rows:   {len(df_combined)}")


Original feature rows: 20420
Final combined rows:   20420


In [7]:
df_go = pd.read_csv("D:\\python_progs\\Final_year_proj\\Datasets\\protein_go_features.csv")
df_go_unique = df_go.drop_duplicates(subset=['Entry'], keep='first')
df_go_unique.rename(columns={'Entry': 'Protein'}, inplace=True)

df_final_with_go = pd.merge(df_combined, df_go_unique, on='Protein', how='left')

df_final_with_go.to_csv("D:\\python_progs\\Final_year_proj\\Datasets\\final_training_dataset.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_go_unique.rename(columns={'Entry': 'Protein'}, inplace=True)


## Adding the Target Column

In [12]:
# Load the Essential Genes List
with open('CSEGs_CEGs.txt', 'r') as f:
    all_lines = f.readlines()

header_line = "gene\tessentiality\tncbi_id\tensembl\n"
header_index = -1
for i, line in enumerate(all_lines):
    if line == header_line:
        header_index = i
        break

# Load the CSV, starting from the header
essential_df = pd.read_csv(
    'CSEGs_CEGs.txt',
    sep='\t',
    skiprows=header_index,
    header=0
)

essential_genes_set = set(essential_df['gene'])
print(f"✅ Loaded {len(essential_genes_set)} total essential genes into a set.")


✅ Loaded 2482 total essential genes into a set.


In [14]:
# --- 2. Load Your Main Feature Dataset ---
df_main = pd.read_csv("D:\\python_progs\\Final_year_proj\\Datasets\\final_training_dataset.csv") 
print(f"✅ Loaded main dataset with {len(df_main)} rows.")


def label_essential(gene_names_str):
    if not isinstance(gene_names_str, str):
        return 0 # Not essential if name is missing
    
    gene_list = gene_names_str.split()
    
    # Check if any gene in the list is in our essential set
    for gene in gene_list:
        if gene in essential_genes_set:
            return 1 # It's essential
    
    return 0 

# --- 4. Create the 'TARGET' Column ---
print("Labeling proteins...")
df_main['TARGET'] = df_main['Gene Names'].apply(label_essential)


print("\nLabeling complete! Distribution of the new 'TARGET' column:")
print(df_main['TARGET'].value_counts())

df_main.to_csv("D:\\python_progs\\Final_year_proj\\Datasets\\features_with_target.csv", index=False)

✅ Loaded main dataset with 20420 rows.
Labeling proteins...

Labeling complete! Distribution of the new 'TARGET' column:
TARGET
0    17909
1     2511
Name: count, dtype: int64
