<a href="https://colab.research.google.com/github/DeepHMS/BrainProt-Marker_Selector/blob/main/Feature_Selection_Algorithms_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: mount Google Drive so the '/content/drive/...' paths used
# by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import glob, os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# **Data**

In [None]:
def create_data(metadata, data):
  """Prepend a row of group labels to `data`.

  For every column of `data`, the new first row holds the `groupI` value of the
  `metadata` row whose `sampleId` equals that column name, or "" when no such
  sample exists. The original rows follow and the index is renumbered from 0.

  Args:
    metadata: DataFrame with `sampleId` and `groupI` columns.
    data: DataFrame whose column names are matched against `sampleId`.

  Returns:
    A new DataFrame with one more row than `data`.
  """
  label_by_sample = dict(zip(metadata['sampleId'].values, metadata['groupI'].values))
  label_row = {}
  for column in data.columns:
    label_row[column] = label_by_sample.get(column, "")
  header = pd.DataFrame(label_row, index=[0])
  return pd.concat([header, data]).reset_index(drop=True)

In [None]:
def processed_data(data):
  """Transpose `data`, drop the first transposed row, and relabel 'geneName'.

  After transposing, the original columns become rows. The first of them is
  discarded (presumably a leftover index column from the input file - TODO
  confirm against the input workbooks) and a row labelled 'geneName', if
  present, is relabelled 'SampleId'.

  Returns:
    The transposed DataFrame without its first row.
  """
  by_sample = data.transpose().iloc[1:, :]
  return by_sample.rename(index={'geneName': 'SampleId'})

In [None]:
def data_preprocessing(X):
  """Standardise X column-wise with a StandardScaler fitted on X itself.

  StandardScaler is used rather than a range-based scaler, presumably because
  the features are not confined to a known value range.

  Returns:
    The transformed array produced by `StandardScaler.transform`.
  """
  return StandardScaler().fit(X).transform(X)

In [None]:
def get_X_and_y(metadata, data):
  """Build the scaled feature matrix and label vector for one dataset.

  The table produced by `create_data` + `processed_data` is read as: row 0 is a
  header row, column 0 holds the group labels, and the remaining cells are the
  feature values (one row per sample).

  Returns:
    (X_scaled, y): the standardised features and the labels cast to float.
  """
  table = processed_data(create_data(metadata, data))
  # Label the top-left cell of the header row; it is not part of X or y.
  table.iloc[0, 0] = "Target"
  samples = table.iloc[1:]
  y = samples.iloc[:, 0].values.astype('float')
  X_scaled = data_preprocessing(samples.iloc[:, 1:].values)
  return X_scaled, y

In [None]:
def min_max_scale(df):
  """Min-max scale every column of `df` with a freshly fitted MinMaxScaler.

  Returns:
    A new DataFrame with the same column names as `df` and a default
    0..n-1 index (the index of `df` is not carried over).
  """
  scaled_values = MinMaxScaler().fit_transform(df)
  return pd.DataFrame(scaled_values, columns=df.columns)

# **Pearson Correlation Coefficient (PCC)**

In [None]:
from scipy.stats import pearsonr

In [None]:

def PCC(X, y):
  """Score each feature by its absolute Pearson correlation with `y`.

  Args:
    X: 2-D array, one row per sample and one column per feature.
    y: 1-D array of target values, one per sample.

  Returns:
    1-D array with one score per feature. Scores are normalised to sum to 1.
    Features whose correlation is undefined (constant columns) score 0, and an
    all-zero vector is returned when no feature has a defined correlation.
  """
  num_features = X.shape[1]
  pearson_scores = np.zeros(num_features)
  for i in range(num_features):
      column = X[:, i]
      # A constant column has no defined correlation: pearsonr would return
      # NaN, and a single NaN would turn the normalising sum (and therefore
      # every score) into NaN.
      if column.size > 0 and np.all(column == column[0]):
          continue
      corr, _ = pearsonr(column, y)
      pearson_scores[i] = abs(corr)

  # Also covers NaN produced by a constant `y`.
  pearson_scores = np.nan_to_num(pearson_scores, nan=0.0)

  # Normalize the scores so that they sum up to 1 (skip when nothing scored).
  total = np.sum(pearson_scores)
  if total == 0:
      return pearson_scores
  return pearson_scores / total

In [None]:
# NOTE(review): `X_scaled` and `y` are not assigned at notebook scope in any
# cell shown here (they only exist inside functions) - this cell appears to
# rely on hidden kernel state; confirm they are created, e.g. via get_X_and_y.
pearson_scores = PCC(X_scaled, y)
pcc_df = pd.DataFrame({'PCC': pearson_scores})

# **Effective Range Based Gene Selection (ERGS)**

In [None]:
def ergs(X,y):
  """Weight features by how little the two classes' effective ranges overlap.

  For each feature and each class (label 0 and label 1) an effective range
  mean +/- (1 - p_class) * 1.732 * std is built, where p_class is the class
  prior. The overlap of the two ranges is divided by the span covered by both
  ranges, normalised by the largest such ratio, and the weight is one minus
  that value - so a weight of 1 means "no overlap".

  Args:
    X: 2-D array, one row per sample and one column per feature.
    y: labels, 0 or 1, one per sample.

  Returns:
    1-D array of weights in [0, 1], one per feature.

  Raises:
    ValueError: if either class has no samples (the class means would be NaN).
  """
  num_features = X.shape[1]

  df = pd.DataFrame(X, columns = range(num_features))
  df['Target'] = y

  class_0 = df[df['Target']==0].drop('Target', axis=1).values
  class_1 = df[df['Target']==1].drop('Target', axis=1).values
  if len(class_0) == 0 or len(class_1) == 0:
    raise ValueError("ergs needs at least one sample labelled 0 and one labelled 1")

  u_0 = np.mean(class_0, axis = 0)
  u_1 = np.mean(class_1, axis = 0)
  std_0 = np.std(class_0, axis = 0)
  std_1 = np.std(class_1, axis = 0)
  p_0 = len(class_0)/len(df['Target'])
  p_1 = 1-p_0

  c0 = list(zip(u_0 - (1-p_0)*1.732*std_0, u_0 + (1-p_0)*1.732*std_0))
  c1 = list(zip(u_1 - (1-p_1)*1.732*std_1, u_1 + (1-p_1)*1.732*std_1))
  # Per feature: the two (lower, upper) ranges ordered by lower bound.
  R = [sorted(list(el)) for el in zip(c0, c1)]

  oa = np.zeros(num_features)
  ac = np.zeros(num_features)

  for i in range(num_features):
    (first_low, first_high), (second_low, second_high) = R[i]
    diff = first_high - second_low
    if diff > 0:
      oa[i] = diff
    span = max(first_high, second_high) - min(first_low, second_low)
    # A feature that is constant in both classes has a zero-width span; 0/0
    # would give NaN and poison np.max below, so its overlap ratio stays 0.
    if span > 0:
      ac[i] = oa[i] / span

  max_ac = ac.max() if ac.size else 0.0
  # When no feature overlaps at all, max_ac is 0 and every weight is 1.
  nac = ac / max_ac if max_ac > 0 else np.zeros(num_features)
  w = 1 - nac

  return w

In [None]:
# NOTE(review): relies on notebook-level `X_scaled` / `y`, which no cell shown
# here defines - confirm where they come from before running on a fresh kernel.
ergs_scores = ergs(X_scaled, y)
ergs_df = pd.DataFrame({'ERGS': ergs_scores})

# **IFSER Method**

IFSER is described in the paper as a two-step feature-selection method. In the first step, it calculates the effective range of each feature based on its correlation with the class labels. In the second step, it applies a criterion based on mutual information to select the most informative features among those with a significant effective range. Note that the `ifser` function defined below does not call `mutual_info_classif`: it computes its weights from the overlap between the two classes' effective ranges and from the fraction of samples of each class that fall inside those ranges.

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
def ifser(X, y, verbose=False):
    """Weight features from effective-range overlap and in-range sample counts.

    For each feature the effective ranges of class 0 and class 1
    (mean +/- (1 - p_class) * 1.732 * std) are ordered by lower bound. From
    them the function derives:
      * ac: (overlap + width of the second range) / span of both ranges,
      * h:  per-class fraction of samples inside the overlap region, summed,
      * g:  per-class fraction of samples inside the second range, summed.
    Each quantity is normalised by its maximum over all features, and the
    weight is (1 - ac_norm) * ((1 - h_norm) + (1 - g_norm)).

    Args:
        X: 2-D array, one row per sample and one column per feature.
        y: labels, 0 or 1, one per sample.
        verbose: when True, print the sample count and the intermediate
            h / g vectors (the debugging output this function used to emit
            unconditionally).

    Returns:
        1-D array of weights, one per feature.

    Raises:
        ValueError: if either class has no samples.
    """
    num_features = X.shape[1]

    df = pd.DataFrame(X, columns = range(num_features))
    df['Target'] = y

    values_0 = df[df['Target']==0].drop('Target', axis=1).values
    values_1 = df[df['Target']==1].drop('Target', axis=1).values
    # Class sizes are counted directly rather than derived from sum(y), so
    # they always match the rows that are actually counted below.
    n_0 = len(values_0)
    n_1 = len(values_1)
    if n_0 == 0 or n_1 == 0:
        raise ValueError("ifser needs at least one sample labelled 0 and one labelled 1")

    if verbose:
        print(len(y))
    if num_features == 0:
        return np.zeros(0)

    u_0 = np.mean(values_0, axis = 0)
    u_1 = np.mean(values_1, axis = 0)
    std_0 = np.std(values_0, axis = 0)
    std_1 = np.std(values_1, axis = 0)
    p_0 = n_0/len(df['Target'])
    p_1 = 1-p_0

    c0 = list(zip(u_0 - (1-p_0)*1.732*std_0, u_0 + (1-p_0)*1.732*std_0))
    c1 = list(zip(u_1 - (1-p_1)*1.732*std_1, u_1 + (1-p_1)*1.732*std_1))
    # Per feature: the two (lower, upper) ranges ordered by lower bound.
    R = [sorted(list(el)) for el in zip(c0, c1)]

    oa = np.zeros(num_features)
    ia = np.zeros(num_features)
    ac = np.zeros(num_features)
    h = np.zeros(num_features)
    g = np.zeros(num_features)

    for i in range(num_features):
        (first_low, first_high), (second_low, second_high) = R[i]
        diff_oa = first_high - second_low
        diff_ia = second_high - second_low
        if diff_oa > 0:
            oa[i] = diff_oa
        if diff_ia > 0:
            ia[i] = diff_ia
        span = max(first_high, second_high) - min(first_low, second_low)
        # Zero span (feature constant in both classes) would give 0/0 = NaN
        # and poison every weight through np.max; leave the ratio at 0.
        if span > 0:
            ac[i] = (oa[i] + ia[i]) / span

        col_0 = values_0[:, i]
        col_1 = values_1[:, i]
        H_i_0 = H_i_1 = G_i_0 = G_i_1 = 0
        if oa[i] > 0:
            # Samples of each class inside the overlap region.
            H_i_0 = np.count_nonzero((col_0 >= second_low) & (col_0 <= first_high))
            H_i_1 = np.count_nonzero((col_1 >= second_low) & (col_1 <= first_high))
        if ac[i] > 0:
            # Samples of each class inside the second range.
            G_i_0 = np.count_nonzero((col_0 >= second_low) & (col_0 <= second_high))
            G_i_1 = np.count_nonzero((col_1 >= second_low) & (col_1 <= second_high))
        h[i] = H_i_0/n_0 + H_i_1/n_1
        g[i] = G_i_0/n_0 + G_i_1/n_1

    if verbose:
        print("h", h)
        print("g", g)

    # Normalise by the maximum; an all-zero vector normalises to zeros
    # instead of dividing 0 by 0.
    h_max = h.max()
    g_max = g.max()
    nh = 1-(h/h_max) if h_max > 0 else np.ones(num_features)
    gh = 1-(g/g_max) if g_max > 0 else np.ones(num_features)

    if verbose:
        print(nh, gh)

    max_ac = ac.max()
    nac = ac / max_ac if max_ac > 0 else np.zeros(num_features)
    w = (1 - nac)*(nh+gh)

    return w

In [None]:
# NOTE(review): relies on notebook-level `X_scaled` / `y`, which no cell shown
# here defines - confirm where they come from before running on a fresh kernel.
ifser_scores = ifser(X_scaled, y)
ifser_df = pd.DataFrame({'IFSER': ifser_scores})

# **Relief-F**

In [None]:
!pip install skrebate
from skrebate import ReliefF

Collecting skrebate
  Downloading skrebate-0.62.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: skrebate
  Building wheel for skrebate (setup.py) ... [?25l[?25hdone
  Created wheel for skrebate: filename=skrebate-0.62-py3-none-any.whl size=29253 sha256=50330f8d107e609d881f686f80cfe79aa77f4d8d0e546566bdb2c5dcfeeb35b0
  Stored in directory: /root/.cache/pip/wheels/dd/67/40/683074a684607162bd0e34dcf7ccdfcab5861c3b2a83286f3a
Successfully built skrebate
Installing collected packages: skrebate
Successfully installed skrebate-0.62


In [None]:
def relief_F(X, y):
  """Return the ReliefF importance of every feature in X with respect to y.

  A ReliefF estimator with its default settings is fitted on (X, y) and its
  `feature_importances_` attribute is returned unchanged.
  """
  relief = ReliefF()
  relief.fit(X, y)
  return relief.feature_importances_

In [None]:
# NOTE(review): relies on notebook-level `X_scaled` / `y`, which no cell shown
# here defines - confirm where they come from before running on a fresh kernel.
relief_scores = relief_F(X_scaled, y)
relief_df = pd.DataFrame({'RF': relief_scores})

# Information Gain

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
def information_gain(X, y, random_state=42):
  """Estimate the mutual information between each feature of X and labels y.

  Args:
    X: 2-D array, one row per sample and one column per feature.
    y: class labels, one per sample.
    random_state: seed forwarded to `mutual_info_classif`, whose estimate for
      continuous features involves random noise. Defaults to 42, the seed
      `gain_ratio` already uses, so repeated runs give identical scores.

  Returns:
    1-D array of mutual-information estimates, one per feature.
  """
  return mutual_info_classif(X, y, random_state=random_state)

In [None]:
# NOTE(review): relies on notebook-level `X_scaled` / `y`, which no cell shown
# here defines - confirm where they come from before running on a fresh kernel.
ig_scores = information_gain(X_scaled, y)
ig_df = pd.DataFrame({'IG': ig_scores})

# Gain Ratio

In [None]:
def gain_ratio(X, y):
  """Score features by mutual information divided by a two-term entropy.

  For each feature with a non-zero mutual-information estimate `mi`, the score
  is mi / H, where H = -(mi*log2(mi) + (1-mi)*log2(1-mi)). Features with
  mi == 0 score 0. Scores are normalised to sum to 1; when every score is 0
  the all-zero vector is returned instead of dividing by zero.

  NOTE(review): H is computed from the MI value itself, not from the feature's
  value distribution as in the textbook split information, and it is undefined
  (NaN) for mi >= 1 - confirm this is the intended formula.

  Returns:
    1-D array of scores, one per feature.
  """
  # Compute the gain ratio scores for all features
  mi_scores = mutual_info_classif(X, y, discrete_features="auto", random_state=42)
  gain_ratio_scores = np.zeros(X.shape[1])
  for i in range(X.shape[1]):
      mi = mi_scores[i]
      if mi == 0:
          continue
      split_info = -1 * np.sum([p * np.log2(p) for p in [mi, 1 - mi]])
      gain_ratio_scores[i] = mi / split_info

  # Normalize the scores so that they sum up to 1; a zero total (no
  # informative feature) would otherwise turn every score into NaN.
  total = np.sum(gain_ratio_scores)
  if total == 0:
      return gain_ratio_scores
  return gain_ratio_scores / total

In [None]:
# NOTE(review): relies on notebook-level `X_scaled` / `y`, which no cell shown
# here defines - confirm where they come from before running on a fresh kernel.
gain_ratio_scores = gain_ratio(X_scaled, y)
gr_df = pd.DataFrame({'GR': gain_ratio_scores})

# **SVM Weight Vector**

In [None]:
from sklearn.svm import LinearSVC

In [None]:
def support_vector_machine_weight_vector(X, y):
  """Score features by the magnitude of their linear-SVM coefficients.

  A LinearSVC (random_state=42) is fitted on (X, y); the absolute values of
  the first row of its coefficient matrix are normalised to sum to 1.

  Returns:
    1-D array of scores, one per feature.
  """
  classifier = LinearSVC(random_state=42)
  classifier.fit(X, y)
  magnitudes = np.abs(classifier.coef_)[0]
  return magnitudes / np.sum(magnitudes)

In [None]:
# NOTE(review): relies on notebook-level `X_scaled` / `y`, which no cell shown
# here defines - confirm where they come from before running on a fresh kernel.
svm_weight_vector_scores = support_vector_machine_weight_vector(X_scaled, y)
svm_df = pd.DataFrame({'SVMWV': svm_weight_vector_scores})

# **Minimum Redundancy Maximum Relevance**

In [None]:
from sklearn.feature_selection import f_regression

# inputs:
#    X: pandas.DataFrame, features
#    y: pandas.Series, target variable
#    K: number of features to select
def mrmr_fcq(X, y, K):
    """Greedy mRMR feature selection using the F-statistic / correlation quotient.

    At every step the not-yet-selected feature with the highest ratio of
    relevance (F-statistic against `y`) to redundancy (mean absolute
    correlation with the features selected so far) is picked.

    Args:
        X: pandas.DataFrame of features (one column per feature).
        y: target values, one per row of X.
        K: number of features to select; capped at the number of columns.

    Returns:
        List of selected column labels, in selection order.
    """
    # compute F-statistics and initialize correlation matrix
    F = pd.Series(f_regression(X, y)[0], index = X.columns)
    corr = pd.DataFrame(.00001, index = X.columns, columns = X.columns)

    # initialize list of selected features and list of excluded features
    selected = []
    not_selected = X.columns.to_list()

    # Asking for more features than exist would leave nothing to score on the
    # extra iterations, so stop once every feature has been selected.
    for i in range(min(K, len(not_selected))):

        # compute (absolute) correlations between the last selected feature and all the (currently) excluded features
        if i > 0:
            last_selected = selected[-1]
            corr.loc[not_selected, last_selected] = X[not_selected].corrwith(X[last_selected]).abs().clip(.00001)

        # compute FCQ score for all the (currently) excluded features (this is Formula 2)
        # On the first pass `selected` is empty, the mean is NaN, and the
        # fillna floor makes the ranking depend on F alone.
        score = F.loc[not_selected] / corr.loc[not_selected, selected].mean(axis = 1).fillna(.00001)

        # find best feature, add it to selected and remove it from not_selected
        best = score.idxmax()
        selected.append(best)
        not_selected.remove(best)

    return selected

# Getting Cumulative Results_BDTM



In [None]:
def store_results_for_datasets_BDTM(metadata_files, input_files, base_folder_path):
  """Compute and save every feature-score file for each (metadata, input) pair.

  For the i-th metadata CSV and the i-th input workbook, a folder named after
  the input file (the part of its file name before "Input", without
  surrounding underscores) is created under `base_folder_path`, and
  gene_names.csv, ergs.csv, ifser.csv, pcc.csv, relief.csv and svmwv.csv are
  written into it.

  Args:
    metadata_files: paths of the metadata CSV files.
    input_files: paths of the input Excel files, in the same order.
    base_folder_path: directory under which the per-dataset folders are created.

  Raises:
    ValueError: if the two lists differ in length, because files are paired
      by position and a mismatch would pair data with the wrong labels.
  """
  if len(metadata_files) != len(input_files):
    raise ValueError(
        "metadata_files and input_files must have the same length, got "
        + str(len(metadata_files)) + " and " + str(len(input_files)))

  # (output file, column name, scoring function), in execution order.
  scorers = [
      ('ergs.csv', 'ERGS', ergs),
      ('ifser.csv', 'IFSER', ifser),
      ('pcc.csv', 'PCC', PCC),
      ('relief.csv', 'RF', relief_F),
      ('svmwv.csv', 'SVMWV', support_vector_machine_weight_vector),
  ]

  for metadata_file, input_file in zip(metadata_files, input_files):
    metadata = pd.read_csv(metadata_file)
    data = pd.read_excel(input_file)
    folder_name = input_file.split("/")[-1].split("Input")[0].strip("_").strip()
    print(folder_name)
    folder_path = base_folder_path + '/' + folder_name
    os.makedirs(folder_path, exist_ok=True)
    data['geneName'].to_csv(folder_path + '/gene_names.csv')
    X_scaled, y = get_X_and_y(metadata, data)
    for file_name, column, scorer in scorers:
      # The index is written too; the cumulative step reads these files back
      # and selects the score column by name.
      pd.DataFrame({column: scorer(X_scaled, y)}).to_csv(folder_path + '/' + file_name)


In [None]:
def get_cumulative_results_without_FC_BDTM(gse_folders_path, path_to_save=None):
  """Combine the per-method score files of each dataset folder into one CSV.

  For every folder, the ERGS, IFSER, PCC, RF and SVMWV columns are read from
  their CSV files, min-max scaled, summed into a COMB column, and written next
  to the gene names as '<folder name>_cumulative_results.csv'.

  Args:
    gse_folders_path: iterable of dataset folder paths ('/'-separated).
    path_to_save: directory for the output files; created if missing. When
      falsy, each file is written into its own dataset folder.
  """
  score_sources = [('ergs.csv', 'ERGS'), ('ifser.csv', 'IFSER'), ('pcc.csv', 'PCC'),
                   ('relief.csv', 'RF'), ('svmwv.csv', 'SVMWV')]
  for folder_path in gse_folders_path:
    dataset_name = folder_path.split('/')[-1]
    genes = pd.read_csv(folder_path + '/gene_names.csv')
    score_columns = [pd.read_csv(folder_path + '/' + file_name)[[column]]
                     for file_name, column in score_sources]
    res = min_max_scale(pd.concat(score_columns, axis=1))
    res['COMB'] = res['ERGS'] + res['IFSER'] + res['PCC'] + res['RF'] + res['SVMWV']
    res = pd.concat([genes[['geneName']], res], axis = 1)
    output_name = dataset_name + '_' + 'cumulative_results.csv'
    if not path_to_save:
      res.to_csv(folder_path + '/' + output_name)
      continue
    try:
      os.makedirs(path_to_save)
    except FileExistsError:
      # directory already exists
      pass
    res.to_csv(path_to_save + '/' + output_name)

# Getting Cumulative Results_BDPM


In [None]:
def process_proteomic_data(data):
    """Turn a proteomic table into a scaled feature matrix and a label vector.

    Row 0 of `data` (all columns after the first) holds the group labels
    'Control' / 'Disease', encoded as 0 / 1. The remaining rows are indexed by
    'proteinId' and transposed so that each row is a sample. Values are
    coerced to numbers, and every sample containing a missing value (including
    an unrecognised label) is dropped.

    Returns:
        (X_scaled, y): standardised features and the labels as a float array.
    """
    # Extract and encode labels
    encoded_labels = data.iloc[0, 1:].map({'Control': 0, 'Disease': 1})
    samples = data.drop(0).set_index('proteinId').T
    samples['Target'] = encoded_labels.values
    samples = samples.apply(pd.to_numeric, errors='coerce')
    # Rename index to "SampleID"
    samples.index.name = "SampleID"
    samples = samples.dropna()
    # Split into X and y
    features = samples.drop(columns=['Target'])
    target = samples['Target'].astype('float')
    X_scaled = data_preprocessing(features)
    return X_scaled, target.values

In [None]:
def store_results_for_datasets_BDPM(input_files, base_folder_path):
  """Compute and save every feature-score file for each proteomic input CSV.

  For each input file a folder is created under `base_folder_path`, named
  after the file: the part of the file name before "Input", without a
  trailing ".csv" / "_Group" and without surrounding underscores. The files
  gene_names.csv, ergs.csv, ifser.csv, pcc.csv, relief.csv and svmwv.csv are
  written into it.

  Args:
    input_files: paths of the input CSV files.
    base_folder_path: directory under which the per-dataset folders are created.
  """
  # (output file, column name, scoring function), in execution order.
  scorers = [
      ('ergs.csv', 'ERGS', ergs),
      ('ifser.csv', 'IFSER', ifser),
      ('pcc.csv', 'PCC', PCC),
      ('relief.csv', 'RF', relief_F),
      ('svmwv.csv', 'SVMWV', support_vector_machine_weight_vector),
  ]

  for input_file in input_files:
    data = pd.read_csv(input_file)
    # Remove the "_Group.csv" ending as a suffix. str.strip("_Group.csv")
    # strips any of those *characters* from both ends and can eat into the
    # dataset name itself.
    folder_name = input_file.split("/")[-1].split("Input")[0]
    if folder_name.endswith(".csv"):
      folder_name = folder_name[:-len(".csv")]
    if folder_name.endswith("_Group"):
      folder_name = folder_name[:-len("_Group")]
    folder_name = folder_name.strip("_").strip()
    print(folder_name)
    folder_path = base_folder_path + '/' + folder_name
    print(data.columns)
    os.makedirs(folder_path, exist_ok=True)
    # Row 0 holds the group labels, so protein ids start at row 1.
    data['proteinId'][1:].to_csv(folder_path + '/gene_names.csv')
    X_scaled, y = process_proteomic_data(data)
    for file_name, column, scorer in scorers:
      pd.DataFrame({column: scorer(X_scaled, y)}).to_csv(folder_path + '/' + file_name)


In [None]:
def get_cumulative_results_without_FC_BDPM(gse_folders_path, path_to_save=None):
  """Combine the per-method score files of each proteomic dataset folder.

  For every folder, the ERGS, IFSER, PCC, RF and SVMWV columns are read from
  their CSV files, min-max scaled, summed into a COMB column, and written next
  to the protein ids as '<folder name>_cumulative_results.csv'.

  Args:
    gse_folders_path: iterable of dataset folder paths ('/'-separated).
    path_to_save: directory for the output files; created if missing. When
      falsy, each file is written into its own dataset folder.
  """
  score_sources = [('ergs.csv', 'ERGS'), ('ifser.csv', 'IFSER'), ('pcc.csv', 'PCC'),
                   ('relief.csv', 'RF'), ('svmwv.csv', 'SVMWV')]
  for folder_path in gse_folders_path:
    dataset_name = folder_path.split('/')[-1]
    genes = pd.read_csv(folder_path + '/gene_names.csv')
    score_columns = [pd.read_csv(folder_path + '/' + file_name)[[column]]
                     for file_name, column in score_sources]
    res = min_max_scale(pd.concat(score_columns, axis=1))
    res['COMB'] = res['ERGS'] + res['IFSER'] + res['PCC'] + res['RF'] + res['SVMWV']
    res = pd.concat([genes[['proteinId']], res], axis = 1)
    output_name = dataset_name + '_' + 'cumulative_results.csv'
    if not path_to_save:
      res.to_csv(folder_path + '/' + output_name)
      continue
    try:
      os.makedirs(path_to_save)
    except FileExistsError:
      # directory already exists
      pass
    res.to_csv(path_to_save + '/' + output_name)

# Cumulative Parkinson's

In [None]:
# Base table that every score file below is left-joined onto; the recorded
# output shows it carries 'Entry' and 'Gene' columns.
cummulative_df = pd.read_excel('/content/Human_Uniprot to Gene.xlsx')

## BDMC

In [None]:
# Load the BDMC scores and keep only the gene symbol (renamed to 'Gene' so it
# matches the merge key) and the score itself.
df_bdmc_parkinsons = (
    pd.read_csv('/content/drive/MyDrive/BrainProt ML Work/BDMC Data/BDMC_Final_Parkinsons Disease_D010300.csv')
    .rename(columns={'geneName': 'Gene'})
    [['Gene', 'BDMC_Score']]
)

In [None]:
# Left-join the BDMC score by gene symbol; genes without a score get NaN.
# Not idempotent: re-running this cell merges BDMC_Score a second time and
# pandas disambiguates the two columns with _x/_y suffixes.
cummulative_df = pd.merge(cummulative_df, df_bdmc_parkinsons, on='Gene', how='left')
cummulative_df.head()

Unnamed: 0,Entry,Gene,BDMC_Score
0,A0A087X1C5,CYP2D7,2.65
1,A0A0B4J2F0,PIGBOS1,0.0
2,A0A0B4J2F2,SIK1B,0.0
3,A0A0C5B5G6,MT-RNR1,0.0
4,A0A0K2S4Q6,CD300H,0.0


## BDTM

In [None]:
# Per-dataset cumulative result CSVs for the BDTM analysis, in sorted order.
parkinsons_bdtm_files = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Parkinsons/Cumulative Results/*.csv'))

In [None]:
# Join every BDTM result file onto the cumulative table by gene symbol. Each
# score column is suffixed with its dataset id so columns from different
# datasets do not collide. Re-running the cell merges the same columns again.
for i in parkinsons_bdtm_files:
  dataset_name = i.split('/')[-1].split('_cumulative')[0]
  # 'Unnamed: 0' is the index column written by to_csv.
  df = pd.read_csv(i).drop(columns='Unnamed: 0')
  # Position 0 is the key column and keeps its name.
  df.columns = [name if position == 0 else name + '_' + dataset_name
                for position, name in enumerate(df.columns)]
  df = df.rename(columns={'geneName': 'Gene'})
  print(df.columns)
  cummulative_df = pd.merge(cummulative_df, df, on='Gene', how='left')

Index(['Gene', 'EGRS_GSE19587', 'IFSER_GSE19587', 'PCC_GSE19587',
       'RF_GSE19587', 'SVMWV_GSE19587', 'COMB_GSE19587'],
      dtype='object')
Index(['Gene', 'EGRS_GSE20141', 'IFSER_GSE20141', 'PCC_GSE20141',
       'RF_GSE20141', 'SVMWV_GSE20141', 'COMB_GSE20141'],
      dtype='object')
Index(['Gene', 'EGRS_GSE20146', 'IFSER_GSE20146', 'PCC_GSE20146',
       'RF_GSE20146', 'SVMWV_GSE20146', 'COMB_GSE20146'],
      dtype='object')
Index(['Gene', 'EGRS_GSE20163', 'IFSER_GSE20163', 'PCC_GSE20163',
       'RF_GSE20163', 'SVMWV_GSE20163', 'COMB_GSE20163'],
      dtype='object')
Index(['Gene', 'EGRS_GSE20164', 'IFSER_GSE20164', 'PCC_GSE20164',
       'RF_GSE20164', 'SVMWV_GSE20164', 'COMB_GSE20164'],
      dtype='object')
Index(['Gene', 'EGRS_GSE20168', 'IFSER_GSE20168', 'PCC_GSE20168',
       'RF_GSE20168', 'SVMWV_GSE20168', 'COMB_GSE20168'],
      dtype='object')
Index(['Gene', 'EGRS_GSE20291', 'IFSER_GSE20291', 'PCC_GSE20291',
       'RF_GSE20291', 'SVMWV_GSE20291', 'COMB_GSE20291'],


## BDPM

In [None]:
# Per-dataset cumulative result CSVs for the BDPM analysis, in sorted order.
parkinsons_bdpm_files = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Parkinsons/Cummulative_results/*.csv'))

In [None]:
# Display the matched files to check the glob pattern picked up every dataset.
parkinsons_bdpm_files

['/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Parkinsons/Cummulative_results/PXD008036_Processed_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Parkinsons/Cummulative_results/PXD022092_Processed_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Parkinsons/Cummulative_results/PXD024998_Processed_cumulative_results.csv']

In [None]:
# Join every BDPM result file onto the cumulative table by protein id
# ('proteinId' is renamed to 'Entry', the key of the base table). Each score
# column is suffixed with its dataset id. Re-running the cell merges the same
# columns again.
for i in parkinsons_bdpm_files:
  dataset_name = i.split('/')[-1].split('_Processed')[0]
  # 'Unnamed: 0' is the index column written by to_csv.
  df = pd.read_csv(i).drop(columns='Unnamed: 0')
  print(dataset_name)
  # Position 0 is the key column and keeps its name.
  df.columns = [name if position == 0 else name + '_' + dataset_name
                for position, name in enumerate(df.columns)]
  df = df.rename(columns={'proteinId': 'Entry'})
  print(df.columns)
  cummulative_df = pd.merge(cummulative_df, df, on='Entry', how='left')

PXD008036
Index(['Entry', 'EGRS_PXD008036', 'IFSER_PXD008036', 'PCC_PXD008036',
       'RF_PXD008036', 'SVMWV_PXD008036', 'COMB_PXD008036'],
      dtype='object')
PXD022092
Index(['Entry', 'EGRS_PXD022092', 'IFSER_PXD022092', 'PCC_PXD022092',
       'RF_PXD022092', 'SVMWV_PXD022092', 'COMB_PXD022092'],
      dtype='object')
PXD024998
Index(['Entry', 'EGRS_PXD024998', 'IFSER_PXD024998', 'PCC_PXD024998',
       'RF_PXD024998', 'SVMWV_PXD024998', 'COMB_PXD024998'],
      dtype='object')


In [None]:
# Sanity check: row count, column count and dtypes after all merges.
cummulative_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20426 entries, 0 to 20425
Columns: 105 entries, Entry to COMB_PXD024998
dtypes: float64(103), object(2)
memory usage: 16.5+ MB


In [None]:
# Missing values per column, i.e. how many base-table rows found no match in
# each merged file.
print(cummulative_df.isnull().sum(axis = 0))

Entry                  0
Gene                 133
BDMC_Score           191
EGRS_GSE19587       8131
IFSER_GSE19587      8131
                   ...  
IFSER_PXD024998    19393
PCC_PXD024998      19393
RF_PXD024998       19393
SVMWV_PXD024998    19393
COMB_PXD024998     19393
Length: 105, dtype: int64


In [None]:
# Persist the merged table. The index is written as well, so reading this file
# back yields an extra 'Unnamed: 0' column.
cummulative_df.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Parkinsons/parkinsons_cummulative.csv')

In [None]:
# Keep only the rows that have a BDMC score.
cummulative_df_not_na = cummulative_df[cummulative_df['BDMC_Score'].notna()]

In [None]:
# Clamp negative BDMC scores to 0. assign() builds a new frame instead of
# writing into the filtered slice, which is what triggered pandas'
# SettingWithCopyWarning.
cummulative_df_not_na = cummulative_df_not_na.assign(
    BDMC_Score=np.maximum(cummulative_df_not_na['BDMC_Score'], 0)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cummulative_df_not_na['BDMC_Score'] = np.maximum(cummulative_df_not_na['BDMC_Score'], 0)


In [None]:
# Persist the filtered table. The index is written too, and the cells that
# read this file back drop the resulting 'Unnamed: 0' column.
cummulative_df_not_na.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Parkinsons/parkinsons_cummulative_notna.csv')

In [None]:
# Reload from disk. This replaces the in-memory frame with one that has a
# fresh 0..n-1 index and the saved index as an 'Unnamed: 0' column.
cummulative_df_not_na = pd.read_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Parkinsons/parkinsons_cummulative_notna.csv')

In [None]:
# Preview the reloaded table.
cummulative_df_not_na.head()

Unnamed: 0.1,Unnamed: 0,Entry,Gene,BDMC_Score,EGRS_GSE19587,IFSER_GSE19587,PCC_GSE19587,RF_GSE19587,SVMWV_GSE19587,COMB_GSE19587,...,PCC_PXD022092,RF_PXD022092,SVMWV_PXD022092,COMB_PXD022092,EGRS_PXD024998,IFSER_PXD024998,PCC_PXD024998,RF_PXD024998,SVMWV_PXD024998,COMB_PXD024998
0,0,A0A087X1C5,CYP2D7,2.65,0.560514,0.143794,0.378514,0.123425,0.030906,1.237153,...,,,,,,,,,,
1,1,A0A0B4J2F0,PIGBOS1,0.0,,,,,,,...,,,,,,,,,,
2,2,A0A0B4J2F2,SIK1B,0.0,,,,,,,...,,,,,,,,,,
3,3,A0A0C5B5G6,MT-RNR1,0.0,,,,,,,...,,,,,,,,,,
4,4,A0A0K2S4Q6,CD300H,0.0,,,,,,,...,,,,,,,,,,


In [None]:
# Numeric-only view for scaling: drop the two identifier columns and the saved
# index column that came back from the CSV.
cummulative_df_not_na_helper = cummulative_df_not_na.drop(columns=['Entry', 'Gene', 'Unnamed: 0'])


In [None]:
# Preview the numeric columns that will be scaled.
cummulative_df_not_na_helper.head()

Unnamed: 0,BDMC_Score,EGRS_GSE19587,IFSER_GSE19587,PCC_GSE19587,RF_GSE19587,SVMWV_GSE19587,COMB_GSE19587,EGRS_GSE20141,IFSER_GSE20141,PCC_GSE20141,...,PCC_PXD022092,RF_PXD022092,SVMWV_PXD022092,COMB_PXD022092,EGRS_PXD024998,IFSER_PXD024998,PCC_PXD024998,RF_PXD024998,SVMWV_PXD024998,COMB_PXD024998
0,2.65,0.560514,0.143794,0.378514,0.123425,0.030906,1.237153,0.725619,0.281555,0.459822,...,,,,,,,,,,
1,0.0,,,,,,,,,,...,,,,,,,,,,
2,0.0,,,,,,,,,,...,,,,,,,,,,
3,0.0,,,,,,,,,,...,,,,,,,,,,
4,0.0,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Min-max scale every numeric column independently. Missing values are not
# filled here - the recorded output below still shows NaN after scaling.
scaler = MinMaxScaler()
cummulative_df_not_na_helper_scaled = scaler.fit_transform(cummulative_df_not_na_helper)
# fit_transform returns a bare array; restore the column names.
cummulative_df_not_na_helper_scaled = pd.DataFrame(cummulative_df_not_na_helper_scaled,columns=cummulative_df_not_na_helper.columns)

In [None]:
# Preview the scaled values.
cummulative_df_not_na_helper_scaled.head()

Unnamed: 0,BDMC_Score,EGRS_GSE19587,IFSER_GSE19587,PCC_GSE19587,RF_GSE19587,SVMWV_GSE19587,COMB_GSE19587,EGRS_GSE20141,IFSER_GSE20141,PCC_GSE20141,...,PCC_PXD022092,RF_PXD022092,SVMWV_PXD022092,COMB_PXD022092,EGRS_PXD024998,IFSER_PXD024998,PCC_PXD024998,RF_PXD024998,SVMWV_PXD024998,COMB_PXD024998
0,0.284916,0.560514,0.143794,0.378514,0.123425,0.030906,0.285139,0.725619,0.292927,0.459822,...,,,,,,,,,,
1,0.0,,,,,,,,,,...,,,,,,,,,,
2,0.0,,,,,,,,,,...,,,,,,,,,,
3,0.0,,,,,,,,,,...,,,,,,,,,,
4,0.0,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Re-attach the identifier columns. concat aligns on the index, so this relies
# on both frames having the same default 0..n-1 index (true when
# cummulative_df_not_na was just reloaded from CSV).
cummulative_df_not_na_scaled = pd.concat([cummulative_df_not_na[['Entry','Gene']],cummulative_df_not_na_helper_scaled],axis =1)

In [None]:
# Preview the scaled table with its identifiers.
cummulative_df_not_na_scaled.head()

Unnamed: 0,Entry,Gene,BDMC_Score,EGRS_GSE19587,IFSER_GSE19587,PCC_GSE19587,RF_GSE19587,SVMWV_GSE19587,COMB_GSE19587,EGRS_GSE20141,...,PCC_PXD022092,RF_PXD022092,SVMWV_PXD022092,COMB_PXD022092,EGRS_PXD024998,IFSER_PXD024998,PCC_PXD024998,RF_PXD024998,SVMWV_PXD024998,COMB_PXD024998
0,A0A087X1C5,CYP2D7,0.284916,0.560514,0.143794,0.378514,0.123425,0.030906,0.285139,0.725619,...,,,,,,,,,,
1,A0A0B4J2F0,PIGBOS1,0.0,,,,,,,,...,,,,,,,,,,
2,A0A0B4J2F2,SIK1B,0.0,,,,,,,,...,,,,,,,,,,
3,A0A0C5B5G6,MT-RNR1,0.0,,,,,,,,...,,,,,,,,,,
4,A0A0K2S4Q6,CD300H,0.0,,,,,,,,...,,,,,,,,,,


In [None]:
# Scaled table with missing values kept as NaN (the zero-filled variant is
# written to ..._notna_scaled.csv further down).
cummulative_df_not_na_scaled.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Parkinsons/parkinsons_cummulative_notna_scaled1.csv')

In [None]:
# Second pass over the same saved table, this time treating missing scores as 0.
parkinsons_not_na = pd.read_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Parkinsons/parkinsons_cummulative_notna.csv')

In [None]:
# Replace every missing value with 0, in place. This applies to all columns,
# including 'Gene', which had missing entries in the null counts above.
parkinsons_not_na.fillna(0, inplace=True)

In [None]:
# Numeric-only view for scaling: drop the two identifier columns and the saved
# index column that came back from the CSV.
parkinsons_not_na_helper = parkinsons_not_na.drop(columns=['Entry', 'Gene', 'Unnamed: 0'])

In [None]:
# Display the zero-filled numeric table.
parkinsons_not_na_helper

Unnamed: 0,BDMC_Score,EGRS_GSE19587,IFSER_GSE19587,PCC_GSE19587,RF_GSE19587,SVMWV_GSE19587,COMB_GSE19587,EGRS_GSE20141,IFSER_GSE20141,PCC_GSE20141,...,PCC_PXD022092,RF_PXD022092,SVMWV_PXD022092,COMB_PXD022092,EGRS_PXD024998,IFSER_PXD024998,PCC_PXD024998,RF_PXD024998,SVMWV_PXD024998,COMB_PXD024998
0,2.65,0.560514,0.143794,0.378514,0.123425,0.030906,1.237153,0.725619,0.281555,0.459822,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20230,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20231,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20232,0.00,0.010817,0.058113,0.296240,0.013903,0.032973,0.412046,0.269751,0.272359,0.094921,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20233,0.00,0.038400,0.016007,0.022609,0.012773,0.195865,0.285654,0.280479,0.228000,0.119788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Min-max scale every numeric column independently. The zeros introduced by
# fillna take part in each column's minimum.
scaler = MinMaxScaler()
parkinsons_not_na_helper_scaled = scaler.fit_transform(parkinsons_not_na_helper)
# fit_transform returns a bare array; restore the column names.
parkinsons_not_na_helper_scaled = pd.DataFrame(parkinsons_not_na_helper_scaled,columns=parkinsons_not_na_helper.columns)

In [None]:
# Alphabetical column order, which groups the columns by score type
# (BDMC_Score, COMB_*, ..., SVMWV_*) instead of by dataset.
sorted_columns = parkinsons_not_na_helper_scaled.columns.sort_values()

In [None]:
# Display the resulting column order.
sorted_columns

Index(['BDMC_Score', 'COMB_GSE19587', 'COMB_GSE20141', 'COMB_GSE20146',
       'COMB_GSE20163', 'COMB_GSE20164', 'COMB_GSE20168', 'COMB_GSE20291',
       'COMB_GSE20292', 'COMB_GSE20314',
       ...
       'SVMWV_GSE20292', 'SVMWV_GSE20314', 'SVMWV_GSE20333', 'SVMWV_GSE24378',
       'SVMWV_GSE7621', 'SVMWV_GSE8397_U133A', 'SVMWV_GSE8397_U133B',
       'SVMWV_PXD008036', 'SVMWV_PXD022092', 'SVMWV_PXD024998'],
      dtype='object', length=103)

In [None]:
# Reorder the scaled columns alphabetically.
parkinsons_not_na_helper_scaled_result = parkinsons_not_na_helper_scaled[sorted_columns]

In [None]:
# Display the reordered, scaled table.
parkinsons_not_na_helper_scaled_result

Unnamed: 0,BDMC_Score,COMB_GSE19587,COMB_GSE20141,COMB_GSE20146,COMB_GSE20163,COMB_GSE20164,COMB_GSE20168,COMB_GSE20291,COMB_GSE20292,COMB_GSE20314,...,SVMWV_GSE20292,SVMWV_GSE20314,SVMWV_GSE20333,SVMWV_GSE24378,SVMWV_GSE7621,SVMWV_GSE8397_U133A,SVMWV_GSE8397_U133B,SVMWV_PXD008036,SVMWV_PXD022092,SVMWV_PXD024998
0,0.284916,0.293927,0.382192,0.224807,0.331487,0.211215,0.259423,0.256864,0.373759,0.465426,...,0.303210,0.281585,0.000000,0.000000,0.104165,0.186913,0.000000,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20230,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
20231,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
20232,0.000000,0.097895,0.178867,0.323297,0.278243,0.517240,0.344107,0.050636,0.231612,0.803679,...,0.245712,0.815853,0.094420,0.070045,0.016408,0.134332,0.084545,0.0,0.0,0.0
20233,0.000000,0.067867,0.203262,0.186180,0.663446,0.141713,0.309339,0.043965,0.221311,0.245923,...,0.096627,0.163526,0.010915,0.150697,0.173493,0.097706,0.000000,0.0,0.0,0.0


In [None]:
# Re-attach the identifier columns. concat aligns on the index; both frames
# carry the default 0..n-1 index here.
parkinsons_not_na_scaled = pd.concat([parkinsons_not_na[['Entry','Gene']],parkinsons_not_na_helper_scaled_result],axis =1)

In [None]:
# Final Parkinson's table: zero-filled, min-max scaled, columns sorted.
parkinsons_not_na_scaled.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Parkinsons/parkinsons_cummulative_notna_scaled.csv')

# Cumulative Alzheimer's

In [None]:
# Base table for the Alzheimer's results, read from the same workbook as the
# Parkinson's section; every score file below is left-joined onto it.
cummulative_df_alz = pd.read_excel('/content/Human_Uniprot to Gene.xlsx')

## BDMC

In [None]:
# Load the BDMC scores and keep only the gene symbol (renamed to 'Gene' so it
# matches the merge key) and the score itself.
df_bdmc_alz = (
    pd.read_csv('/content/drive/MyDrive/BrainProt ML Work/BDMC Data/BDMC_Final_Alzheimers Disease_D000544.csv')
    .rename(columns={'geneName': 'Gene'})
    [['Gene', 'BDMC_Score']]
)

In [None]:
# Left-join the BDMC score by gene symbol; genes without a score get NaN.
# Not idempotent: re-running this cell merges BDMC_Score a second time and
# pandas disambiguates the two columns with _x/_y suffixes.
cummulative_df_alz = pd.merge(cummulative_df_alz, df_bdmc_alz, on='Gene', how='left')
cummulative_df_alz.head()

Unnamed: 0,Entry,Gene,BDMC_Score
0,A0A087X1C5,CYP2D7,2.25
1,A0A0B4J2F0,PIGBOS1,0.0
2,A0A0B4J2F2,SIK1B,0.0
3,A0A0C5B5G6,MT-RNR1,0.0
4,A0A0K2S4Q6,CD300H,0.0


## BDTM

In [None]:
# Inputs (.xlsx) and metadata (.csv) live in the same folder. Both lists are
# sorted because store_results_for_datasets_BDTM pairs them by position.
input_files_alz = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/BDTM Data/7. Alzheimer_s/*.xlsx'))
metadata_files_alz = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/BDTM Data/7. Alzheimer_s/*.csv'))
base_folder_path_alz = '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Alzheimers'

In [None]:
# Compute and save all per-method score files for every Alzheimer's dataset.
# Correct labels depend on the i-th metadata file belonging to the i-th input.
store_results_for_datasets_BDTM(metadata_files_alz,input_files_alz, base_folder_path_alz)

GSE12685
14
h [1.58333333 1.41666667 1.41666667 ... 0.79166667 1.58333333 1.41666667]
g [1.58333333 0.95833333 0.875      ... 1.20833333 1.16666667 1.16666667]
[0.15555556 0.24444444 0.24444444 ... 0.57777778 0.15555556 0.24444444] [0.0952381  0.45238095 0.5        ... 0.30952381 0.33333333 0.33333333]
GSE1297
31
h [0.74242424 0.60606061 1.25757576 ... 0.87373737 0.8030303  0.89393939]
g [0.74242424 1.16666667 1.21212121 ... 0.82828283 1.0959596  1.14141414]
[0.57636888 0.65417867 0.28242075 ... 0.50144092 0.54178674 0.48991354] [0.56508876 0.31656805 0.28994083 ... 0.5147929  0.35798817 0.33136095]
GSE16759
8
h [1.5  1.25 0.5  ... 0.5  0.75 0.  ]
g [1.5  0.75 1.   ... 1.   1.   1.  ]
[0.14285714 0.28571429 0.71428571 ... 0.71428571 0.57142857 1.        ] [0.14285714 0.57142857 0.42857143 ... 0.42857143 0.42857143 0.42857143]
GSE28146
30
h [1.47727273 1.21590909 0.78409091 ... 1.18181818 1.20454545 1.07954545]
g [0.78409091 0.81818182 1.52272727 ... 0.70454545 0.90909091 0.77272727]
[0



GSE5281
161
h [1.09972041 1.02267785 0.14600808 ... 1.23283628 1.23019571 0.99627213]
g [1.33364399 1.02267785 0.80444237 ... 1.23283628 1.10515688 1.11727244]
[0.41535921 0.45631709 0.9223782  ... 0.34459125 0.34599505 0.47035508] [0.28664008 0.45297441 0.56970754 ... 0.34056165 0.40885676 0.4023762 ]


In [None]:
get_cumulative_results_without_FC_BDTM(gse_folders_path=glob.glob("/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Alzheimers/GSE*"), path_to_save = "/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Alzheimers/Cummulative_results")

In [None]:
alz_cummu_res_files_bdtm = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Alzheimers/Cummulative_results/*.csv'))

In [None]:
alz_cummu_res_files_bdtm

['/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Alzheimers/Cummulative_results/GSE12685_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Alzheimers/Cummulative_results/GSE1297_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Alzheimers/Cummulative_results/GSE16759_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Alzheimers/Cummulative_results/GSE28146_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Alzheimers/Cummulative_results/GSE4757_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Alzheimers/Cummulative_results/GSE48350_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Alzheimers/Cummulative_results/GSE5281_cumulative_results.csv']

In [None]:
# Attach every BDTM dataset's per-gene scores to the cumulative Alzheimer's frame.
for results_path in alz_cummu_res_files_bdtm:
  df = pd.read_csv(results_path).drop(columns='Unnamed: 0')
  # File names look like "<dataset>_cumulative_results.csv"; keep the dataset id.
  dataset_name = results_path.split('/')[-1].split('_cumulative')[0]
  # The first column is the gene key; every score column gets a "_<dataset>" suffix
  # so columns from different datasets stay distinguishable after merging.
  df.columns = [col if pos == 0 else col + '_' + dataset_name for pos, col in enumerate(df.columns)]
  df = df.rename(columns={'geneName': 'Gene'})
  print(df.columns)
  # A left merge repeats a cumulative row once per matching row on the right, and
  # pandas also matches missing keys to each other. Collapse the results to one
  # row per gene (mean of its scores) first; groupby drops rows without a gene name.
  df = df.groupby('Gene', as_index=False).mean(numeric_only=True)
  cummulative_df_alz = pd.merge(cummulative_df_alz, df, on='Gene', how='left')

Index(['Gene', 'ERGS_GSE12685', 'IFSER_GSE12685', 'PCC_GSE12685',
       'RF_GSE12685', 'SVMWV_GSE12685', 'COMB_GSE12685'],
      dtype='object')
Index(['Gene', 'ERGS_GSE1297', 'IFSER_GSE1297', 'PCC_GSE1297', 'RF_GSE1297',
       'SVMWV_GSE1297', 'COMB_GSE1297'],
      dtype='object')
Index(['Gene', 'ERGS_GSE16759', 'IFSER_GSE16759', 'PCC_GSE16759',
       'RF_GSE16759', 'SVMWV_GSE16759', 'COMB_GSE16759'],
      dtype='object')
Index(['Gene', 'ERGS_GSE28146', 'IFSER_GSE28146', 'PCC_GSE28146',
       'RF_GSE28146', 'SVMWV_GSE28146', 'COMB_GSE28146'],
      dtype='object')
Index(['Gene', 'ERGS_GSE4757', 'IFSER_GSE4757', 'PCC_GSE4757', 'RF_GSE4757',
       'SVMWV_GSE4757', 'COMB_GSE4757'],
      dtype='object')
Index(['Gene', 'ERGS_GSE48350', 'IFSER_GSE48350', 'PCC_GSE48350',
       'RF_GSE48350', 'SVMWV_GSE48350', 'COMB_GSE48350'],
      dtype='object')
Index(['Gene', 'ERGS_GSE5281', 'IFSER_GSE5281', 'PCC_GSE5281', 'RF_GSE5281',
       'SVMWV_GSE5281', 'COMB_GSE5281'],
      dtype='objec

## BDPM

In [None]:
alz_bdpm_files = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/BDPM Data/7. Alzheimer_s/*.csv'))
base_folder_path_alz = '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Alzheimers'

In [None]:
store_results_for_datasets_BDPM(alz_bdpm_files, base_folder_path_alz)

PXD005319_Processed
Index(['proteinId', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9',
       'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16'],
      dtype='object')
16
h [0.8974359  0.33333333 0.07692308 ... 0.         0.8974359  0.        ]
g [0.33333333 1.15384615 1.46153846 ... 0.8974359  0.8974359  1.        ]
[0.51388889 0.81944444 0.95833333 ... 1.         0.51388889 1.        ] [0.81944444 0.375      0.20833333 ... 0.51388889 0.51388889 0.45833333]
PXD005321_Processed
Index(['proteinId', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9',
       'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18'],
      dtype='object')
18
h [1.49230769 1.41538462 0.         ... 0.07692308 0.35384615 1.09230769]
g [0.50769231 0.58461538 1.10769231 ... 0.95384615 1.13846154 0.43076923]
[0.13392857 0.17857143 1.         ... 0.95535714 0.79464286 0.36607143] [0.71304348 0.66956522 0.37391304 ... 0.46086957 0.35652174 0.75652174]
PXD009199_Processed
Index(['proteinId', 'S1', 'S2'

In [None]:
get_cumulative_results_without_FC_BDPM(gse_folders_path=glob.glob("/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Alzheimers/PXD*"), path_to_save = "/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Alzheimers/Cummulative_results")

In [None]:
alz_cummu_res_files = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Alzheimers/Cummulative_results/*.csv'))

In [None]:
alz_cummu_res_files

['/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Alzheimers/Cummulative_results/PXD005319_Processed_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Alzheimers/Cummulative_results/PXD005321_Processed_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Alzheimers/Cummulative_results/PXD009199_Processed_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Alzheimers/Cummulative_results/PXD014376_Processed_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Alzheimers/Cummulative_results/PXD023199_Processed_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Alzheimers/Cummulative_results/PXD027173_Processed_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Alzheimers/Cummulative_results/PXD037133_Proces

In [None]:
# Attach every BDPM dataset's per-protein scores to the cumulative Alzheimer's frame.
for i in alz_cummu_res_files:
  df = pd.read_csv(i).drop(columns='Unnamed: 0')
  # File names look like "<dataset>_Processed_cumulative_results.csv".
  dataset_name = i.split('/')[-1].split('_Processed')[0]
  print(dataset_name)
  # Leave the key column (first) alone and suffix each score column with the dataset id.
  df.columns = [
      name if position == 0 else name + '_' + dataset_name
      for position, name in enumerate(df.columns)
  ]
  df = df.rename(columns={'proteinId': 'Entry'})
  print(df.columns)
  cummulative_df_alz = cummulative_df_alz.merge(df, on='Entry', how='left')

PXD005319
Index(['Entry', 'ERGS_PXD005319', 'IFSER_PXD005319', 'PCC_PXD005319',
       'RF_PXD005319', 'SVMWV_PXD005319', 'COMB_PXD005319'],
      dtype='object')
PXD005321
Index(['Entry', 'ERGS_PXD005321', 'IFSER_PXD005321', 'PCC_PXD005321',
       'RF_PXD005321', 'SVMWV_PXD005321', 'COMB_PXD005321'],
      dtype='object')
PXD009199
Index(['Entry', 'ERGS_PXD009199', 'IFSER_PXD009199', 'PCC_PXD009199',
       'RF_PXD009199', 'SVMWV_PXD009199', 'COMB_PXD009199'],
      dtype='object')
PXD014376
Index(['Entry', 'ERGS_PXD014376', 'IFSER_PXD014376', 'PCC_PXD014376',
       'RF_PXD014376', 'SVMWV_PXD014376', 'COMB_PXD014376'],
      dtype='object')
PXD023199
Index(['Entry', 'ERGS_PXD023199', 'IFSER_PXD023199', 'PCC_PXD023199',
       'RF_PXD023199', 'SVMWV_PXD023199', 'COMB_PXD023199'],
      dtype='object')
PXD027173
Index(['Entry', 'ERGS_PXD027173', 'IFSER_PXD027173', 'PCC_PXD027173',
       'RF_PXD027173', 'SVMWV_PXD027173', 'COMB_PXD027173'],
      dtype='object')
PXD037133
Index(['Entr

## Saving Data

In [None]:
cummulative_df_alz.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Alzheimers/alzheimers_cummulative.csv')

In [None]:
# Keep only the rows that received a BDMC score, renumber them from 0,
# and floor negative BDMC scores at 0.
cummulative_df_alz_not_na = (
    cummulative_df_alz
    .dropna(subset=['BDMC_Score'])
    .reset_index(drop=True)
)
cummulative_df_alz_not_na['BDMC_Score'] = cummulative_df_alz_not_na['BDMC_Score'].clip(lower=0)

In [None]:
cummulative_df_alz_not_na.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Alzheimers/alzheimers_cummulative_notna.csv')

In [None]:
cummulative_df_alz_not_na.fillna(0, inplace=True)

In [None]:
cummulative_df_alz_not_na

Unnamed: 0,Entry,Gene,BDMC_Score,ERGS_GSE12685,IFSER_GSE12685,PCC_GSE12685,RF_GSE12685,SVMWV_GSE12685,COMB_GSE12685,ERGS_GSE1297,...,PCC_PXD027173,RF_PXD027173,SVMWV_PXD027173,COMB_PXD027173,ERGS_PXD037133,IFSER_PXD037133,PCC_PXD037133,RF_PXD037133,SVMWV_PXD037133,COMB_PXD037133
0,A0A087X1C5,CYP2D7,2.25,0.473323,0.330617,0.287753,0.106144,0.155670,1.353507,0.299006,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A0A0B4J2F0,PIGBOS1,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A0A0B4J2F2,SIK1B,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A0A0C5B5G6,MT-RNR1,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A0A0K2S4Q6,CD300H,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20230,Q9UI54,PRO0628,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20231,Q9UI72,PRO0255,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20232,Q9Y6C7,LINC00312,0.00,0.664600,0.393045,0.456295,0.138295,0.055659,1.707893,0.361649,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20233,Q9Y6Z2,LINC01558,0.00,0.129001,0.078636,0.034981,0.043889,0.002359,0.288866,0.609018,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
alz_not_na_helper = cummulative_df_alz_not_na.drop(['Entry','Gene'],axis = 1)

In [None]:
# Rescale every numeric column to [0, 1] and restore the column labels,
# which fit_transform drops by returning a bare array.
scaler = MinMaxScaler()
alz_not_na_helper_scaled = pd.DataFrame(
    scaler.fit_transform(alz_not_na_helper),
    columns=alz_not_na_helper.columns,
)

In [None]:
sorted_columns_alz = alz_not_na_helper_scaled.columns.sort_values()
alz_not_na_helper_scaled_result = alz_not_na_helper_scaled[sorted_columns_alz]

In [None]:
alz_not_na_scaled = pd.concat([cummulative_df_alz_not_na[['Entry','Gene']],alz_not_na_helper_scaled_result],axis =1)

In [None]:
alz_not_na_scaled.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Alzheimers/alzheimers_cummulative_notna_scaled.csv')

# Cumulative_Glioma_Glioblastoma

In [None]:
cummulative_df_gg= pd.read_excel('/content/Human_Uniprot to Gene.xlsx')

## BDMC

In [None]:
# Glioma BDMC scores: gene name plus a disease-specific score column.
df_bdmc_g1 = (
    pd.read_csv('/content/drive/MyDrive/BrainProt ML Work/BDMC Data/BDMC_Final_Glioma_D005910.csv')
    .rename(columns={'geneName': 'Gene'})
    .loc[:, ['Gene', 'BDMC_Score']]
    .rename(columns={'BDMC_Score': 'BDMC_Score_Glioma'})
)
# Glioblastoma BDMC scores, shaped the same way so both can be merged on 'Gene'.
df_bdmc_g2 = (
    pd.read_csv('/content/drive/MyDrive/BrainProt ML Work/BDMC Data/BDMC_Final_Glioblastoma_D005909.csv')
    .rename(columns={'geneName': 'Gene'})
    .loc[:, ['Gene', 'BDMC_Score']]
    .rename(columns={'BDMC_Score': 'BDMC_Score_Glioblastoma'})
)

In [None]:
df_gg = pd.merge(df_bdmc_g1, df_bdmc_g2, on='Gene', how='left')

In [None]:
cummulative_df_gg = pd.merge(cummulative_df_gg, df_gg, on = 'Gene', how ='left')

## BDTM

In [None]:
input_files_gg = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/BDTM Data/3. Glioma and 4. Glioblastoma/*.xlsx'))
metadata_files_gg = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/BDTM Data/3. Glioma and 4. Glioblastoma/*.csv'))
base_folder_path_gg = '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Glioma and Glioblastoma'

In [None]:
store_results_for_datasets_BDTM(metadata_files_gg, input_files_gg, base_folder_path_gg)

GSE12657
25
h [1.75 1.15 0.1  ... 0.6  0.85 0.85]
g [1.05 0.9  1.25 ... 0.6  0.8  0.85]
[0.         0.34285714 0.94285714 ... 0.65714286 0.51428571 0.51428571] [0.38235294 0.47058824 0.26470588 ... 0.64705882 0.52941176 0.5       ]
GSE13276
15
h [1.3 0.  0.9 ... 0.1 1.3 1.3]
g [0.8 1.  1.1 ... 0.6 0.7 1. ]
[0.27777778 1.         0.5        ... 0.94444444 0.27777778 0.27777778] [0.55555556 0.44444444 0.38888889 ... 0.66666667 0.61111111 0.44444444]


In [None]:
get_cumulative_results_without_FC_BDTM(gse_folders_path=glob.glob("/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Glioma and Glioblastoma/GSE*"), path_to_save = "/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Glioma and Glioblastoma/Cummulative_results")

In [None]:
gg_cummu_res_files_bdtm = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Glioma and Glioblastoma/Cummulative_results/*.csv'))

In [None]:
# Attach every BDTM dataset's per-gene scores to the cumulative Glioma/Glioblastoma frame.
for results_path in gg_cummu_res_files_bdtm:
  df = pd.read_csv(results_path).drop(columns='Unnamed: 0')
  # File names look like "<dataset>_cumulative_results.csv"; keep the dataset id.
  dataset_name = results_path.split('/')[-1].split('_cumulative')[0]
  # The first column is the gene key; every score column gets a "_<dataset>" suffix
  # so columns from different datasets stay distinguishable after merging.
  df.columns = [col if pos == 0 else col + '_' + dataset_name for pos, col in enumerate(df.columns)]
  df = df.rename(columns={'geneName': 'Gene'})
  print(df.columns)
  # A left merge repeats a cumulative row once per matching row on the right, and
  # pandas also matches missing keys to each other. Collapse the results to one
  # row per gene (mean of its scores) first; groupby drops rows without a gene name.
  df = df.groupby('Gene', as_index=False).mean(numeric_only=True)
  cummulative_df_gg = pd.merge(cummulative_df_gg, df, on='Gene', how='left')

Index(['Gene', 'ERGS_GSE12657', 'IFSER_GSE12657', 'PCC_GSE12657',
       'RF_GSE12657', 'SVMWV_GSE12657', 'COMB_GSE12657'],
      dtype='object')
Index(['Gene', 'ERGS_GSE13276', 'IFSER_GSE13276', 'PCC_GSE13276',
       'RF_GSE13276', 'SVMWV_GSE13276', 'COMB_GSE13276'],
      dtype='object')


In [None]:
cummulative_df_gg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20426 entries, 0 to 20425
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Entry                    20426 non-null  object 
 1   Gene                     20293 non-null  object 
 2   BDMC_Score_Glioma        20235 non-null  float64
 3   BDMC_Score_Glioblastoma  20235 non-null  float64
 4   ERGS_GSE12657            8620 non-null   float64
 5   IFSER_GSE12657           8620 non-null   float64
 6   PCC_GSE12657             8620 non-null   float64
 7   RF_GSE12657              8620 non-null   float64
 8   SVMWV_GSE12657           8620 non-null   float64
 9   COMB_GSE12657            8620 non-null   float64
 10  ERGS_GSE13276            12299 non-null  float64
 11  IFSER_GSE13276           12299 non-null  float64
 12  PCC_GSE13276             12299 non-null  float64
 13  RF_GSE13276              12299 non-null  float64
 14  SVMWV_GSE13276        

## BDPM

In [None]:
gg_bdpm_files = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/BDPM Data/3. Glioma and 4. Glioblastoma/*.csv'))
base_folder_path_gg = '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Glioma and Glioblastoma'

In [None]:
store_results_for_datasets_BDPM(gg_bdpm_files, base_folder_path_gg)

PXD014606_Processed
Index(['proteinId', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9',
       'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19',
       'S20', 'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29',
       'S30', 'S31', 'S32', 'S33', 'S34', 'S35', 'S36', 'S37', 'S38', 'S39',
       'S40', 'S41', 'S42', 'S43', 'S44', 'S45', 'S46', 'S47', 'S48'],
      dtype='object')
48
h [1.70833333 1.16666667 1.5        ... 0.70833333 0.33333333 0.66666667]
g [0.83333333 1.41666667 1.41666667 ... 1.20833333 0.875      1.29166667]
[0.08888889 0.37777778 0.2        ... 0.62222222 0.82222222 0.64444444] [0.53488372 0.20930233 0.20930233 ... 0.3255814  0.51162791 0.27906977]
PXD028931_Processed
Index(['proteinId', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9',
       'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19',
       'S20', 'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29',
       'S30', 'S31', 'S32', 'S33', 'S34

In [None]:
get_cumulative_results_without_FC_BDPM(gse_folders_path=glob.glob("/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Glioma and Glioblastoma/PXD*"), path_to_save = "/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Glioma and Glioblastoma/Cummulative_results")

In [None]:
gg_cummu_res_files = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Glioma and Glioblastoma/Cummulative_results/*.csv'))

In [None]:
gg_cummu_res_files

['/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Glioma and Glioblastoma/Cummulative_results/PXD014606_Processed_cumulative_results.csv',
 '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Glioma and Glioblastoma/Cummulative_results/PXD028931_Processed_cumulative_results.csv']

In [None]:
# Attach every BDPM dataset's per-protein scores to the cumulative Glioma/Glioblastoma frame.
for i in gg_cummu_res_files:
  df = pd.read_csv(i).drop(columns='Unnamed: 0')
  # File names look like "<dataset>_Processed_cumulative_results.csv".
  dataset_name = i.split('/')[-1].split('_Processed')[0]
  print(dataset_name)
  # Leave the key column (first) alone and suffix each score column with the dataset id.
  df.columns = [
      name if position == 0 else name + '_' + dataset_name
      for position, name in enumerate(df.columns)
  ]
  df = df.rename(columns={'proteinId': 'Entry'})
  print(df.columns)
  cummulative_df_gg = cummulative_df_gg.merge(df, on='Entry', how='left')

PXD014606
Index(['Entry', 'ERGS_PXD014606', 'IFSER_PXD014606', 'PCC_PXD014606',
       'RF_PXD014606', 'SVMWV_PXD014606', 'COMB_PXD014606'],
      dtype='object')
PXD028931
Index(['Entry', 'ERGS_PXD028931', 'IFSER_PXD028931', 'PCC_PXD028931',
       'RF_PXD028931', 'SVMWV_PXD028931', 'COMB_PXD028931'],
      dtype='object')


## Saving Data

In [None]:
cummulative_df_gg.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Glioma_Glioblastoma/Glioma_Glioblastoma_cummulative.csv')

In [None]:
# Keep only the rows scored for both Glioma and Glioblastoma, renumber them
# from 0, and floor negative BDMC scores at 0.
cummulative_df_gg_not_na = (
    cummulative_df_gg
    .dropna(subset=['BDMC_Score_Glioma', 'BDMC_Score_Glioblastoma'])
    .reset_index(drop=True)
)
cummulative_df_gg_not_na['BDMC_Score_Glioma'] = cummulative_df_gg_not_na['BDMC_Score_Glioma'].clip(lower=0)
cummulative_df_gg_not_na['BDMC_Score_Glioblastoma'] = cummulative_df_gg_not_na['BDMC_Score_Glioblastoma'].clip(lower=0)

In [None]:
cummulative_df_gg_not_na.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Glioma_Glioblastoma/Glioma_Glioblastoma_cummulative_notna.csv')

In [None]:
cummulative_df_gg_not_na.fillna(0, inplace=True)

In [None]:
gg_not_na_helper = cummulative_df_gg_not_na.drop(['Entry','Gene'],axis = 1)

In [None]:
# Rescale every numeric column to [0, 1] and restore the column labels,
# which fit_transform drops by returning a bare array.
scaler = MinMaxScaler()
gg_not_na_helper_scaled = pd.DataFrame(
    scaler.fit_transform(gg_not_na_helper),
    columns=gg_not_na_helper.columns,
)

In [None]:
sorted_columns_gg = gg_not_na_helper_scaled.columns.sort_values()
gg_not_na_helper_scaled_result = gg_not_na_helper_scaled[sorted_columns_gg]

In [None]:
gg_not_na_scaled = pd.concat([cummulative_df_gg_not_na[['Entry','Gene']],gg_not_na_helper_scaled_result],axis =1)

In [None]:
gg_not_na_scaled.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Glioma_Glioblastoma/Glioma_Glioblastoma_cummulative_notna_scaled.csv')

# Cumulative_Meningioma

In [None]:
cummulative_df_menin = pd.read_excel('/content/Human_Uniprot to Gene.xlsx')

## BDMC

In [None]:
# Meningioma BDMC table, reduced to the gene name (renamed to match the
# cumulative frame's 'Gene' key) and its BDMC score.
df_bdmc_menin = (
    pd.read_csv('/content/drive/MyDrive/BrainProt ML Work/BDMC Data/BDMC_Final_Meningioma_D008579.csv')
    .rename(columns={'geneName': 'Gene'})
    .loc[:, ['Gene', 'BDMC_Score']]
)

In [None]:
cummulative_df_menin = pd.merge(cummulative_df_menin, df_bdmc_menin, on='Gene', how='left')
cummulative_df_menin.head()

Unnamed: 0,Entry,Gene,BDMC_Score
0,A0A087X1C5,CYP2D7,0.0
1,A0A0B4J2F0,PIGBOS1,0.0
2,A0A0B4J2F2,SIK1B,0.0
3,A0A0C5B5G6,MT-RNR1,0.0
4,A0A0K2S4Q6,CD300H,0.0


## BDPM

In [None]:
menin_bdpm_files = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/BDPM Data/1. Meningioma/*.csv'))
base_folder_path_menin = '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Meningioma'

In [None]:
store_results_for_datasets_BDPM(menin_bdpm_files, base_folder_path_menin)

PXD007073_Processed
Index(['proteinId', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9',
       'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19',
       'S20', 'S21', 'S22', 'S23', 'S24', 'S25'],
      dtype='object')
25
h [1.3030303  0.         0.96969697 ... 1.43939394 1.25757576 0.42424242]
g [0.09090909 0.27272727 0.27272727 ... 0.6969697  0.04545455 0.46969697]
[0.26495726 1.         0.45299145 ... 0.18803419 0.29059829 0.76068376] [0.94871795 0.84615385 0.84615385 ... 0.60683761 0.97435897 0.73504274]
PXD014852_Processed
Index(['proteinId', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9',
       'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19',
       'S20', 'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29'],
      dtype='object')
29
h [1.19642857 0.23809524 0.         ... 0.92857143 1.33928571 1.07142857]
g [0.45833333 0.45833333 0.69642857 ... 0.78571429 0.92857143 1.43452381]
[0.30927835 0.86254296 1.         ... 0.

In [None]:
get_cumulative_results_without_FC_BDPM(gse_folders_path=glob.glob("/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Meningioma/PXD*"), path_to_save = "/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Meningioma/Cummulative_results")

In [None]:
menin_cummu_res_files = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDPM Analysis/Meningioma/Cummulative_results/*.csv'))

In [None]:
# Attach every BDPM dataset's per-protein scores to the cumulative Meningioma frame.
for i in menin_cummu_res_files:
  df = pd.read_csv(i).drop(columns='Unnamed: 0')
  # File names look like "<dataset>_Processed_cumulative_results.csv".
  dataset_name = i.split('/')[-1].split('_Processed')[0]
  print(dataset_name)
  # Leave the key column (first) alone and suffix each score column with the dataset id.
  df.columns = [
      name if position == 0 else name + '_' + dataset_name
      for position, name in enumerate(df.columns)
  ]
  df = df.rename(columns={'proteinId': 'Entry'})
  print(df.columns)
  cummulative_df_menin = cummulative_df_menin.merge(df, on='Entry', how='left')

PXD007073
Index(['Entry', 'ERGS_PXD007073', 'IFSER_PXD007073', 'PCC_PXD007073',
       'RF_PXD007073', 'SVMWV_PXD007073', 'COMB_PXD007073'],
      dtype='object')
PXD014852
Index(['Entry', 'ERGS_PXD014852', 'IFSER_PXD014852', 'PCC_PXD014852',
       'RF_PXD014852', 'SVMWV_PXD014852', 'COMB_PXD014852'],
      dtype='object')


In [None]:
cummulative_df_menin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20426 entries, 0 to 20425
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Entry            20426 non-null  object 
 1   Gene             20293 non-null  object 
 2   BDMC_Score       20235 non-null  float64
 3   ERGS_PXD007073   2309 non-null   float64
 4   IFSER_PXD007073  2309 non-null   float64
 5   PCC_PXD007073    2309 non-null   float64
 6   RF_PXD007073     2309 non-null   float64
 7   SVMWV_PXD007073  2309 non-null   float64
 8   COMB_PXD007073   2309 non-null   float64
 9   ERGS_PXD014852   2395 non-null   float64
 10  IFSER_PXD014852  2395 non-null   float64
 11  PCC_PXD014852    2395 non-null   float64
 12  RF_PXD014852     2395 non-null   float64
 13  SVMWV_PXD014852  2395 non-null   float64
 14  COMB_PXD014852   2395 non-null   float64
dtypes: float64(13), object(2)
memory usage: 2.5+ MB


## BDTM

In [None]:
input_files_menin = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/BDTM Data/1. Meningioma/*.xlsx'))
metadata_files_menin = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/BDTM Data/1. Meningioma/*.csv'))
base_folder_path_menin = '/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Meningioma'

In [None]:
store_results_for_datasets_BDTM(metadata_files_menin, input_files_menin, base_folder_path_menin)

GSE43290
51
h [0.19148936 0.31382979 0.69148936 ... 0.94680851 0.08510638 0.71276596]
g [0.19148936 0.42021277 0.37765957 ... 0.35638298 0.06382979 0.39893617]
[0.88461538 0.81089744 0.58333333 ... 0.42948718 0.94871795 0.57051282] [0.8902439  0.75914634 0.78353659 ... 0.79573171 0.96341463 0.77134146]
GSE54934
25
h [0.56060606 0.09090909 0.         ... 1.07575758 1.12121212 1.25757576]
g [0.37878788 1.40909091 0.18181818 ... 0.6969697  0.09090909 0.13636364]
[0.69918699 0.95121951 1.         ... 0.42276423 0.39837398 0.32520325] [0.79166667 0.225      0.9        ... 0.61666667 0.95       0.925     ]


In [None]:
get_cumulative_results_without_FC_BDTM(gse_folders_path=glob.glob("/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Meningioma/GSE*"), path_to_save = "/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Meningioma/Cummulative_results")

In [None]:
menin_cummu_res_files_bdtm = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/BDTM Analysis/Meningioma/Cummulative_results/*.csv'))

In [None]:
# Attach every BDTM dataset's per-gene scores to the cumulative Meningioma frame.
for results_path in menin_cummu_res_files_bdtm:
  df = pd.read_csv(results_path).drop(columns='Unnamed: 0')
  # File names look like "<dataset>_cumulative_results.csv"; keep the dataset id.
  dataset_name = results_path.split('/')[-1].split('_cumulative')[0]
  # The first column is the gene key; every score column gets a "_<dataset>" suffix
  # so columns from different datasets stay distinguishable after merging.
  df.columns = [col if pos == 0 else col + '_' + dataset_name for pos, col in enumerate(df.columns)]
  df = df.rename(columns={'geneName': 'Gene'})
  print(df.columns)
  # A left merge repeats a cumulative row once per matching row on the right, so a
  # gene listed several times in a dataset duplicated its Entry rows (the frame grew
  # from 20426 to 20633 rows at this step). pandas also matches missing keys to each
  # other. Collapse the results to one row per gene (mean of its scores) first;
  # groupby drops rows without a gene name.
  df = df.groupby('Gene', as_index=False).mean(numeric_only=True)
  cummulative_df_menin = pd.merge(cummulative_df_menin, df, on='Gene', how='left')

Index(['Gene', 'ERGS_GSE43290', 'IFSER_GSE43290', 'PCC_GSE43290',
       'RF_GSE43290', 'SVMWV_GSE43290', 'COMB_GSE43290'],
      dtype='object')
Index(['Gene', 'ERGS_GSE54934', 'IFSER_GSE54934', 'PCC_GSE54934',
       'RF_GSE54934', 'SVMWV_GSE54934', 'COMB_GSE54934'],
      dtype='object')


In [None]:
cummulative_df_menin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20633 entries, 0 to 20632
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Entry            20633 non-null  object 
 1   Gene             20500 non-null  object 
 2   BDMC_Score       20442 non-null  float64
 3   ERGS_PXD007073   2344 non-null   float64
 4   IFSER_PXD007073  2344 non-null   float64
 5   PCC_PXD007073    2344 non-null   float64
 6   RF_PXD007073     2344 non-null   float64
 7   SVMWV_PXD007073  2344 non-null   float64
 8   COMB_PXD007073   2344 non-null   float64
 9   ERGS_PXD014852   2426 non-null   float64
 10  IFSER_PXD014852  2426 non-null   float64
 11  PCC_PXD014852    2426 non-null   float64
 12  RF_PXD014852     2426 non-null   float64
 13  SVMWV_PXD014852  2426 non-null   float64
 14  COMB_PXD014852   2426 non-null   float64
 15  ERGS_GSE43290    12416 non-null  float64
 16  IFSER_GSE43290   12416 non-null  float64
 17  PCC_GSE43290

## Saving Data

In [None]:
cummulative_df_menin.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Meningioma/meningioma_cummulative.csv')

In [None]:
# Keep only the rows that received a BDMC score, renumber them from 0,
# and floor negative BDMC scores at 0.
cummulative_df_menin_not_na = (
    cummulative_df_menin
    .dropna(subset=['BDMC_Score'])
    .reset_index(drop=True)
)
cummulative_df_menin_not_na['BDMC_Score'] = cummulative_df_menin_not_na['BDMC_Score'].clip(lower=0)

In [None]:
cummulative_df_menin_not_na.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Meningioma/meningioma_cummulative_notna.csv')

In [None]:
cummulative_df_menin_not_na.fillna(0, inplace=True)

In [None]:
cummulative_df_menin_not_na

Unnamed: 0,Entry,Gene,BDMC_Score,ERGS_PXD007073,IFSER_PXD007073,PCC_PXD007073,RF_PXD007073,SVMWV_PXD007073,COMB_PXD007073,ERGS_PXD014852,...,PCC_GSE43290,RF_GSE43290,SVMWV_GSE43290,COMB_GSE43290,ERGS_GSE54934,IFSER_GSE54934,PCC_GSE54934,RF_GSE54934,SVMWV_GSE54934,COMB_GSE54934
0,A0A087X1C5,CYP2D7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.178875,0.070669,0.129929,1.895016,0.519775,0.413565,0.084657,0.044127,0.157865,1.219988
1,A0A087X1C5,CYP2D7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.178875,0.070669,0.129929,1.895016,0.362178,0.431617,0.105446,0.170885,0.046500,1.116626
2,A0A0B4J2F0,PIGBOS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,A0A0B4J2F2,SIK1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,A0A0C5B5G6,MT-RNR1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20437,Q9UI54,PRO0628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20438,Q9UI72,PRO0255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20439,Q9Y6C7,LINC00312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.508308,0.364497,0.361919,2.630885,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
20440,Q9Y6Z2,LINC01558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.067810,0.081698,0.023065,0.969941,0.924510,0.676844,0.238989,0.055408,0.178936,2.074687


In [None]:
menin_not_na_helper = cummulative_df_menin_not_na.drop(['Entry','Gene'],axis = 1)

In [None]:
# Rescale every numeric column to [0, 1] and restore the column labels,
# which fit_transform drops by returning a bare array.
scaler = MinMaxScaler()
menin_not_na_helper_scaled = pd.DataFrame(
    scaler.fit_transform(menin_not_na_helper),
    columns=menin_not_na_helper.columns,
)

In [None]:
sorted_columns_menin = menin_not_na_helper_scaled.columns.sort_values()
menin_not_na_helper_scaled_result = menin_not_na_helper_scaled[sorted_columns_menin]

In [None]:
menin_not_na_scaled = pd.concat([cummulative_df_menin_not_na[['Entry','Gene']],menin_not_na_helper_scaled_result],axis =1)

In [None]:
menin_not_na_scaled.to_csv('/content/drive/MyDrive/BrainProt ML Work/Analysed Data/Cummulative_Meningioma/meningioma_cummulative_notna_scaled.csv')

# Analysis

In [None]:
# Installs the third-party `venn` plotting package into the running environment.
# NOTE(review): the install is unpinned, while the plotting code in this section
# calls venn.get_labels and venn.venn6 — confirm the installed release provides them.
!pip install venn
import venn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def show_venn_diagram(path, method_name, fig_path, num_top_genes=300):
    """Plot and save a Venn diagram of the top-ranked genes of each dataset.

    Every CSV matched by ``path`` is ranked by ``method_name`` (highest score
    first) and the gene names of its first ``num_top_genes`` rows form one set.
    The genes shared by all sets are printed, and the figure is saved as
    ``<fig_path>/<method_name>-<num_top_genes>.png``.

    Args:
        path: Glob pattern matching one results CSV per dataset. Each file must
            contain a 'geneName' column and a ``method_name`` column.
        method_name: Score column to rank by; also used as the figure title.
        fig_path: Output directory for the figure; created when missing.
        num_top_genes: Number of top-ranked genes taken from each dataset.

    Raises:
        ValueError: If ``path`` does not match between 2 and 6 files.
    """
    # Resolve the pattern once, sorted, so the gene sets and their labels come
    # from the same file list in a reproducible order.
    csv_files = sorted(glob.glob(path))
    if not 2 <= len(csv_files) <= 6:
        raise ValueError(
            f'Expected between 2 and 6 CSV files matching {path!r}, found {len(csv_files)}'
        )

    gene_sets = []
    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        top_rows = df[['geneName', method_name]].sort_values(by=method_name, ascending=False).head(num_top_genes)
        gene_sets.append(set(top_rows['geneName']))
    # The dataset label is the file name up to its first underscore.
    dataset_names = [p.split('/')[-1].split('_')[0] for p in csv_files]

    common_genes = list(set.intersection(*gene_sets))
    print(f'Common gene names: {common_genes}')

    labels = venn.get_labels(gene_sets, fill=['number'])
    # Pick the plotting function that matches the number of datasets (venn6 for six).
    # NOTE(review): assumes the installed venn package exposes venn2 ... venn5
    # next to venn6 — confirm for the release in use.
    plot_venn = getattr(venn, f'venn{len(gene_sets)}')
    fig, ax = plot_venn(labels, names=dataset_names)
    ax.set_title(method_name, fontsize = 22)

    os.makedirs(fig_path, exist_ok=True)
    fig.savefig(os.path.join(fig_path, method_name + '-' + str(num_top_genes) +'.png'))
    fig.show()

In [None]:
import warnings
# Suppresses every Python warning from this point on, not only those raised while plotting.
warnings.filterwarnings('ignore')

# Score columns to plot; one Venn diagram is drawn and saved per entry.
# NOTE(review): 'EGRS' — the cumulative result files merged earlier in this
# notebook name this column 'ERGS'; confirm the spelling used in the Venn CSVs.
method = ['EGRS','RF','GR','SVMWV','COMB','IFSER','IG', 'PCC']

# Glob pattern for the per-dataset inputs and the directory that receives the figures.
gse_folder = '/content/drive/MyDrive/DDP/Venn/Equal Features/GSE*'
fig_path = '/content/drive/MyDrive/DDP/Venn/Equal Features/Figures'
for i in method:
    print(i)
    # Compares the top 2000 genes of each dataset for this method.
    show_venn_diagram(gse_folder,i, fig_path, num_top_genes=2000)

# Data for ML Modeling: Parkinson's

In [None]:
import copy

In [None]:
input_files_bdtm_park  = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/BDTM Data/11. Parkinson/*.xlsx'))
metadata_files_bdtm_park = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/BDTM Data/11. Parkinson/*.csv'))
base_folder_path_bdtm = '/content/drive/MyDrive/BrainProt ML Work/Advait_data/BDTM_25'

In [None]:
top_genes = [
    "DNAJC6", "SYNJ1", "HTT", "CDK5", "SLC17A6", "SNCA", "MFN2",
    "DNAJB1", "UCHL1", "SNCG", "TH", "HSPA9", "DNM1L", "FTL", "GAD1",
    "EIF4G1", "COMT", "ENO2", "GPX4", "APOE", "SIRT2", "HSPA8",
    "NQO1", "HTRA2", "HIP1R"
]

In [None]:
def store_top_gene_BDTM(input_files, metadata_files, base_folder_path,top_genes):
  """Export the selected gene columns (plus ``Target``) of each dataset to CSV.

  ``input_files[k]`` (Excel) is paired with ``metadata_files[k]`` (CSV). Each
  pair is combined with ``create_data`` and reshaped with ``processed_data``.
  The columns listed in ``top_genes`` that exist in the dataset, followed by
  ``Target``, are then written to
  ``<base_folder_path>/<dataset>/<dataset>_top_gene.csv``.

  Parameters
  ----------
  input_files : list of str
      Paths of the Excel expression files.
  metadata_files : list of str
      Paths of the matching metadata CSV files, in the same order.
  base_folder_path : str
      Directory under which one sub-folder per dataset is created.
  top_genes : list of str
      Gene names to keep; genes absent from a dataset are skipped. The list
      is not modified.

  Raises
  ------
  ValueError
      If the two file lists differ in length, because index-based pairing
      would then be misaligned.
  """
  if len(input_files) != len(metadata_files):
    raise ValueError(
      f"input_files ({len(input_files)}) and metadata_files "
      f"({len(metadata_files)}) must have the same length"
    )

  # Build the wanted column list once, leaving the caller's list untouched.
  wanted_columns = list(top_genes) + ["Target"]

  for input_file, metadata_file in zip(input_files, metadata_files):
    input_df = pd.read_excel(input_file)
    metadata_df = pd.read_csv(metadata_file)

    # Dataset name = file name up to "Input", minus surrounding "_" and spaces.
    folder_name = os.path.basename(input_file).split("Input")[0].strip("_").strip()
    print(folder_name)
    folder_path = os.path.join(base_folder_path, folder_name)
    os.makedirs(folder_path, exist_ok=True)

    df = create_data(metadata_df, input_df)
    # processed_data returns a slice of the transposed frame; copy it so the
    # assignment below does not raise SettingWithCopyWarning.
    processed_df = processed_data(df).copy()
    processed_df.iloc[0,0] = "Target"
    # Promote the first row to the header, then drop it from the data.
    processed_df.columns = processed_df.iloc[0,:]
    processed_df = processed_df.iloc[1:,:]

    common_cols = [col for col in wanted_columns if col in processed_df.columns]
    final_df = processed_df[common_cols]
    final_df.to_csv(os.path.join(folder_path, folder_name + '_top_gene.csv'))

In [None]:
# Write one <dataset>_top_gene.csv per Parkinson BDTM dataset.
store_top_gene_BDTM(input_files_bdtm_park, metadata_files_bdtm_park, base_folder_path_bdtm,top_genes)

GSE19587
GSE20141
GSE20146
GSE20163
GSE20164
GSE20168
GSE20291
GSE20292
GSE20314
GSE20333
GSE24378
GSE7621
GSE8397_U133A
GSE8397_U133B


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_transposed.rename(index={'geneName':'SampleId'},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_df.iloc[0,0] = "Target"


In [None]:
# Sorted list of the Parkinson BDPM input CSV files.
input_files_bdpm_park = sorted(glob.glob('/content/drive/MyDrive/BrainProt ML Work/BDPM Data/11. Parkinson/*csv'))
# Root directory under which one sub-folder per dataset is written.
base_folder_path_bdpm = '/content/drive/MyDrive/BrainProt ML Work/Advait_data/BDPM_25'

In [None]:
# The 25 protein identifiers to keep as feature columns when exporting each
# dataset with store_top_protein_bdpm.
# NOTE(review): these look like UniProt accessions listed in the same order as
# `top_genes` - confirm.
top_proteins = [
    "O75061", "O43426", "P42858", "Q00535", "Q9P2U8",
    "P37840", "O95140", "P25685", "P09936", "O76070",
    "P07101", "P38646", "O00429", "P02792", "Q99259",
    "Q04637", "P21964", "P09104", "P36969", "P02649",
    "Q8IXJ6", "P11142", "P15559", "O43464", "O75146",
]

In [None]:
def _bdpm_dataset_name(file_path):
  """Return the dataset name encoded in a BDPM input file name."""
  name = os.path.basename(file_path).split("Input")[0]
  # Remove real suffixes. str.strip("_Group.csv") would instead strip any of
  # those characters from both ends (e.g. "sample_Group.csv" -> "ample").
  for suffix in (".csv", "_Group"):
    if name.endswith(suffix):
      name = name[:-len(suffix)]
  return name.strip("_").strip()


def store_top_protein_bdpm(input_files,base_folder_path,top_proteins):
  """Export the selected protein columns (plus ``Target``) of each dataset to CSV.

  Every input CSV must have a ``proteinId`` column followed by one column per
  sample. Its first data row holds the group label of each sample
  (``Control`` -> 0, ``Disease`` -> 1); the remaining rows are proteins. The
  table is transposed to samples x proteins, converted to numeric (values
  that cannot be parsed become NaN) and the columns listed in
  ``top_proteins`` that exist in the dataset, followed by ``Target``, are
  written to ``<base_folder_path>/<dataset>/<dataset>_top_proteins.csv``.

  Parameters
  ----------
  input_files : list of str
      Paths of the input CSV files.
  base_folder_path : str
      Directory under which one sub-folder per dataset is created.
  top_proteins : list of str
      Protein identifiers to keep; identifiers absent from a dataset are
      skipped. The list is not modified.
  """
  # Build the wanted column list once, leaving the caller's list untouched.
  wanted_columns = list(top_proteins) + ["Target"]

  for input_file in input_files:
    data = pd.read_csv(input_file)
    folder_name = _bdpm_dataset_name(input_file)
    print(folder_name)
    folder_path = os.path.join(base_folder_path, folder_name)
    print(data.columns)
    os.makedirs(folder_path, exist_ok=True)

    # Extract and encode labels; any label other than Control/Disease maps to NaN.
    labels = data.iloc[0, 1:].map({'Control': 0, 'Disease': 1})
    # Drop the label row, then transpose to one row per sample.
    data_processed = data.drop(0).set_index('proteinId').T
    data_processed['Target'] = labels.values
    data_processed = data_processed.apply(pd.to_numeric, errors='coerce')
    # Rename index to "SampleID"
    data_processed.index.name = "SampleID"

    common_cols = [col for col in wanted_columns if col in data_processed.columns]
    final_df = data_processed[common_cols]
    final_df.to_csv(os.path.join(folder_path, folder_name + '_top_proteins.csv'))

In [None]:
# Write one <dataset>_top_proteins.csv per Parkinson BDPM dataset.
store_top_protein_bdpm(input_files_bdpm_park,base_folder_path_bdpm,top_proteins)

PXD008036_Processed
Index(['proteinId', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9',
       'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19',
       'S20', 'S21', 'S22', 'S23', 'S24'],
      dtype='object')
PXD022092_Processed
Index(['proteinId', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9',
       'S10'],
      dtype='object')
PXD024998_Processed
Index(['proteinId', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6'], dtype='object')
