<a href="https://colab.research.google.com/github/ahmedhesham47/Machine-Learning-Integrative-Framework-for-Predicting-ICB-Response/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Importing Packages**

In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
from sklearn.feature_selection import f_classif
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.pipeline import make_pipeline
from itertools import combinations
from functools import reduce
from sklearn.model_selection import GridSearchCV
import itertools
from scipy import stats
from sklearn.svm import SVC

## **Helper Functions**

In [4]:
'''
A function that pre-processes gene expression data.
It takes as an input a gene expression dataframe, some other parameter, and returns a filtered, pre-processed gene expression dataframe.
'''

def GeneExpressionPreprocessing(mRNAFile, GeneColumnName, Need_Transpose=True, Percent_To_FilterOut=20):
  if Need_Transpose == True: # transpose the file such that genes are columns and samples are rows
    mRNAFile = mRNAFile.set_index(GeneColumnName)
    mRNAFile = mRNAFile.T
    mRNAFile = mRNAFile.reset_index()
    new_columns = {'index': 'Sample'}
    mRNAFile = mRNAFile.rename(columns=new_columns)
    mRNAFile = mRNAFile.rename_axis('index', axis=1)
  # Filter out genes which are zeroes in certain percentage from the data.
  filtered_data = mRNAFile.loc[:, (mRNAFile == 0).mean() * 100 <= Percent_To_FilterOut]
  return filtered_data

In [5]:
'''
A function that merges columns based on a common column.
It takes as an input a list of all the dataframes that should be merged and the merging column
This function is very important as we will use it often later
'''

def merge_on_common_column(dataframes, merge_column):
    merged_df = dataframes[0]
    for df in dataframes[1:]:
        merged_df = pd.merge(merged_df, df, on=merge_column)
    return merged_df

In [6]:
'''
A function that merges more than two dataframes.
'''

def merge_dataframes(dataframeslist, commoncolumn):
  return reduce(lambda left, right: pd.merge(left, right, on=commoncolumn, how='inner'), dataframeslist)

In [47]:
'''
A function to get all best combinations from genes selected via feature selection methods.
It takes as an input a merged dataframe with all the selected genes and build all the combinations from 5 selection methods.
It returns "Final_selected_genes", which is a list of the genes coming from the combination that leads to the maximum number of genes.
'''

def Best_Combinations(merged):
  column_names = merged.columns
  column_combinations = combinations(column_names, 5)
  intersections = {}
  # Compute the intersection for each combination
  for combination in column_combinations:
      # Drop NaN values and find the intersections from that particular combination
      intersected_genes = set(merged[combination[0]].dropna()) & set(merged[combination[1]].dropna()) & \
                          set(merged[combination[2]].dropna()) & set(merged[combination[3]].dropna()) & \
                          set(merged[combination[4]].dropna())
      intersections[combination] = intersected_genes
  intersections_summary = {combo: len(genes) for combo, genes in intersections.items()} # this is not needed, it can only be used to see what each combination produced
  max_combination = max(intersections, key=lambda x: len(intersections[x])) # the combination leading to the maximum number of genes
  max_genes = intersections[max_combination] # the genes themselves
  Final_Selected_Genes = list(max_genes) # making sures they are in a list
  return Final_Selected_Genes

In [8]:
'''
A function to get a merged dataframe from all the feature selection methods
This is important because it will be passed to the function just explained above
It returns a merged dataframe.
'''

def get_merged_df(rf, fdr, cor, sfm, gb, rfe):
  #fdr.rename(columns={'Feature': 'Gene'}, inplace=True)

  rf_top_genes = rf['Feature']

  sfm_selected_genes = sfm['Feature']

  correlation_top_genes = cor['Feature']

  gb_top_genes = gb['Feature']

  fdr_top_genes = fdr['Feature']

  rfe_selected_genes = rfe['Feature']

  all_genes = set(rf_top_genes) | set(sfm_selected_genes) | set(correlation_top_genes) | set(gb_top_genes) | set(fdr_top_genes) | set(rfe_selected_genes)
  all_genes_list = list(all_genes)
  # In sets, the "|" means union, so we get all the genes from all the feature selection methods

  col = ['Random_Forest', 'Select_From_Model', 'Gene_Response_Correlation', 'Gradient_Boosting', 'Fisher_Discriminant_Ratio', 'Recursive_Feature_Elimination']
  merged_df = pd.DataFrame(index=all_genes_list, columns=col) # this is an empty dataframe with rows as ALL the genes and columns as the feature selection methods

  # We will now fill the dataframe with either a value (name of the gene) or NaN, which means the gene was not selected via this feature selection method
  # For example...
  # Random_Forest     Select_From_Model"
  # TP53              NaN
  # And so on...

  merged_df.loc[rf_top_genes, "Random_Forest"] = rf_top_genes.values
  merged_df.loc[sfm_selected_genes, "Select_From_Model"] = sfm_selected_genes.values
  merged_df.loc[correlation_top_genes, "Gene_Response_Correlation"] = correlation_top_genes.values
  merged_df.loc[gb_top_genes, "Gradient_Boosting"] = gb_top_genes.values
  merged_df.loc[fdr_top_genes, "Fisher_Discriminant_Ratio"] = fdr_top_genes.values
  merged_df.loc[rfe_selected_genes, "Recursive_Feature_Elimination"] = rfe_selected_genes.values
  return merged_df

## **Importing & PreProcessing**



In [9]:
# Reading the raw data from the drive...
Raw_Normalized_Gene_Expression_Data = pd.read_csv('/content/drive/MyDrive/Magdi Yakoub Internship/R normalization/HTSeq_combined_counts_normalized.csv', sep=',')

In [10]:
# Processing gene expression data...
new_columns = {'Unnamed: 0': 'GeneID'}
Raw_Normalized_Gene_Expression_Data = Raw_Normalized_Gene_Expression_Data.rename(columns=new_columns)
Processed_Gene_Expression_Data = GeneExpressionPreprocessing(Raw_Normalized_Gene_Expression_Data, GeneColumnName="GeneID")

In [11]:
Processed_Gene_Expression_Data

index,Sample,MYH7,MYL2,DES,ANKRD1,ACTA1,ACTC1,TNNI3,MB,TTN,...,CACNA1A,TMEM86B,NKAPP1,SPDYE3,SLC6A16,FAM78B,NPAS3,YWHAEP1,RAD51B,GNB1L
0,SRR24302636,25141.164467,24974.93602,18950.532917,11316.151571,13198.501923,12403.925047,8020.369434,11928.269145,10997.659337,...,1.102473,1.531213,0.734982,1.837455,1.22497,1.163722,1.592461,1.22497,0.857479,1.469964
1,SRR24302631,34506.417656,20837.226101,17466.24879,10672.374747,8462.234967,11831.635139,5949.158292,12699.503294,11729.436519,...,0.820112,1.198626,1.072455,1.387882,0.946284,1.072455,1.387882,1.514054,0.757027,1.450968
2,SRR24302634,29442.339877,25530.233999,18468.033885,16037.041087,9860.731787,11386.92986,12195.532116,15522.453203,12307.282671,...,0.669165,2.258432,1.254684,2.593014,1.087393,1.003747,1.33833,1.505621,1.33833,1.421976
3,SRR24302641,28630.304754,26490.392335,17139.935036,11956.902373,8783.240669,11770.123016,7697.562036,16413.755643,10778.400976,...,1.209417,1.814126,1.285006,2.040891,0.831474,0.755886,1.360594,1.511771,1.360594,1.285006
4,SRR24302632,27105.263042,20690.782894,12887.683283,8774.703152,6898.159068,10942.215505,7114.248636,14957.709963,10278.838727,...,1.04764,0.992501,1.04764,1.102779,0.992501,0.827084,1.433613,1.488752,1.213057,1.213057
5,SRR24302643,24100.798888,14158.009803,13985.421405,11846.960715,8786.583108,11470.49995,5122.827542,11199.227663,10687.47147,...,1.362866,1.796505,1.053124,1.672608,1.238969,1.053124,1.486763,1.238969,1.177021,1.362866
6,SRR24302642,36691.897511,25767.188398,15179.006087,5232.82855,4892.506048,8726.087586,9675.695923,16344.362039,10189.861506,...,0.986882,1.290538,0.683226,1.821936,1.594194,1.062796,1.51828,1.594194,1.214624,1.442366
7,SRR24302639,33228.819578,22826.619356,19751.912885,13244.034586,10030.262926,13360.998342,4932.84088,11913.727065,10730.182984,...,1.241653,1.303736,0.744992,1.427901,1.241653,1.365819,1.365819,1.427901,1.055405,1.117488
8,SRR24302635,33218.719609,21983.072651,17909.892904,9954.551423,10179.932741,13519.049311,6577.084354,10369.376862,10700.995883,...,0.891872,1.20665,0.944335,1.626355,0.629557,1.049261,1.049261,0.786946,0.786946,1.049261
9,SRR24302640,22973.278472,13000.144723,15471.900973,8646.644124,5527.453277,10447.440177,2597.124925,9622.57703,10678.598158,...,0.657705,1.214224,1.113039,1.416595,0.75889,0.708298,1.163632,1.264817,0.961261,0.961261


In [12]:
data = {
    'Sample': ["SRR24302636", "SRR24302631", "SRR24302634", "SRR24302641", "SRR24302632", "SRR24302643", "SRR24302642", "SRR24302639", "SRR24302635", "SRR24302640", "SRR24302633"],
    'Condition': [1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0]
}

# Create DataFrame
Label_Gene_Expression_Data = pd.DataFrame(data)


In [13]:
Label_Gene_Expression_Data

Unnamed: 0,Sample,Condition
0,SRR24302636,1.0
1,SRR24302631,0.0
2,SRR24302634,1.0
3,SRR24302641,0.0
4,SRR24302632,1.0
5,SRR24302643,1.0
6,SRR24302642,0.0
7,SRR24302639,0.0
8,SRR24302635,1.0
9,SRR24302640,1.0


# **Feature Selection**

## **Random Forest**

In [14]:
def feature_selection_with_rf(df, label_df, commoncolumn='Sample', labelID ='Condition', n_feature=1000, n_estimators=100, random_state=42):
    merged_df = pd.merge(df, label_df, on=commoncolumn)
    X = merged_df.drop(columns=[commoncolumn, labelID])
    y = merged_df[labelID]
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X, y)
    rf_importances = rf.feature_importances_
    rf_importances_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_importances
    })
    rf_importances_df = rf_importances_df.sort_values(by='Importance', ascending=False)
    rf_importances_df = rf_importances_df.head(n_feature)
    rf_importances_df = rf_importances_df[rf_importances_df['Importance'] > 0]
    return rf_importances_df

In [15]:
rf_Expression = feature_selection_with_rf(Processed_Gene_Expression_Data, Label_Gene_Expression_Data, 'Sample', 'Condition', n_estimators=1000)

In [16]:
rf_Expression

Unnamed: 0,Feature,Importance
10812,CCDC18,0.006012
1646,RNF103,0.005010
5185,AIF1L,0.005010
10135,MB21D2,0.004008
1367,CALCOCO2,0.004008
...,...,...
4600,MTHFR,0.000306
4027,PSMC4,0.000306
826,LRRFIP1,0.000306
615,GBAS,0.000306


## **Recursive Feature Elimination**

In [17]:
def recursive_feature_elimination(df, labels_df, commoncolumn='Sample', labelID ='Condition', n_features_to_select=1000, step=100):
    merged_df = pd.merge(df, labels_df, on=commoncolumn)
    X = merged_df.drop([commoncolumn, labelID], axis=1)
    y = merged_df[labelID]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    logreg = LogisticRegression()
    rfe = RFE(estimator=logreg, n_features_to_select=n_features_to_select, step=step)
    rfe.fit(X_scaled, y)
    selected_features = pd.DataFrame({'Feature': X.columns,
                                      'Selected': rfe.support_,
                                      'Ranking': rfe.ranking_})
    s = selected_features.sort_values(by='Ranking', ascending=True)

    # Selecting genes with ranking = 1, that is the top n_features_to_select genes
    s = s[s['Ranking'] == 1]
    return s

In [18]:
rfe_Expression = recursive_feature_elimination(Processed_Gene_Expression_Data, Label_Gene_Expression_Data, 'Sample', 'Condition', step=50)

In [19]:
rfe_Expression

Unnamed: 0,Feature,Selected,Ranking
6925,ATP6V1A,True,1
2786,PPP1R13B,True,1
11394,CDNF,True,1
6816,SUPV3L1,True,1
1856,RNF150,True,1
...,...,...,...
3365,FAM110B,True,1
1672,KIAA1191,True,1
12123,GDPD2,True,1
9292,SPECC1,True,1


## **SelectFromModel**

In [21]:
def select_from_model(df, labels_df, commoncolumn='Sample', labelID ='Condition'):
# Selecting features using Logistic Regression and SelectFromModel
  merged_df = pd.merge(df, labels_df, on=commoncolumn)
  X = merged_df.drop([commoncolumn, labelID], axis=1)
  y = merged_df[labelID]
  log_reg = LogisticRegression()
  select_model = SelectFromModel(log_reg)
  select_model.fit(X, y)
  selection_status = select_model.get_support()
  SelectFromModel_genes_df = pd.DataFrame({
      'Feature': X.columns,
      'Selected': selection_status
  })
  SelectFromModel_genes_df['Selected'] = SelectFromModel_genes_df['Selected'].astype(int)
  SelectFromModel_genes_df = SelectFromModel_genes_df[SelectFromModel_genes_df['Selected'] == 1]

  return SelectFromModel_genes_df

In [22]:
sfm_Expression = select_from_model(Processed_Gene_Expression_Data, Label_Gene_Expression_Data, 'Sample', 'Condition')

In [23]:
sfm_Expression

Unnamed: 0,Feature,Selected
0,MYH7,1
1,MYL2,1
2,DES,1
3,ANKRD1,1
4,ACTA1,1
...,...,...
6549,COQ2,1
6908,FGF2,1
6986,ST8SIA4,1
7249,PHYKPL,1


## **Correlation**

In [24]:
def correlation(df, labels_df, threshold=0.2, commoncolumn='Sample', labelID ='Condition', is_df_categorical=True):
  merged_df = pd.merge(df, labels_df, on=commoncolumn)
  X = merged_df.drop([commoncolumn, labelID], axis=1)
  y = merged_df[labelID]

  # Was producing an error because Pearson correlation (calculated by corr function) cannot correlate categoricals to floats
  # So we added an else condition to handle the case where the data is not categorical to be calculated via point biserial method
  if is_df_categorical:
    correlation_values = X.apply(lambda gene: gene.corr(y))
  else:
    correlation_values = X.apply(lambda gene: stats.pointbiserialr(gene, y).correlation)


  gene_correlation = pd.DataFrame({
      'Feature': correlation_values.index,
      'Correlation': correlation_values
  })

  gene_correlation = gene_correlation.sort_values(by='Correlation', key=abs, ascending=False)

  selected_genes = gene_correlation[abs(gene_correlation['Correlation']) > threshold]

  return selected_genes

In [25]:
categorical_columns = []
for column in Label_Gene_Expression_Data.columns:
    if pd.api.types.is_categorical_dtype(Label_Gene_Expression_Data[column]):
        categorical_columns.append(column)

if categorical_columns:
    print("Categorical columns:", categorical_columns)
else:
    print("No categorical columns found.")

No categorical columns found.


In [26]:
Expression_correlation = correlation(Processed_Gene_Expression_Data, Label_Gene_Expression_Data, 0.2, is_df_categorical=False)

In [27]:
Expression_correlation

Unnamed: 0,Feature,Correlation
PLEKHA7,PLEKHA7,0.940768
BAIAP2,BAIAP2,-0.935140
LGR4,LGR4,0.925770
AMFR,AMFR,0.913920
BBS12,BBS12,-0.909168
...,...,...
SCOC-AS1,SCOC-AS1,0.200064
SYF2,SYF2,0.200043
ARPC5,ARPC5,-0.200040
STON2,STON2,-0.200026




## **Gradient Boosting**

In [28]:
def gradient_boosting(df, labels_df, commoncolumn='Sample', labelID ='Condition', nofeatures=1000, n_estimator = 100, random_state=42):
  merged_df = pd.merge(df, labels_df, on=commoncolumn)
  X = merged_df.drop([commoncolumn, labelID], axis=1)
  y = merged_df[labelID]
  gb_classifier = GradientBoostingClassifier(n_estimators=n_estimator, random_state=random_state)

  gb_classifier.fit(X, y)

  feature_importances_gb = gb_classifier.feature_importances_

  gene_importances_gb = pd.DataFrame({
      'Feature': X.columns,
      'Importance': feature_importances_gb
  })

  sorted_gene_importances_gb = gene_importances_gb.sort_values(by='Importance', ascending=False)
  sorted_gene_importances_gb = sorted_gene_importances_gb.head(nofeatures)
  sorted_gene_importances_gb = sorted_gene_importances_gb[sorted_gene_importances_gb['Importance'] > 0]
  return sorted_gene_importances_gb


In [29]:
gb_Expression = gradient_boosting(Processed_Gene_Expression_Data, Label_Gene_Expression_Data, commoncolumn='Sample', labelID ='Condition')

In [30]:
gb_Expression

Unnamed: 0,Feature,Importance
5628,PDS5B,1.861471e-01
5081,CHID1,1.508371e-01
9619,RASGRP2,1.230047e-01
1343,COQ9,9.957825e-02
3245,PRPF4B,8.105676e-02
...,...,...
9021,LINC01018,1.766732e-09
9645,U2AF1L4,1.184275e-09
7282,SEMA3F,9.696020e-10
6081,MPP3,5.321284e-10


## **FDR**

In [31]:
def fdr(df, labels_df, commoncolumn='Sample', n_feature=1000, labelID ='Condition'):
  merged_df = pd.merge(df, labels_df, on=commoncolumn)
  X = merged_df.drop([commoncolumn, labelID], axis=1)
  y = merged_df[labelID]

  f_scores, _ = f_classif(X, y)

  fdr_df = pd.DataFrame({'Feature': X.columns, 'Fisher_Score': f_scores})

  fdr_df = fdr_df.sort_values(by='Fisher_Score', ascending=False)

  fdr_df = fdr_df.head(n_feature)

  return fdr_df

In [32]:
fdr_Expression = fdr(Processed_Gene_Expression_Data, Label_Gene_Expression_Data, commoncolumn='Sample', labelID ='Condition')

In [33]:
fdr_Expression

Unnamed: 0,Feature,Fisher_Score
3548,PLEKHA7,69.290674
9709,BAIAP2,62.705951
2981,LGR4,53.959514
2519,AMFR,45.628166
10143,BBS12,42.899262
...,...,...
11292,ACSL6,4.408025
12153,ZBED6CL,4.407911
1385,SPAG9,4.406700
2378,PPARGC1B,4.405839


## **Intersection Selection Methods**

In [35]:

'''
random forest --> rf_Expression
recursive feature elimination --> rfe_Expression
Select from model --> sfm_Expression
correlation --> Expression_correlation
gradient boosting --> gb_Expression
fdr --> fdr_Expression
'''

'\nrandom forest --> rf_Expression\nrecursive feature elimination --> rfe_Expression\nSelect from model --> sfm_Expression\ncorrelation --> Expression_correlation\ngradient boosting --> gb_Expression\nfdr --> fdr_Expression\n'

In [39]:
Merged_Expression = get_merged_df(rf_Expression, fdr_Expression, Expression_correlation, sfm_Expression, gb_Expression, rfe_Expression)

In [None]:
Best_Expression_Genes = Best_Combinations(Merged_Expression)

In [42]:
Best_Expression_Genes

['NET1',
 'KIAA1143',
 'GPT',
 'LSM14A',
 'RAPGEF5',
 'MLLT4',
 'HYAL2',
 'ENO1',
 'NPPA',
 'PTGDS',
 'SEMA5A',
 'COQ2',
 'PROSC',
 'ENTPD6',
 'FAM65B',
 'ZMYM4',
 'SPIN1',
 'PLEKHA7',
 'ADAM10',
 'NES',
 'KCNE1',
 'HADH',
 'RGN',
 'RNF103',
 'SEMA6D',
 'NR3C1',
 'MPP3',
 'COQ9',
 'PRPF4B',
 'AKT1',
 'PROX1',
 'PDE8B',
 'ANKRD33B',
 'RHOBTB1',
 'FNDC3B',
 'CANX',
 'SAE1',
 'HNRNPR',
 'MAP2',
 'GAB1',
 'KCNK1',
 'ARPC1A',
 'PRICKLE1',
 'ARHGAP5',
 'KLHL24',
 'SLC25A23',
 'OAZ1',
 'NAV2',
 'GJA3',
 'SEPT4',
 'PROS1',
 'STT3B',
 'ANTXR2',
 'PERM1',
 'MAP1B',
 'PDGFC',
 'IPO8',
 'MEF2C',
 'PHLDA1',
 'SNX1',
 'EBLN3',
 'BOD1L1',
 'MYBPC3',
 'ARHGAP24',
 'TMEM248',
 'HIST1H1C',
 'ST3GAL3',
 'PLIN2',
 'MMRN2',
 'IMPDH2',
 'SREBF2',
 'FGF2',
 'AKR1B1',
 'FAM198B',
 'SLC2A1',
 'MCCC2',
 'ITGA5',
 'AGPAT1',
 'GOT1',
 'NOTCH4',
 'SSPN',
 'FAM110B',
 'DYRK2',
 'TOB2',
 'ELMOD2',
 'CST3',
 'TNIK',
 'UBE2K',
 'LRRFIP1',
 'CPTP',
 'FAM180A',
 'SLITRK4',
 'HK2',
 'TSC22D3',
 'PON2',
 'DAAM1',
 'MAN2C1

In [36]:
cuffdiff=["CYP4B1", "HOOK1", "SGIP1"    , "CYR61",     "CA14"   ,   "LGR6",      "KCNK1" ,"NPPA" , "NPPB" ,"FNDC5"  ,   "SLC2A1" ,   "DAB1" ,     "GSTM3"   ,  "GJA5"   ,   "NES"      , "ITLN1"   ,"DPT"      , "F5"        ,"GAS5"   ,   "TNNI1"    , "WNT9A"   ,"SRGN"  ,    "PLCE1"   ,"ANKRD2",    "DPYSL4"    ,"CH25H"    ,"BCO2"   ,   "CDKN1C"    ,"SOX6"      ,"PLEKHA7" ,"CHRDL2" ,   "LRRC32"    ,"APOLD1"   , "NXPH4"  ,   "ADGRD1"  ,  "PTPRB"     ,"PHLDA1" ,   "MCF2L" ,"ENOX1" ,"DLK1"      ,"ASPG"   , "THBS1"     ,"BLM"    ,   "GATM" ,"ACSM5"     ,"MT2A"  ,    "SSTR5-AS1", "SLC6A10P"  ,"SMTNL2"   , "ENO3"   ,   "MYL4"    ,"MPP3"      "TMEM100"  , "DOK6"   ,   "CFD"       ,"JUNB"    ,  "KLF2"    ,  "KCNK6"  ,   "AURKC"   ,"TNNT1"   , "FAM179A" ,  "HK2"      , "THNSL2"   ,"INHA"     , "ANKRD23" , "SNORD89" ,  "NR4A2"     ,"GRB14"    , "MSTN"    ,  "STAT4"     ,"EPHA4"    ,       "NOP56"   ,"SPTLC3"   ,"MAP3K7CL"  ,"EVA1C"   ,  "SIK1",      "IGLL5"    , "CECR1"   ,  "CLDN5" ,"APOL4"     ,"SLC6A6"   , "LINC00881" ,"HES1"    ,  "UBA7" ,     "SEMA3G"   , "FAM107A",   "LSAMP"   ,"P3H2"    ,  "APOD"    ,  "SHISA3" ,   "RASL11B" ,  "ANXA3" ,    "ARHGAP24",  "SPP1"     , "NPNT"   ,"PPP2R2C" ,  "GABRA4"    ,"UGT2B4"  ,  "JCHAIN"  ,  "CXCL2"  ,   "COQ2"      ,"DKK2"     , "ETNPPL"   , "HPGD"       ,  "LINC01018", "ANKRD33B" , "MAP1B",   "FGF18",     "C6"        ,"EMB"    ,   "HRAT56"  ,  "ZNF608"   , "ADAM19"  ,  "DUSP1"  ,   "PHACTR1" ,"GLP1R"  ,   "FHL5"     , "GRIK2" ,    "FAM65B"  ,  "CPNE5"     ,"CRIP3"  ,      "AMPH"    ,  "PSPH"    ,  "AZGP1"   ,  "GRM8"   ,   "AKR1B1",    "FAM180A" , "SOX17",     "SFRP1"    ,  "AQP7P3"  ,  "GADD45G" ,  "PLIN2",     "BANCR",     "KLF4"    , "EPB41L4B"  "FAM155B"   ,"TNMD"      ,"ATP1B4"  ,  "GRIA3"   ,  "MID1",      "ACE2"      ,"SLITRK4"]

In [38]:
limmatrend = [ "PLEKHA7"  ,"LGR4"    , "BAIAP2" ,  "AMFR"     ,"PLIN2"   , "TNIK"    , "HDAC9"  ,  "SSPN"    , "TM9SF3"  , "ARHGAP24","BBS12"   , "AGPAT1"  , "EZR"   ]

In [46]:
# Change the lists to get the intersections you want...
intersection = list(set(cuffdiff).intersection(Best_Expression_Genes))
print("Intersection of list1 and list2:", intersection)

Intersection of list1 and list2: ['SLC2A1', 'KCNK1', 'NPPA', 'COQ2', 'GABRA4', 'FAM65B', 'MAP1B', 'PLEKHA7', 'PHLDA1', 'NES', 'GSTM3', 'ADGRD1', 'ARHGAP24', 'FAM180A', 'PLIN2', 'SLITRK4', 'HK2', 'ANKRD33B', 'AKR1B1', 'PTPRB', 'FAM107A']


In [None]:
# Only to save these best genes

best_genes_expression = pd.DataFrame(Best_Expression_Genes, columns=['Gene_Name'])
best_genes_expression.to_csv('Best Expression Genes.tsv', sep='\t', index=False)