In [61]:
import plotly.express as px

import numpy as np 
import pandas as pd
import seaborn as sns

from sklearn import datasets
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.utils import check_random_state
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from scipy.stats import fisher_exact


from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
import sklearn.cluster as cluster
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
from hdbscan import HDBSCAN
from IPython.display import Image
import pydotplus
from six import StringIO
from imblearn.under_sampling import RandomUnderSampler
from random import sample
from random import seed

import matplotlib as mpl
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

from umap import UMAP

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seed Python's built-in `random` module (its `sample` function is used further down).
# NOTE(review): this does not seed NumPy or scikit-learn; estimators and splits that
# must be reproducible have to receive their own `random_state`.
seed(24)


## 1. Load PSD expression data

In [62]:
# PSD-gene expression table; the filtering cell below reads its 'Row.names' and
# 'subclass_label' columns.
rna = pd.read_csv(filepath_or_buffer="geneexpression_labelled_region_ALLENcounts.csv")


In [63]:
# Read the whole-transcriptome table in 1024-row chunks and concatenate the chunks
# into a single frame with a fresh 0..n-1 index.
tfr = pd.read_csv("Allen_labelledcounts_ALLGENES.csv", iterator=True, chunksize=1024)
rna2 = pd.concat(tfr, ignore_index=True)

# Move the two metadata columns out of the frame so that only gene columns remain.
samples2 = rna2.pop('sample_name')
labels2 = rna2.pop('subclass_label')


In [64]:
# Only 20% of the DG cells were selected in the whole-transcriptome dataframe because of
# memory issues, so the PSD-gene dataframe is restricted to those same cells.
#
# The previous bare `rna.reset_index()` returned a new frame that was thrown away, so the
# index was never reset. Assign the result, and use drop=True so the old index is not
# inserted as an extra "index" column (which would later be treated as a gene feature).
rna = rna[rna['Row.names'].isin(samples2)].reset_index(drop=True)

# Separate the metadata columns from the expression matrix; after this cell `rna`
# holds gene columns only.
samples1 = rna.pop('Row.names')
labels1 = rna.pop('subclass_label')


### Exploratory analysis

In [65]:
# Sorted class names taken from the PSD labels; the same list is used for both reports.
label = sorted(set(labels1))

# (title, expression frame, label series) for each dataset to describe.
summaries = (
    ("PSD genes only", rna, labels1),
    ("All transcriptome", rna2, labels2),
)

for position, (title, expr, labs) in enumerate(summaries):
    if position:
        # Visual separator between the two reports.
        print()
        print("-------------------------------------")
        print()

    print(title)
    print()
    print("Number of genes: ", len(expr.columns))
    print("Number of cells: ", len(expr))
    # This is the number of label entries (one per cell), not the number of distinct classes.
    print("Number of labels: ", len(labs))
    print()
    print("Occurrences of classes")

    per_class = labs.value_counts()
    for lab in label:
        print(lab + " : " + str(per_class.get(lab, 0)))
    


PSD genes only

Number of genes:  4074
Number of cells:  41506
Number of labels:  41506

Occurrences of classes
CA1 : 12767
CA1-ProS : 3355
CA2 : 143
CA3 : 1899
CT SUB : 5414
DG : 11664
NP SUB : 1885
SUB-ProS : 4379

-------------------------------------

All transcriptome

Number of genes:  31053
Number of cells:  41506
Number of labels:  41506

Occurrences of classes
CA1 : 12767
CA1-ProS : 3355
CA2 : 143
CA3 : 1899
CT SUB : 5414
DG : 11664
NP SUB : 1885
SUB-ProS : 4379


# Random Forest models

First, we use GridSearch on PSD dataset to look for the best parameters

In [None]:
# Hyper-parameter grid explored for the PSD-gene random forest.
max_depth_l = list(range(5, 13))
n_estimators_l = [50, 100, 200]
param_d = dict(max_depth=max_depth_l, n_estimators=n_estimators_l)

X_train, X_test, y_train, y_test = train_test_split(rna, labels1, random_state=24, train_size=0.8)

# Seed the forest: an unseeded estimator makes the cross-validated scores (and possibly
# the selected parameters) change from run to run.
model_psd = RandomForestClassifier(random_state=24)
grid_search = GridSearchCV(model_psd, param_d, cv=4)
grid_fit = grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

# Build the (max_depth x n_estimators) score grid from the parameter columns of
# cv_results_ instead of walking the rows with a manual counter, so the layout does
# not depend on the order in which the grid was enumerated.
score_pd = pd.DataFrame(grid_search.cv_results_)
scores_2d = (
    score_pd
    .assign(
        max_depth=score_pd["param_max_depth"].astype(int),
        n_estimators=score_pd["param_n_estimators"].astype(int),
    )
    .pivot(index="max_depth", columns="n_estimators", values="mean_test_score")
    .reindex(index=max_depth_l, columns=n_estimators_l)
)

# Tick labels come from the frame's index/columns, so they always match the data.
fig, ax = plt.subplots(figsize=(13, 10))
sns.heatmap(scores_2d, ax=ax, annot=True)
ax.set_xlabel("N estimators")
ax.set_ylabel("Max Depth")
ax.set_title("Mean cross-validated accuracy (4-fold)")
fig.savefig('../MODELRF_params.svg')
plt.show()

In [None]:
# Re-save the grid-search heatmap. `fig` is the figure created in the previous cell,
# which already writes this same file.
fig.savefig('../MODELRF_params.svg')

Best parameters: n_estimators = 200, max_depth = 12

### Random Forest model (PSD genes) with best parameters

In [None]:
# Refit the PSD-gene forest with the parameters reported as best above,
# then predict both partitions of the split.
model_psd = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_psd.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_psd.predict(part) for part in (X_train, X_test))



In [None]:
# Overall accuracy on each partition.
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)

# Confusion matrices with rows/columns ordered by the sorted class list `label`.
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train)
print("Test Set\nAccuracy: ", accuracy_test)
print(conf_mat_test)

In [None]:
# Attach class names to the row-normalised test confusion matrix. A dedicated name is
# used so that the scalar `accuracy_test` computed in the metrics cell is not replaced
# by a DataFrame.
conf_norm_test_df = pd.DataFrame(confusion_matrix_norm, index=label, columns=label)

fig, ax = plt.subplots(figsize=(13, 10))

# Passing the labelled frame lets seaborn take the tick labels from the data itself,
# so they cannot drift out of step with the matrix rows/columns.
sns.heatmap(
    conf_norm_test_df,
    ax=ax,
    annot=True,
    vmin=0,
    vmax=1,
    fmt='.4f',
    cmap="Oranges",
    annot_kws={"fontsize": 14},
)
ax.set_xlabel("Predicted Class")
ax.set_ylabel("True Class")
ax.set_title("Random forest (PSD genes): row-normalised test confusion matrix")
fig.savefig('../MODELRFPSD_confussion_matrix_test.svg')
plt.show()

### Analysis of feature importances

In [None]:
# Forest-level importances plus their spread across the individual trees.
importances = model_psd.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in model_psd.estimators_]
std = np.std(per_tree_importances, axis=0)
forest_importances = pd.Series(importances, index=X_train.columns)

# Rank the genes once (highest importance first) and reuse the ranking for the
# preview, the CSV export and the later cumulative plot.
psdmodel_importances = forest_importances.sort_values(ascending=False)
print(psdmodel_importances.iloc[:20])
psdmodel_importances.to_csv("../feats_importances_PSDGENES.csv")


### Graph of cumulative feature importances


In [None]:
# Number of top-ranked genes to accumulate; clamped below to the number available.
totaln = 1000

# `psdmodel_importances` is indexed by gene name, so integer `[]` lookups rely on a
# deprecated positional fallback. Select by position explicitly instead.
top_importances = psdmodel_importances.iloc[:totaln]
totaln = len(top_importances)

# Running total of the importances, highest-ranked gene first.
psdmodel_cumsum = top_importances.cumsum().tolist()
psdmodel_cumsum_importances = psdmodel_cumsum[-1] if psdmodel_cumsum else 0

fig, ax = plt.subplots(nrows=1, ncols=1)  # create figure & 1 axis
ax.plot(list(range(1, totaln + 1)), psdmodel_cumsum, color="b", label="Total")
ax.set_xlabel("Number of genes")
ax.set_ylabel("Feature importance")
ax.set_title("Cumulative feature importance (PSD genes)")
plt.show()

The curve starts to saturate at around 500 genes.

## Random Forest model with whole transcriptome

We will use the same parameters as before

In [None]:
# 80/20 split of the whole-transcriptome matrix and its labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    rna2,
    labels2,
    train_size=0.8,
    random_state=24,
)


In [None]:
# Whole-transcriptome forest with the same hyper-parameters as the PSD model,
# followed by predictions on both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))



In [None]:
# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

In [None]:
# Attach class names to the row-normalised test confusion matrix. A dedicated name is
# used so that the scalar `accuracy_test` computed in the metrics cell is not replaced
# by a DataFrame.
conf_norm_test_df = pd.DataFrame(confusion_matrix_norm, index=label, columns=label)

fig, ax = plt.subplots(figsize=(13, 10))

# Passing the labelled frame lets seaborn take the tick labels from the data itself,
# so they cannot drift out of step with the matrix rows/columns.
sns.heatmap(
    conf_norm_test_df,
    ax=ax,
    annot=True,
    vmin=0,
    vmax=1,
    fmt='.4f',
    cmap="Oranges",
    annot_kws={"fontsize": 14},
)
ax.set_xlabel("Predicted Class")
ax.set_ylabel("True Class")
ax.set_title("Random forest (whole transcriptome): row-normalised test confusion matrix")

fig.savefig('../MODELRF_confussion_matrix_test_wholetransc.svg')
plt.show()

In [None]:
# Forest-level importances plus their spread across the individual trees.
importances = model_allgenes.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in model_allgenes.estimators_]
std = np.std(per_tree_importances, axis=0)
forest_importances = pd.Series(importances, index=X_train.columns)

# Export the genes ranked from most to least important.
ranked_importances = forest_importances.sort_values(ascending=False)
ranked_importances.to_csv("../feats_moreimportancesRF_ALLGENES.csv")

## Load feature importances (PSD and ALL TRANSCRIPTOME)

In [66]:
# Importance tables exported by the random-forest cells: column 'Unnamed: 0' holds the
# gene name and column '0' its importance.
feature_importances_psd = pd.read_csv("../feats_importances_PSDGENES.csv")
# NOTE(review): the whole-transcriptome cell above writes
# "../feats_moreimportancesRF_ALLGENES.csv", not the file read here; confirm that this
# file holds the same ranking.
feature_importances_alltc = pd.read_csv("../feats_importances_ALLGENES.csv")

# Non-PSD genes are the genes of the whole transcriptome that are NOT in the PSD list.
# The operands were previously swapped (PSD minus ALL), which yields the PSD genes that
# are missing from the transcriptome table and leaves the non-PSD table below empty
# whenever every PSD gene is present there.
psd_gene_names = set(feature_importances_psd['Unnamed: 0'].values)
non_PSD_genes = list(set(feature_importances_alltc['Unnamed: 0'].values) - psd_gene_names)

# Whole-transcriptome importances restricted to non-PSD genes, highest first.
feature_importances_nonPSD = (
    feature_importances_alltc[feature_importances_alltc['Unnamed: 0'].isin(non_PSD_genes)]
    .sort_values(by="0", ascending=False)
    .reset_index(drop=True)
)

importances_allgenes_psd = feature_importances_psd["0"]
importances_allgenes_alltc = feature_importances_alltc["0"]


### Random forest model with non-PSD genes 

In [None]:
# Select every whole-transcriptome column that is not a PSD gene. The mask is derived
# directly from the two expression frames rather than from the `non_PSD_genes` list,
# so this cell does not depend on how that list was built in the loading cell.
nonpsd_rand = rna2.loc[:, ~rna2.columns.isin(rna.columns)]

# Fail early with a clear message instead of fitting a model on zero features.
if nonpsd_rand.shape[1] == 0:
    raise ValueError("No non-PSD genes found: every column of rna2 is also a column of rna.")

X_train, X_test, y_train, y_test = train_test_split(
    nonpsd_rand, labels2, random_state=24, train_size=0.8
)


In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)


## Random Forest models with subsets of genes

### Random Forest model with the 500 most important PSD genes

In [None]:
# Gene names in the first 500 rows of the PSD importance table
# (assumes the table is ordered by descending importance - TODO confirm).
gene_subset = feature_importances_psd["Unnamed: 0"].iloc[:500]
psd_importants = rna.loc[:, rna.columns.isin(gene_subset)]

X_train, X_test, y_train, y_test = train_test_split(
    psd_importants,
    labels1,
    train_size=0.8,
    random_state=24,
)

In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

### PSD 1000 + important

In [None]:
# Gene names in the first 1000 rows of the PSD importance table
# (assumes the table is ordered by descending importance - TODO confirm).
gene_subset = feature_importances_psd["Unnamed: 0"].iloc[:1000]
psd_importants = rna.loc[:, rna.columns.isin(gene_subset)]

X_train, X_test, y_train, y_test = train_test_split(
    psd_importants,
    labels1,
    train_size=0.8,
    random_state=24,
)

In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

### NO-PSD RANDOM 4074

In [None]:
# Whole-transcriptome genes that are not PSD genes. Sorting gives the list a fixed
# order; `list(set(...))` order depends on string hashing, which can differ between
# kernel sessions and would make the random draw below irreproducible.
non_PSD_genes = sorted(set(rna2.columns) - set(rna.columns))

# Re-seed immediately before sampling so the draw does not depend on how many random
# numbers earlier cells consumed.
seed(24)
# 4074 matches the number of PSD genes reported in the exploratory cell.
rand_nonpsd_genes = sample(non_PSD_genes, 4074)

nonpsd_rand = rna2.loc[:, rna2.columns.isin(rand_nonpsd_genes)]

X_train, X_test, y_train, y_test = train_test_split(
    nonpsd_rand, labels2, random_state=24, train_size=0.8
)


In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

### Bottom 1000 alltc

In [None]:
# Gene names in the last 1000 rows of the whole-transcriptome importance table
# (assumes the table is ordered by descending importance - TODO confirm).
gene_subset = feature_importances_alltc["Unnamed: 0"].iloc[-1000:]
bottom1000_genes = rna2.loc[:, rna2.columns.isin(gene_subset)]

X_train, X_test, y_train, y_test = train_test_split(
    bottom1000_genes,
    labels2,
    train_size=0.8,
    random_state=24,
)


In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

### Bottom 10000

In [None]:
# Gene names in the last 10000 rows of the whole-transcriptome importance table
# (assumes the table is ordered by descending importance - TODO confirm).
gene_subset = feature_importances_alltc["Unnamed: 0"].iloc[-10000:]
bottom10000_genes = rna2.loc[:, rna2.columns.isin(gene_subset)]

X_train, X_test, y_train, y_test = train_test_split(
    bottom10000_genes,
    labels2,
    train_size=0.8,
    random_state=24,
)


In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

### Bottom 20000

In [None]:
# Gene names in the last 20000 rows of the whole-transcriptome importance table
# (assumes the table is ordered by descending importance - TODO confirm).
gene_subset = feature_importances_alltc["Unnamed: 0"].iloc[-20000:]
bottom20000_genes = rna2.loc[:, rna2.columns.isin(gene_subset)]

X_train, X_test, y_train, y_test = train_test_split(
    bottom20000_genes,
    labels2,
    train_size=0.8,
    random_state=24,
)

In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

### Bottom 25000

In [None]:
# Gene names in the last 25000 rows of the whole-transcriptome importance table
# (assumes the table is ordered by descending importance - TODO confirm).
gene_subset = feature_importances_alltc["Unnamed: 0"].iloc[-25000:]
bottom25000_genes = rna2.loc[:, rna2.columns.isin(gene_subset)]

X_train, X_test, y_train, y_test = train_test_split(
    bottom25000_genes,
    labels2,
    train_size=0.8,
    random_state=24,
)

In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

### ALL TRANSCRIPTOME 1000 + IMPORTANT

In [None]:
# Gene names in the first 1000 rows of the whole-transcriptome importance table
# (assumes the table is ordered by descending importance - TODO confirm).
gene_subset = feature_importances_alltc["Unnamed: 0"].iloc[:1000]
alltc_importants = rna2.loc[:, rna2.columns.isin(gene_subset)]

X_train, X_test, y_train, y_test = train_test_split(
    alltc_importants,
    labels2,
    train_size=0.8,
    random_state=24,
)

In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

### ALL TRANSCRIPTOME 500 + IMPORTANTS

In [None]:
# Gene names in the first 500 rows of the whole-transcriptome importance table
# (assumes the table is ordered by descending importance - TODO confirm).
gene_subset = feature_importances_alltc["Unnamed: 0"].iloc[:500]
alltc_importants = rna2.loc[:, rna2.columns.isin(gene_subset)]

X_train, X_test, y_train, y_test = train_test_split(
    alltc_importants,
    labels2,
    train_size=0.8,
    random_state=24,
)

In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

### ALL TRANSCRIPTOME 100 + IMPORTANTS

In [None]:
# Gene names in the first 100 rows of the whole-transcriptome importance table
# (assumes the table is ordered by descending importance - TODO confirm).
gene_subset = feature_importances_alltc["Unnamed: 0"].iloc[:100]
alltc_importants = rna2.loc[:, rna2.columns.isin(gene_subset)]

X_train, X_test, y_train, y_test = train_test_split(
    alltc_importants,
    labels2,
    train_size=0.8,
    random_state=24,
)

In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

### Top 10 ALL TRANSCRIPTOME

In [None]:
# Gene names in the first 10 rows of the whole-transcriptome importance table
# (assumes the table is ordered by descending importance - TODO confirm).
gene_subset = feature_importances_alltc["Unnamed: 0"].iloc[:10]
alltc_importants = rna2.loc[:, rna2.columns.isin(gene_subset)]

X_train, X_test, y_train, y_test = train_test_split(
    alltc_importants,
    labels2,
    train_size=0.8,
    random_state=24,
)

In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

### PSD  TOP 100

In [None]:
# Gene names in the first 100 rows of the PSD importance table
# (assumes the table is ordered by descending importance - TODO confirm).
gene_subset = feature_importances_psd["Unnamed: 0"].iloc[:100]
psd_importants = rna.loc[:, rna.columns.isin(gene_subset)]

X_train, X_test, y_train, y_test = train_test_split(
    psd_importants,
    labels1,
    train_size=0.8,
    random_state=24,
)

In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

### PSD TOP 10

In [None]:
# Gene names in the first 10 rows of the PSD importance table
# (assumes the table is ordered by descending importance - TODO confirm).
gene_subset = feature_importances_psd["Unnamed: 0"].iloc[:10]
psd_importants = rna.loc[:, rna.columns.isin(gene_subset)]

X_train, X_test, y_train, y_test = train_test_split(
    psd_importants,
    labels1,
    train_size=0.8,
    random_state=24,
)

In [None]:
# Train a forest with the hyper-parameters chosen above on the current split,
# then predict both partitions.
model_allgenes = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=24,
)
model_allgenes.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_allgenes.predict(part) for part in (X_train, X_test))

# Sorted class names fix the row/column order of both confusion matrices.
label = sorted(set(y_train))

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_mat_train = confusion_matrix(y_train, y_pred_train, labels=label)
conf_mat_test = confusion_matrix(y_test, y_pred_test, labels=label)

# Divide every row (true class) of the test matrix by its total.
row_totals = conf_mat_test.sum(axis=1, keepdims=True)
confusion_matrix_norm = conf_mat_test.astype(float) / row_totals

# Raw counts are shown for the training set, row-normalised values for the test set.
print("Train Set\nAccuracy: ", accuracy_train)
print(conf_mat_train, end="\n\n")
print("Test Set\nAccuracy: ", accuracy_test)
print(confusion_matrix_norm)

## PSD genes vs non-PSD genes

### Top 1000 PSD vs no-PSD

In [None]:
# First 1000 gene names of the whole-transcriptome importance table.
importants_1000 = feature_importances_alltc['Unnamed: 0'][0:1000]

# Count how many PSD columns appear among them; the remainder of the 1000 is non-PSD.
n_psd_in_top = rna.columns.isin(importants_1000).sum()
y = np.array([n_psd_in_top, 1000 - n_psd_in_top])
mylabels = ["PSD", "Non_PSD"]

fig, ax = plt.subplots(nrows=1, ncols=1)
ax.pie(y, labels=mylabels)
# Unlike the other figures, this one is written to the current directory.
fig.savefig('PSDvsNonPSD_pie.svg')
plt.show()
plt.close(fig)

In [None]:
# First 1000 gene names of the whole-transcriptome importance table.
topgenes = feature_importances_alltc["Unnamed: 0"][:1000]

# Reference PSD proteome, read as a single-column table.
psd_prots = pd.read_csv("../dani/Arxius/scrna/psd_proteins.txt")
psd_prots_l = list(psd_prots["Reference Proteome Distler PSDII & Reig-Viader (4301)"].values)

# NOTE(review): 4116 is hard-coded and differs from the 4074 PSD genes reported in the
# exploratory cell; confirm which count is intended.
n_psd_genes_present = 4116
total_genes = len(feature_importances_alltc)
no_psd_genes = total_genes - n_psd_genes_present

# Number of top genes that appear in the reference PSD proteome, and the complement.
n_psdprots_important = sum(1 for val in topgenes if val in psd_prots_l)
n_nopsdgenes_important = len(topgenes) - n_psdprots_important
n_psdprots_important

In [None]:
# Number of top-ranked genes to accumulate, clamped to the table length so a shorter
# table does not raise a KeyError.
totaln = min(10000, len(feature_importances_alltc))

top_rows = feature_importances_alltc.iloc[:totaln]
top_scores = top_rows["0"].to_numpy(dtype=float)

# Vectorised membership test against the PSD proteome; the previous per-row
# `name in list` check rescanned the whole protein list for every gene.
is_psd = top_rows["Unnamed: 0"].isin(psd_prots_l).to_numpy()

# Running totals: all genes, PSD genes only, non-PSD genes only.
l_cumsum = np.cumsum(top_scores).tolist()
lpsd_cumsum = np.cumsum(np.where(is_psd, top_scores, 0.0)).tolist()
lnopsd_cumsum = np.cumsum(np.where(is_psd, 0.0, top_scores)).tolist()

# Final totals, kept under the names the loop-based version exposed.
cumsum_importances = l_cumsum[-1] if l_cumsum else 0
cumsum_psdimportances = lpsd_cumsum[-1] if lpsd_cumsum else 0
cumsum_nopsdimportances = lnopsd_cumsum[-1] if lnopsd_cumsum else 0

fig, ax = plt.subplots(nrows=1, ncols=1)  # create figure & 1 axis
ax.plot(list(range(1, totaln + 1)), l_cumsum, color="b", label="Total")
ax.set_xlabel("Number of genes")
ax.set_ylabel("Feature importance")
ax.set_title("Cumulative feature importance (whole transcriptome)")
# NOTE(review): the file name says "only2000genes" but `totaln` genes are plotted.
fig.savefig('../feats_importancesRF_ALLGENES_only2000genes.svg')   # save the figure to file
plt.show()

This indicates that the optimal number of genes for defining the different regions is about 1000.

In [None]:
# Split point between the "top" genes and the rest. The printed messages used to say
# "750 genes (1%)" while the code sliced at 30000; the messages are now derived from the
# cutoff that is actually applied.
n_top = 30000
n_total = len(importances_allgenes_alltc)
pct_top = 100 * min(n_top, n_total) / n_total if n_total else 0.0

print(f"suma importancias primeros {n_top} genes ({pct_top:.1f}%)")
print(sum(importances_allgenes_alltc[0:n_top]))
print(f"suma importancias resto de genes ({100 - pct_top:.1f}%)")
print(sum(importances_allgenes_alltc[n_top:]))


# Chi Squared tests

### Chi Squared PSD vs noPSD

In [56]:
# 2x2 contingency table: PSD membership (rows) against membership in the first 1000
# rows of the whole-transcriptome importance table (columns).
psd_genes = feature_importances_psd["Unnamed: 0"]
top_1000 = feature_importances_alltc["Unnamed: 0"][0:1000]

num_psdgenes_present = len(psd_genes)
num_psdgenes_important = int(psd_genes.isin(top_1000).sum())
num_psdgenes_not_important = num_psdgenes_present - num_psdgenes_important

rest_important = 1000 - num_psdgenes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_psdgenes_not_important)

table = [
    [num_psdgenes_important, num_psdgenes_not_important],
    [rest_important, rest_not_important],
]
print(table)

# Chi-squared test of independence on the table.
stat, p, dof, expected = chi2_contingency(table)
print(f'dof={dof:d}')
print(expected)

# Compare the statistic with the critical value at the chosen confidence level.
prob = 0.95
critical = chi2.ppf(prob, dof)
print(f'probability={prob:.3f}, critical={critical:.3f}, stat={stat:.3f}')

# Compare the p-value with the significance level.
alpha = 1.0 - prob
print(f'significance={alpha:.3f}, p={p:.40f}')
verdict = 'Dependent (reject H0)' if p <= alpha else 'Independent (fail to reject H0)'
print(verdict)
    
    
#final_table= [table[0][0],expected[0][0], ]

[[518, 3556], [482, 26497]]
dof=1
[[  131.19505362  3942.80494638]
 [  868.80494638 26110.19505362]]
probability=0.950, critical=3.841, stat=1352.809
significance=0.050, p=0.0000000000000000000000000000000000000000
Dependent (reject H0)


In [53]:
# One-sided Fisher exact test on the same 2x2 layout: PSD genes vs. the rest,
# split by membership in the first 1000 rows of feature_importances_alltc.
# NOTE(review): assumes all PSD genes also appear in feature_importances_alltc — confirm.
psd_genes = feature_importances_psd["Unnamed: 0"]
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_psdgenes_present = len(psd_genes)
num_psdgenes_important = int(psd_genes.isin(ranked_genes[0:1000]).sum())
num_psdgenes_not_important = num_psdgenes_present - num_psdgenes_important

rest_important = 1000 - num_psdgenes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_psdgenes_not_important)

table = [
    [num_psdgenes_important, num_psdgenes_not_important],
    [rest_important, rest_not_important],
]
print(table)

stat, p = fisher_exact(table, alternative="greater")
print(stat, p)


[[518, 3556], [482, 26497]]
8.00788218381416 1.3022186923268363e-197


### Genes PSD enriched and MAGUK 

In [None]:
# Semicolon-separated table; its "PSD_Enriched" and "MAGUK COMPLEX" columns are read below.
customset_prots = pd.read_csv(
    "../DATASETS_PSD_EN_MAGUK.csv",
    sep=";",
)


### PSD enriched: Chi square

In [None]:
# Chi-squared test: PSD-enriched genes vs. the rest, split by membership in the
# first 1000 rows of feature_importances_alltc.
# NOTE(review): the column is not de-duplicated here; repeated symbols would be
# counted more than once — confirm the column holds unique names.
psd_enriched = customset_prots["PSD_Enriched"].dropna()
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_psdenriched_present = int(psd_enriched.isin(ranked_genes).sum())
num_psdenriched_important = int(psd_enriched.isin(ranked_genes[0:1000]).sum())
num_psdenriched_not_important = num_psdenriched_present - num_psdenriched_important

rest_important = 1000 - num_psdenriched_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_psdenriched_not_important)

table = [
    [num_psdenriched_important, num_psdenriched_not_important],
    [rest_important, rest_not_important],
]
print(table)

stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print(expected)

# Compare the statistic with the critical value at the chosen probability.
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

# Compare the p-value with the significance level.
alpha = 1.0 - prob
print('significance=%.3f, p=%.23f' % (alpha, p))
print('Dependent (reject H0)' if p <= alpha else 'Independent (fail to reject H0)')

In [52]:
# One-sided Fisher exact test: PSD-enriched genes vs. the rest, split by
# membership in the first 1000 rows of feature_importances_alltc.
# NOTE(review): the column is not de-duplicated here — confirm names are unique.
psd_enriched = customset_prots["PSD_Enriched"].dropna()
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_psdenriched_present = int(psd_enriched.isin(ranked_genes).sum())
num_psdenriched_important = int(psd_enriched.isin(ranked_genes[0:1000]).sum())
num_psdenriched_not_important = num_psdenriched_present - num_psdenriched_important

rest_important = 1000 - num_psdenriched_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_psdenriched_not_important)

table = [
    [num_psdenriched_important, num_psdenriched_not_important],
    [rest_important, rest_not_important],
]
print(table)

stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

[[119, 628], [881, 29425]]
6.328894857465098 2.1593296570667785e-48


### MAGUK complex: Chi Square

In [None]:
# Chi-squared test: MAGUK-complex genes vs. the rest, split by membership in the
# first 1000 rows of feature_importances_alltc.
# NOTE(review): the column is not de-duplicated here — confirm names are unique.
maguk_prots = customset_prots["MAGUK COMPLEX"].dropna()
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_maguk_genes_present = int(maguk_prots.isin(ranked_genes).sum())
num_maguk_genes_important = int(maguk_prots.isin(ranked_genes[0:1000]).sum())
num_maguk_genes_not_important = num_maguk_genes_present - num_maguk_genes_important

rest_important = 1000 - num_maguk_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_maguk_genes_not_important)

table = [
    [num_maguk_genes_important, num_maguk_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)

stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print(expected)

# Compare the statistic with the critical value at the chosen probability.
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

# Compare the p-value with the significance level.
alpha = 1.0 - prob
print('significance=%.3f, p=%.23f' % (alpha, p))
print('Dependent (reject H0)' if p <= alpha else 'Independent (fail to reject H0)')

In [54]:
# One-sided Fisher exact test: MAGUK-complex genes vs. the rest, split by
# membership in the first 1000 rows of feature_importances_alltc.
# NOTE(review): the column is not de-duplicated here — confirm names are unique.
maguk_prots = customset_prots["MAGUK COMPLEX"].dropna()
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_maguk_genes_present = int(maguk_prots.isin(ranked_genes).sum())
num_maguk_genes_important = int(maguk_prots.isin(ranked_genes[0:1000]).sum())
num_maguk_genes_not_important = num_maguk_genes_present - num_maguk_genes_important

rest_important = 1000 - num_maguk_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_maguk_genes_not_important)

table = [
    [num_maguk_genes_important, num_maguk_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)

stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

[[59, 170], [941, 29883]]
11.021422766768769 3.273209330945839e-36


### non-coding DNA

In [57]:
# One-sided Fisher exact test: genes listed in non_coding_genes.txt vs. the rest,
# split by membership in the first 1000 rows of feature_importances_alltc.
noncoding_column = pd.read_csv('non_coding_genes.txt', sep=',', header=0)["x"]
NC_genes = pd.Series(list(set(noncoding_column)))  # distinct names only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_NC_genes_present = int(NC_genes.isin(ranked_genes).sum())
num_NC_genes_important = int(NC_genes.isin(ranked_genes[0:1000]).sum())
num_NC_genes_not_important = num_NC_genes_present - num_NC_genes_important

rest_important = 1000 - num_NC_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_NC_genes_not_important)

table = [
    [num_NC_genes_important, num_NC_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

[[28, 8500], [972, 21553]]
0.07304333091261196 1.0


In [59]:
# Chi-squared test: genes listed in non_coding_genes.txt vs. the rest, split by
# membership in the first 1000 rows of feature_importances_alltc.
noncoding_column = pd.read_csv('non_coding_genes.txt', sep=',', header=0)["x"]
NC_genes = pd.Series(list(set(noncoding_column)))  # distinct names only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_NC_genes_present = int(NC_genes.isin(ranked_genes).sum())
num_NC_genes_important = int(NC_genes.isin(ranked_genes[0:1000]).sum())
num_NC_genes_not_important = num_NC_genes_present - num_NC_genes_important

rest_important = 1000 - num_NC_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_NC_genes_not_important)

table = [
    [num_NC_genes_important, num_NC_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)

stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print(expected)

# Compare the statistic with the critical value at the chosen probability.
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

# Compare the p-value with the significance level.
alpha = 1.0 - prob
print('significance=%.3f, p=%.23f' % (alpha, p))
print('Dependent (reject H0)' if p <= alpha else 'Independent (fail to reject H0)')


[[28, 8500], [972, 21553]]
dof=1
[[  274.62725019  8253.37274981]
 [  725.37274981 21799.62725019]]
probability=0.950, critical=3.841, stat=314.217
significance=0.050, p=0.00000000000000000000000
Dependent (reject H0)


### Coding DNA

In [None]:
# Chi-squared test: genes listed in coding_genes.txt vs. the rest, split by
# membership in the first 1000 rows of feature_importances_alltc.
# The NC_* variable names are reused from the non-coding cells but hold the
# coding gene list here.
coding_column = pd.read_csv('coding_genes.txt', sep=',', header=0)["x"]
NC_genes = pd.Series(list(set(coding_column)))  # distinct names only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_NC_genes_present = int(NC_genes.isin(ranked_genes).sum())
num_NC_genes_important = int(NC_genes.isin(ranked_genes[0:1000]).sum())
num_NC_genes_not_important = num_NC_genes_present - num_NC_genes_important

rest_important = 1000 - num_NC_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_NC_genes_not_important)

table = [
    [num_NC_genes_important, num_NC_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)

stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print(expected)

# Compare the statistic with the critical value at the chosen probability.
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

# Compare the p-value with the significance level.
alpha = 1.0 - prob
print('significance=%.3f, p=%.23f' % (alpha, p))
print('Dependent (reject H0)' if p <= alpha else 'Independent (fail to reject H0)')


In [58]:
# One-sided Fisher exact test: genes listed in coding_genes.txt vs. the rest,
# split by membership in the first 1000 rows of feature_importances_alltc.
# The NC_* variable names are reused from the non-coding cells but hold the
# coding gene list here.
coding_column = pd.read_csv('coding_genes.txt', sep=',', header=0)["x"]
NC_genes = pd.Series(list(set(coding_column)))  # distinct names only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_NC_genes_present = int(NC_genes.isin(ranked_genes).sum())
num_NC_genes_important = int(NC_genes.isin(ranked_genes[0:1000]).sum())
num_NC_genes_not_important = num_NC_genes_present - num_NC_genes_important

rest_important = 1000 - num_NC_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_NC_genes_not_important)

table = [
    [num_NC_genes_important, num_NC_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

[[952, 19998], [48, 10055]]
9.972205553888722 6.535309273689526e-107


### SYNGO 

In [None]:
# Chi-squared test: SYNGO genes vs. the rest, split by membership in the first
# 1000 rows of feature_importances_alltc. Ranked gene names are upper-cased
# before matching against the hgnc_symbol column.
# The NC_* variable names are reused but hold the SYNGO gene list here.
syngo_column = pd.read_csv('SYNGO_GENES.txt', sep='\t', header=0)["hgnc_symbol"]
NC_genes = pd.Series(list(set(syngo_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_NC_genes_present = int(NC_genes.isin([gene.upper() for gene in ranked_genes]).sum())
num_NC_genes_important = int(NC_genes.isin([gene.upper() for gene in ranked_genes[0:1000]]).sum())
num_NC_genes_not_important = num_NC_genes_present - num_NC_genes_important

rest_important = 1000 - num_NC_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_NC_genes_not_important)

table = [
    [num_NC_genes_important, num_NC_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)

stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print(expected)

# Compare the statistic with the critical value at the chosen probability.
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

# Compare the p-value with the significance level.
alpha = 1.0 - prob
print('significance=%.3f, p=%.23f' % (alpha, p))
print('Dependent (reject H0)' if p <= alpha else 'Independent (fail to reject H0)')


In [55]:
# One-sided Fisher exact test: SYNGO genes vs. the rest, split by membership in
# the first 1000 rows of feature_importances_alltc. Ranked gene names are
# upper-cased before matching against the hgnc_symbol column.
# The NC_* variable names are reused but hold the SYNGO gene list here.
syngo_column = pd.read_csv('SYNGO_GENES.txt', sep='\t', header=0)["hgnc_symbol"]
NC_genes = pd.Series(list(set(syngo_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_NC_genes_present = int(NC_genes.isin([gene.upper() for gene in ranked_genes]).sum())
num_NC_genes_important = int(NC_genes.isin([gene.upper() for gene in ranked_genes[0:1000]]).sum())
num_NC_genes_not_important = num_NC_genes_present - num_NC_genes_important

rest_important = 1000 - num_NC_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_NC_genes_not_important)

table = [
    [num_NC_genes_important, num_NC_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

[[292, 913], [708, 29140]]
13.163408642273254 1.642174971725796e-177


## Chi-Squared for GO terms

### GO:0007155 Cell Adhesion

In [None]:
# One-sided Fisher exact test: genes listed in GO0007155.txt vs. the rest, split
# by membership in the first 1000 rows of feature_importances_alltc.
go_column = pd.read_csv('GO0007155.txt', sep='\t', header=0)["SYMBOL"]
GO_genes = pd.Series(list(set(go_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_GO_genes_present = int(GO_genes.isin(ranked_genes).sum())
num_GO_genes_important = int(GO_genes.isin(ranked_genes[0:1000]).sum())
num_GO_genes_not_important = num_GO_genes_present - num_GO_genes_important

rest_important = 1000 - num_GO_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_GO_genes_not_important)

table = [
    [num_GO_genes_important, num_GO_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

### GO:0007158 Neuron cell-cell adhesion



In [None]:
# One-sided Fisher exact test: genes listed in GO0007158.txt vs. the rest, split
# by membership in the first 1000 rows of feature_importances_alltc.
go_column = pd.read_csv('GO0007158.txt', sep='\t', header=0)["SYMBOL"]
GO_genes = pd.Series(list(set(go_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_GO_genes_present = int(GO_genes.isin(ranked_genes).sum())
num_GO_genes_important = int(GO_genes.isin(ranked_genes[0:1000]).sum())
num_GO_genes_not_important = num_GO_genes_present - num_GO_genes_important

rest_important = 1000 - num_GO_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_GO_genes_not_important)

table = [
    [num_GO_genes_important, num_GO_genes_not_important],
    [rest_important, rest_not_important],
]
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)


### GO:0045785 Positive regulation of cell adhesion

In [None]:
# One-sided Fisher exact test: genes listed in GO0045785.txt vs. the rest, split
# by membership in the first 1000 rows of feature_importances_alltc.
go_column = pd.read_csv('GO0045785.txt', sep='\t', header=0)["SYMBOL"]
GO_genes = pd.Series(list(set(go_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_GO_genes_present = int(GO_genes.isin(ranked_genes).sum())
num_GO_genes_important = int(GO_genes.isin(ranked_genes[0:1000]).sum())
num_GO_genes_not_important = num_GO_genes_present - num_GO_genes_important

rest_important = 1000 - num_GO_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_GO_genes_not_important)

table = [
    [num_GO_genes_important, num_GO_genes_not_important],
    [rest_important, rest_not_important],
]
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

### GO:0001764 Neuron Migration

In [None]:
# One-sided Fisher exact test: genes listed in GO0001764.txt vs. the rest, split
# by membership in the first 1000 rows of feature_importances_alltc.
go_column = pd.read_csv('GO0001764.txt', sep='\t', header=0)["SYMBOL"]
GO_genes = pd.Series(list(set(go_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_GO_genes_present = int(GO_genes.isin(ranked_genes).sum())
num_GO_genes_important = int(GO_genes.isin(ranked_genes[0:1000]).sum())
num_GO_genes_not_important = num_GO_genes_present - num_GO_genes_important

rest_important = 1000 - num_GO_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_GO_genes_not_important)

table = [
    [num_GO_genes_important, num_GO_genes_not_important],
    [rest_important, rest_not_important],
]
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

### GO:0021954 Central nervous system neuron development

In [None]:
# One-sided Fisher exact test: genes listed in GO0021954.txt vs. the rest, split
# by membership in the first 1000 rows of feature_importances_alltc.
go_column = pd.read_csv('GO0021954.txt', sep='\t', header=0)["SYMBOL"]
GO_genes = pd.Series(list(set(go_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_GO_genes_present = int(GO_genes.isin(ranked_genes).sum())
num_GO_genes_important = int(GO_genes.isin(ranked_genes[0:1000]).sum())
num_GO_genes_not_important = num_GO_genes_present - num_GO_genes_important

rest_important = 1000 - num_GO_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_GO_genes_not_important)

table = [
    [num_GO_genes_important, num_GO_genes_not_important],
    [rest_important, rest_not_important],
]
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

### GO:1990138 Neuron projection extension

In [None]:
# One-sided Fisher exact test: genes listed in GO1990138.txt vs. the rest, split
# by membership in the first 1000 rows of feature_importances_alltc.
go_column = pd.read_csv('GO1990138.txt', sep='\t', header=0)["SYMBOL"]
GO_genes = pd.Series(list(set(go_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_GO_genes_present = int(GO_genes.isin(ranked_genes).sum())
num_GO_genes_important = int(GO_genes.isin(ranked_genes[0:1000]).sum())
num_GO_genes_not_important = num_GO_genes_present - num_GO_genes_important

rest_important = 1000 - num_GO_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_GO_genes_not_important)

table = [
    [num_GO_genes_important, num_GO_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

### GO:0030154 Cell Differentiation

In [None]:
# One-sided Fisher exact test: genes listed in GO0030154.txt vs. the rest, split
# by membership in the first 1000 rows of feature_importances_alltc.
go_column = pd.read_csv('GO0030154.txt', sep='\t', header=0)["SYMBOL"]
GO_genes = pd.Series(list(set(go_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_GO_genes_present = int(GO_genes.isin(ranked_genes).sum())
num_GO_genes_important = int(GO_genes.isin(ranked_genes[0:1000]).sum())
num_GO_genes_not_important = num_GO_genes_present - num_GO_genes_important

rest_important = 1000 - num_GO_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_GO_genes_not_important)

table = [
    [num_GO_genes_important, num_GO_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

### GO:0008134 transcription factor binding

In [None]:
# One-sided Fisher exact test: genes listed in GO0008134.txt vs. the rest, split
# by membership in the first 1000 rows of feature_importances_alltc.
go_column = pd.read_csv('GO0008134.txt', sep='\t', header=0)["SYMBOL"]
GO_genes = pd.Series(list(set(go_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_GO_genes_present = int(GO_genes.isin(ranked_genes).sum())
num_GO_genes_important = int(GO_genes.isin(ranked_genes[0:1000]).sum())
num_GO_genes_not_important = num_GO_genes_present - num_GO_genes_important

rest_important = 1000 - num_GO_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_GO_genes_not_important)

table = [
    [num_GO_genes_important, num_GO_genes_not_important],
    [rest_important, rest_not_important],
]
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

### GO:0010467 gene expression

In [None]:
# One-sided Fisher exact test: genes listed in GO0010467.txt vs. the rest, split
# by membership in the first 1000 rows of feature_importances_alltc.
go_column = pd.read_csv('GO0010467.txt', sep='\t', header=0)["SYMBOL"]
GO_genes = pd.Series(list(set(go_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_GO_genes_present = int(GO_genes.isin(ranked_genes).sum())
num_GO_genes_important = int(GO_genes.isin(ranked_genes[0:1000]).sum())
num_GO_genes_not_important = num_GO_genes_present - num_GO_genes_important

rest_important = 1000 - num_GO_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_GO_genes_not_important)

table = [
    [num_GO_genes_important, num_GO_genes_not_important],
    [rest_important, rest_not_important],
]
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

### GO:0010468 regulation of gene expression

In [None]:
# One-sided Fisher exact test: genes listed in GO0010468.txt vs. the rest, split
# by membership in the first 1000 rows of feature_importances_alltc.
go_column = pd.read_csv('GO0010468.txt', sep='\t', header=0)["SYMBOL"]
GO_genes = pd.Series(list(set(go_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_GO_genes_present = int(GO_genes.isin(ranked_genes).sum())
num_GO_genes_important = int(GO_genes.isin(ranked_genes[0:1000]).sum())
num_GO_genes_not_important = num_GO_genes_present - num_GO_genes_important

rest_important = 1000 - num_GO_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_GO_genes_not_important)

table = [
    [num_GO_genes_important, num_GO_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

### GO:0061621 canonical glycolysis

In [None]:
# One-sided Fisher exact test: genes listed in GO0061621.txt vs. the rest, split
# by membership in the first 1000 rows of feature_importances_alltc.
go_column = pd.read_csv('GO0061621.txt', sep='\t', header=0)["SYMBOL"]
GO_genes = pd.Series(list(set(go_column)))  # distinct symbols only
ranked_genes = feature_importances_alltc["Unnamed: 0"]

num_GO_genes_present = int(GO_genes.isin(ranked_genes).sum())
num_GO_genes_important = int(GO_genes.isin(ranked_genes[0:1000]).sum())
num_GO_genes_not_important = num_GO_genes_present - num_GO_genes_important

rest_important = 1000 - num_GO_genes_important
rest_not_important = len(feature_importances_alltc) - (1000 + num_GO_genes_not_important)

table = [
    [num_GO_genes_important, num_GO_genes_not_important],
    [rest_important, rest_not_important],
]
print(table)
stat, p = fisher_exact(table, alternative="greater")
print(stat, p)

# UMAP representation

In [None]:
# Number of columns that exist in rna2 but not in rna.
columns_only_in_rna2 = set(rna2.columns) - set(rna.columns)
print(len(columns_only_in_rna2))

# Sorted distinct values of labels1; passed as hue_order to the plots below.
label = sorted(set(labels1))


### UMAP PSD genes

In [None]:
# Two-dimensional UMAP embedding of the full `rna` frame.
rna_umap = UMAP(random_state=24, n_neighbors=15, min_dist=0.1)
embedding_psd_a = pd.DataFrame(
    rna_umap.fit_transform(rna),
    columns=['UMAP1', 'UMAP2'],
)

# Scatter of the embedding, coloured by labels1 in the order given by `label`.
sns_plot = sns.scatterplot(
    data=embedding_psd_a,
    x='UMAP1',
    y='UMAP2',
    hue=labels1.values,
    hue_order=label,
    alpha=.1,
    linewidth=0,
    s=1,
)

# Place the legend outside the axes, to the right.
sns_plot.legend(loc='center left', bbox_to_anchor=(1, .5))
plt.show()
# Export the figure as SVG.
sns_plot.figure.savefig('umap_PSD_all.svg', bbox_inches='tight')

### Undersampling representation

In [None]:
# Randomly under-sample `rna` / `labels1`, then embed the resampled rows with UMAP.
rus = RandomUnderSampler(random_state=24)

X_resampled, y_resampled = rus.fit_resample(rna, labels1)

print(X_resampled)

rna_umap = UMAP(random_state=24, n_neighbors=15, min_dist= 0.01)

# Fit UMAP and extract latent vars 1-2
embedding_psd_a = pd.DataFrame(rna_umap.fit_transform(X_resampled), columns = ['UMAP1','UMAP2'])

# Integer code per point plus the class name each code stands for
# (codes are assigned in order of first appearance in y_resampled).
class_codes, class_names = pd.factorize(y_resampled.values)

fig, ax = plt.subplots()
embedding_psd_a.plot(kind='scatter', x='UMAP1', y='UMAP2', c=class_codes.astype(np.uint16), cmap='Accent_r', figsize=(16,10), ax=ax)

# The axes hold a single scatter collection, so `ax.legend(label)` would attach
# only one name to it. Build one legend handle per colour code instead
# (num=None -> one entry per unique code, in ascending order) and label the
# handles with the class names that factorize assigned to those codes.
scatter_points = ax.collections[0]
legend_handles, _ = scatter_points.legend_elements(num=None)
ax.legend(legend_handles, list(class_names))
plt.show()

### HDBSCAN and Kmeans for prediction from UMAP

In [None]:
# Cluster the current embedding_psd_a with HDBSCAN and plot the cluster labels.
# The fit is performed in this cell so that `preds` does not depend on a value
# left over from another cell.
dbscan = HDBSCAN(min_cluster_size=100, min_samples=100)

# Fit and predict
preds = dbscan.fit_predict(embedding_psd_a)

# Clustering metrics (disabled)
#silhouette_score(embedding_psd_a, preds)
#calinski_harabasz_score(embedding_psd_a, preds)

# Plot the results
#embedding_psd_a.plot(kind='scatter', x='UMAP1', y='UMAP2', c=rna['Rab6a'], cmap = 'magma',figsize=(16,10))
embedding_psd_a.plot(kind='scatter', x='UMAP1', y='UMAP2', c=preds, cmap='Accent_r', figsize=(16,10))

plt.show()


In [None]:
def centeroidnp(embed, labels, categories):
    """Return one centroid per category in the UMAP1/UMAP2 plane.

    Parameters
    ----------
    embed : DataFrame with 'UMAP1' and 'UMAP2' columns.
    labels : iterable of labels, matched to the rows of `embed` by position.
    categories : iterable of label values; one centroid is computed for each,
        in this order.

    Returns
    -------
    numpy array with one row per category holding the per-column mean of the
    rows whose label equals that category.
    """
    coords = embed[['UMAP1', 'UMAP2']].to_numpy()
    centroids = []
    for category in categories:
        # Positional row numbers whose label equals this category.
        member_rows = [pos for pos, value in enumerate(labels) if value == category]
        members = coords[member_rows]
        n_members, n_dims = members.shape
        column_means = [np.sum(members[:, axis]) / n_members for axis in range(n_dims)]
        centroids.append(np.array(column_means))
    return np.array(centroids)

In [None]:
# Seed KMeans with the per-label centroids of the embedding and plot the result.
# centeroidnp reads the 'UMAP1'/'UMAP2' columns, so it must receive the embedding
# (embedding_psd_a), not the resampled expression frame X_resampled.
# NOTE(review): assumes embedding_psd_a is the embedding fitted on X_resampled,
# so its rows line up with y_resampled — confirm cell execution order.
start_centroids = centeroidnp(embedding_psd_a, y_resampled, label)
# One cluster per starting centroid keeps n_clusters consistent with `init`.
centroids= KMeans(n_clusters=len(start_centroids), init=start_centroids, n_init=1)

preds= centroids.fit_predict(embedding_psd_a)
embedding_psd_a.plot(kind='scatter', x='UMAP1', y='UMAP2', c=preds, cmap='Accent_r', figsize=(16,10))
plt.show()

### UMAP for the most important PSD genes

In [None]:
# Keep the columns of `rna` named in the first 100 rows of feature_importances_psd.
top100_psd_genes = feature_importances_psd["Unnamed: 0"][0:100]
psd_importants = rna.loc[:, rna.columns.isin(top100_psd_genes)]

# Two-dimensional UMAP embedding of that subset.
rna_umap = UMAP(random_state=24, n_neighbors=15, min_dist=0.1)
embedding_psd_i = pd.DataFrame(
    rna_umap.fit_transform(psd_importants),
    columns=['UMAP1', 'UMAP2'],
)

# Scatter of the embedding, coloured by labels1 in the order given by `label`.
sns_plot = sns.scatterplot(
    data=embedding_psd_i,
    x='UMAP1',
    y='UMAP2',
    hue=labels1.values,
    hue_order=label,
    alpha=.1,
    linewidth=0,
    s=1,
)
# Place the legend outside the axes, to the right.
sns_plot.legend(loc='center left', bbox_to_anchor=(1, .5))
plt.show()
# Export the figure as SVG.
sns_plot.figure.savefig('umap_PSD_importants_100.svg', bbox_inches='tight')

In [None]:
# Cluster embedding_psd_i with HDBSCAN and plot the cluster labels.
dbscan = HDBSCAN(min_cluster_size=100, min_samples=100)

# Fit and predict
preds = dbscan.fit_predict(embedding_psd_i)

# Clustering metrics (disabled)
#silhouette_score(embedding_psd_a, preds)
#calinski_harabasz_score(embedding_psd_a, preds)

# Plot the results
embedding_psd_i.plot(
    kind='scatter',
    x='UMAP1',
    y='UMAP2',
    c=preds,
    cmap='Accent_r',
    figsize=(16, 10),
)
plt.show()


In [None]:
# Seed KMeans with per-label centroids of embedding_psd_i and plot the result.
# The third entry of `label` is left out of the seeds, giving 7 clusters.
# NOTE(review): the reason for skipping label[2] is not visible here — confirm.
seed_categories = label[:2] + label[3:]
start_centroids = centeroidnp(embedding_psd_i, labels1, seed_categories)
centroids = KMeans(n_clusters=7, init=start_centroids, n_init=1)

preds = centroids.fit_predict(embedding_psd_i)
embedding_psd_i.plot(
    kind='scatter',
    x='UMAP1',
    y='UMAP2',
    c=preds,
    cmap='Accent_r',
    figsize=(16, 10),
)
plt.show()

### UMAP random PSD genes

In [None]:
# Draw 100 column names of `rna` at random and keep those columns.
rand_psd_genes = sample(list(rna.columns), 100)
random100 = rna.loc[:, rna.columns.isin(rand_psd_genes)]

# Two-dimensional UMAP embedding of the random subset.
rna_umap = UMAP(random_state=24, n_neighbors=15, min_dist=0.1)
embedding_psd_r = pd.DataFrame(
    rna_umap.fit_transform(random100),
    columns=['UMAP1', 'UMAP2'],
)

# Scatter of the embedding, coloured by labels1 in the order given by `label`.
sns_plot = sns.scatterplot(
    data=embedding_psd_r,
    x='UMAP1',
    y='UMAP2',
    hue=labels1.values,
    hue_order=label,
    alpha=.1,
    linewidth=0,
    s=1,
)
# Place the legend outside the axes, to the right.
sns_plot.legend(loc='center left', bbox_to_anchor=(1, .5))
plt.show()
# Export the figure as SVG.
sns_plot.figure.savefig('umap_PSD_rand100.svg', bbox_inches='tight')

In [None]:
# Cluster embedding_psd_r with HDBSCAN and plot the cluster labels.
dbscan = HDBSCAN(min_cluster_size=100, min_samples=100)

# Fit and predict
preds = dbscan.fit_predict(embedding_psd_r)

# Clustering metrics (disabled)
#silhouette_score(embedding_psd_a, preds)
#calinski_harabasz_score(embedding_psd_a, preds)

# Plot the results
embedding_psd_r.plot(
    kind='scatter',
    x='UMAP1',
    y='UMAP2',
    c=preds,
    cmap='Accent_r',
    figsize=(16, 10),
)
plt.show()

In [None]:
# Seed KMeans with per-label centroids of embedding_psd_r and plot the result.
# The third entry of `label` is left out of the seeds, giving 7 clusters.
# NOTE(review): the reason for skipping label[2] is not visible here — confirm.
seed_categories = label[:2] + label[3:]
start_centroids = centeroidnp(embedding_psd_r, labels1, seed_categories)
centroids = KMeans(n_clusters=7, init=start_centroids, n_init=1)

preds = centroids.fit_predict(embedding_psd_r)
embedding_psd_r.plot(
    kind='scatter',
    x='UMAP1',
    y='UMAP2',
    c=preds,
    cmap='Accent_r',
    figsize=(16, 10),
)
plt.show()

## UMAPs for ALL transcriptome and non-PSD

### All genes from transcriptome

In [None]:
# UMAP on the rna2 columns named in the first 30000 rows of feature_importances_alltc.
rna_umap = UMAP(random_state=24, n_neighbors=15, min_dist=0.1)

top30000_genes = feature_importances_alltc["Unnamed: 0"][0:30000]
embedding_alltc_a = pd.DataFrame(
    rna_umap.fit_transform(rna2.loc[:, rna2.columns.isin(top30000_genes)]),
    columns=['UMAP1', 'UMAP2'],
)

# Scatter of the embedding, coloured by labels2 in the order given by `label`.
sns_plot = sns.scatterplot(
    data=embedding_alltc_a,
    x='UMAP1',
    y='UMAP2',
    hue=labels2.values,
    hue_order=label,
    alpha=.1,
    linewidth=0,
    s=1,
)
# Place the legend outside the axes, to the right.
sns_plot.legend(loc='center left', bbox_to_anchor=(1, .5))
plt.show()
# Export the figure as SVG.
sns_plot.figure.savefig('umap_ALLTC_top30000.svg', bbox_inches='tight')

### UMAP of the 1000 least important genes (bottom)

In [None]:
# UMAP on the rna2 columns named in the last 1000 rows of feature_importances_alltc.
# This rebinds embedding_alltc_a, replacing any embedding stored under that name.
rna_umap = UMAP(random_state=24, n_neighbors=15, min_dist=0.1)

bottom1000_genes = feature_importances_alltc["Unnamed: 0"][-1000:]
embedding_alltc_a = pd.DataFrame(
    rna_umap.fit_transform(rna2.loc[:, rna2.columns.isin(bottom1000_genes)]),
    columns=['UMAP1', 'UMAP2'],
)

# Scatter of the embedding, coloured by labels2 in the order given by `label`.
sns_plot = sns.scatterplot(
    data=embedding_alltc_a,
    x='UMAP1',
    y='UMAP2',
    hue=labels2.values,
    hue_order=label,
    alpha=.1,
    linewidth=0,
    s=1,
)
# Place the legend outside the axes, to the right.
sns_plot.legend(loc='center left', bbox_to_anchor=(1, .5))
plt.show()
# Export the figure as SVG.
sns_plot.figure.savefig('umap_ALLTC_bottom1000.svg', bbox_inches='tight')

In [None]:
# Cluster whichever embedding is currently bound to embedding_alltc_a with
# HDBSCAN and plot the cluster labels.
dbscan = HDBSCAN(min_cluster_size=100, min_samples=100)

# Fit and predict
preds = dbscan.fit_predict(embedding_alltc_a)

# Clustering metrics (disabled)
#silhouette_score(embedding_psd_a, preds)
#calinski_harabasz_score(embedding_psd_a, preds)

# Plot the results
embedding_alltc_a.plot(
    kind='scatter',
    x='UMAP1',
    y='UMAP2',
    c=preds,
    cmap='Accent_r',
    figsize=(16, 10),
)
plt.show()

### UMAP of the most important ALLTC genes

In [None]:
# Keep the rna2 columns named in the first 100 rows of feature_importances_alltc.
top100_alltc_genes = feature_importances_alltc["Unnamed: 0"][0:100]
alltc_importants = rna2.loc[:, rna2.columns.isin(top100_alltc_genes)]

# Two-dimensional UMAP embedding of that subset.
rna_umap = UMAP(random_state=24, n_neighbors=15, min_dist=.1)
embedding_alltc_i = pd.DataFrame(
    rna_umap.fit_transform(alltc_importants),
    columns=['UMAP1', 'UMAP2'],
)

# Scatter of the embedding, coloured by labels2 in the order given by `label`.
sns_plot = sns.scatterplot(
    data=embedding_alltc_i,
    x='UMAP1',
    y='UMAP2',
    hue=labels2.values,
    hue_order=label,
    alpha=.1,
    linewidth=0,
    s=1,
)
# Place the legend outside the axes, to the right.
sns_plot.legend(loc='center left', bbox_to_anchor=(1, .5))
# Export the figure as SVG, then display it.
sns_plot.figure.savefig('umap_ALLTC_top100.svg', bbox_inches='tight')
plt.show()


### UMAP non-PSD all

In [None]:
# Keep every rna2 column whose name is in non_PSD_genes (no random sub-sampling:
# the earlier `sample(non_PSD_genes, 4000)` call is disabled).
#rand_nonpsd_genes = sample(non_PSD_genes, 4000)
nonpsd_rand = rna2.loc[:, rna2.columns.isin(non_PSD_genes)]

# Two-dimensional UMAP embedding of that subset.
rna_umap = UMAP(random_state=24, n_neighbors=15, min_dist=.1)
embedding_nonpsd_r = pd.DataFrame(
    rna_umap.fit_transform(nonpsd_rand),
    columns=['UMAP1', 'UMAP2'],
)

# Scatter of the embedding, coloured by labels2 in the order given by `label`.
sns_plot = sns.scatterplot(
    data=embedding_nonpsd_r,
    x='UMAP1',
    y='UMAP2',
    hue=labels2,
    hue_order=label,
    alpha=.1,
    linewidth=0,
    s=1,
)
# Place the legend outside the axes, to the right.
sns_plot.legend(loc='center left', bbox_to_anchor=(1, .5))
plt.show()
# Export the figure as SVG.
sns_plot.figure.savefig('umap_nonPSD_all_20220718.svg', bbox_inches='tight')

### UMAP non-PSD 4074 random

In [None]:
# Draw 4074 names from non_PSD_genes at random and keep the matching rna2 columns.
# NOTE(review): random.sample needs a sequence — presumably non_PSD_genes is a list; confirm.
rand_nonpsd_genes = sample(non_PSD_genes, 4074)
nonpsd_rand = rna2.loc[:, rna2.columns.isin(rand_nonpsd_genes)]

# Two-dimensional UMAP embedding of the random subset.
rna_umap = UMAP(random_state=24, n_neighbors=15, min_dist=.1)
embedding_nonpsd_r = pd.DataFrame(
    rna_umap.fit_transform(nonpsd_rand),
    columns=['UMAP1', 'UMAP2'],
)

# Scatter of the embedding, coloured by labels2 in the order given by `label`.
sns_plot = sns.scatterplot(
    data=embedding_nonpsd_r,
    x='UMAP1',
    y='UMAP2',
    hue=labels2,
    hue_order=label,
    alpha=.1,
    linewidth=0,
    s=1,
)
# Place the legend outside the axes, to the right.
sns_plot.legend(loc='center left', bbox_to_anchor=(1, .5))
plt.show()
# Export the figure as SVG.
sns_plot.figure.savefig('umap_nonPSD_rand4074.svg', bbox_inches='tight')

### UMAP non-PSD top 100

In [None]:
# Keep the rna2 columns named in the first 100 rows of feature_importances_nonPSD.
# The variable is still called nonpsd_rand although nothing is sampled here.
top100_nonpsd_genes = feature_importances_nonPSD["Unnamed: 0"][0:100]
nonpsd_rand = rna2.loc[:, rna2.columns.isin(top100_nonpsd_genes)]

# Two-dimensional UMAP embedding of that subset.
rna_umap = UMAP(random_state=24, n_neighbors=15, min_dist=.1)
embedding_nonpsd_r = pd.DataFrame(
    rna_umap.fit_transform(nonpsd_rand),
    columns=['UMAP1', 'UMAP2'],
)

# Scatter of the embedding, coloured by labels2 in the order given by `label`.
sns_plot = sns.scatterplot(
    data=embedding_nonpsd_r,
    x='UMAP1',
    y='UMAP2',
    hue=labels2.values,
    hue_order=label,
    alpha=.1,
    linewidth=0,
    s=1,
)
# Place the legend outside the axes, to the right.
sns_plot.legend(loc='center left', bbox_to_anchor=(1, .5))
plt.show()
# Export the figure as SVG.
sns_plot.figure.savefig('umap_nonPSD_top100.svg', bbox_inches='tight')

### UMAP nonPSD random 100

In [None]:
# Draw 100 names from non_PSD_genes at random and keep the matching rna2 columns.
# NOTE(review): random.sample needs a sequence — presumably non_PSD_genes is a list; confirm.
rand_nonpsd_genes = sample(non_PSD_genes, 100)
nonpsd_rand = rna2.loc[:, rna2.columns.isin(rand_nonpsd_genes)]

# Two-dimensional UMAP embedding of the random subset.
# NOTE(review): n_neighbors is 30 here while the other UMAP cells use 15 — confirm intent.
rna_umap = UMAP(random_state=24, n_neighbors=30, min_dist=.1)
embedding_nonpsd_r = pd.DataFrame(
    rna_umap.fit_transform(nonpsd_rand),
    columns=['UMAP1', 'UMAP2'],
)

# Scatter of the embedding, coloured by labels2 in the order given by `label`.
sns_plot = sns.scatterplot(
    data=embedding_nonpsd_r,
    x='UMAP1',
    y='UMAP2',
    hue=labels2,
    hue_order=label,
    alpha=.1,
    linewidth=0,
    s=1,
)
# Place the legend outside the axes, to the right.
sns_plot.legend(loc='center left', bbox_to_anchor=(1, .5))
plt.show()
# Export the figure as SVG.
sns_plot.figure.savefig('umap_nonPSD_100.svg', bbox_inches='tight')