-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
089335b
commit 2dfd8d3
Showing
3 changed files
with
296 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 6 10:43:50 2023

This script is used to analyze and plot subsets of the AIBS RNA-seq database
(https://portal.brain-map.org/atlases-and-data/rnaseq).
MTG - SMART-SEQ (2019) separate matrices (exons and introns) are used to
sub-select expression values for specific genes of interest per
transcriptomic type within MTG.

Script structure (cells, in order):
    1. select metadata
    2. load exon+intron reads for the gene set of interest, CPM+1 log10
       transformation, save the pre-made data
    3. structure data for further plotting
    4. plotting
    5. statistics

@authors: Stan Driessens & Anna Galakhova
"""
#load packages
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

#read and import data for MTG RNA-seq data
#import metadata
metadata = pd.read_csv(r'path/human_MTG_2018-06-14_samples-columns.csv')
metadata_cleared = metadata.set_index('sample_name')
#%%make data set per geneset
gene_selection = pd.read_csv(r'path/geneset.txt',
                             names=['gene'], header=None, quotechar="'")

#columns to read from the expression matrices: every sample of interest plus
#the unnamed first column holding the entrez gene ids.
#FIX: the original referenced an undefined name (samples_metadata_cleared);
#sample_name is the index of metadata_cleared, not a column.
usecols = list(metadata_cleared.index)
usecols.append('Unnamed: 0')

#load in reads from exon and intron matrices
exons = pd.read_csv(r"path/human_MTG_2018-06-14_exon-matrix.csv",
                    usecols=usecols, index_col=0)
#FIX: intron reads must come from the intron matrix — the original loaded the
#exon matrix twice, silently doubling exon counts instead of adding introns
introns = pd.read_csv(r"path/human_MTG_2018-06-14_intron-matrix.csv",
                      usecols=usecols, index_col=0)
#add exon + intron reads
data = exons + introns
#transpose the data to make genes columns
data_T = data.T
#total reads per cell, kept as a Series so it never pollutes the gene matrix
total_reads = data_T.sum(axis=1)
#counts per million: divide each cell's reads by its total and scale by 1e6
data_T_cpm = data_T.div(total_reads, axis=0) * 1e6
#add 1 to prevent -inf values when log transforming, then take the 10-log
data_log10cpm = np.log10(data_T_cpm + 1)
#map gene symbols to entrez ids to locate the matrix columns of interest
#NOTE(review): gene_metadata was undefined in the original; the AIBS MTG
#download ships a genes-rows file with 'gene' and 'entrez_id' columns —
#confirm the exact file name/path
gene_metadata = pd.read_csv(r'path/human_MTG_2018-06-14_genes-rows.csv')
#check if the correct genes are in the dataframe
selection_i = gene_metadata[gene_metadata.gene.isin(gene_selection.gene)]
selection = list(selection_i.entrez_id)
#subselect the data to only contain genes of interest
output = data_log10cpm[selection]
#%% save the pre made data
output.to_csv(r'path/my_data.csv')
#%%load pre made data set
data = pd.read_csv(r'path/my_data.csv', index_col='Unnamed: 0')
#get an average expression value for each cell
data_cpm = data.mean(axis=1)
data_cpm = data_cpm.rename('avg')
#add average data to metadata creating one big long format; sort=True aligns on the index
final_data = pd.concat([metadata_cleared, data_cpm], axis=1, sort=True)
#only select layer 2/3 from the data
final_data_23 = final_data.loc[(final_data['brain_subregion'] == 'L2') | (final_data['brain_subregion'] == 'L3')]
#select the t-types of interest
#.copy() so the cluster_new assignments below do not hit a SettingWithCopyWarning
final_data_23_types = final_data_23[(final_data_23['cluster'].str.contains('FREM3')) | (final_data_23['cluster'].str.contains('GLP2'))
                                    | (final_data_23['cluster'].str.contains('LTK')) | (final_data_23['cluster'].str.contains('CARM1P1'))
                                    | (final_data_23['cluster'].str.contains('COL22'))].copy()
#subselect FREM3 into deep and superficial
final_data_23_types['cluster_new'] = list(map(lambda x: x.split()[3], final_data_23_types.cluster))
final_data_23_types.loc[(final_data_23_types.cluster_new.values == 'FREM3') & (final_data_23_types.brain_subregion.values == 'L2'), 'cluster_new'] = 'L2 FREM3'
final_data_23_types.loc[(final_data_23_types.cluster_new.values == 'FREM3') & (final_data_23_types.brain_subregion.values == 'L3'), 'cluster_new'] = 'L3 FREM3'

#%%create median values for stripplot purpose (per donor)
median_vals = final_data_23_types.groupby(['donor', 'cluster_new'])['avg'].median()
median_df = pd.DataFrame(median_vals)
median_df.reset_index(inplace=True)

#%%PLOT
sns.set_palette(sns.color_palette('pastel'))

ax = sns.violinplot(data=final_data_23_types, x='cluster_new', y='avg', legend=None, inner='quartile',
                    order=['LTK', 'L2 FREM3', 'L3 FREM3', 'GLP2R', 'CARM1P1', 'COL22A1'], color='lightgrey')
ax = sns.pointplot(data=median_df, x='cluster_new', y='avg', order=['LTK', 'L2 FREM3', 'L3 FREM3', 'GLP2R', 'CARM1P1', 'COL22A1'],
                   hue='donor', markers=['o', 's', '^', 'p'], color='k', join=False, linestyles='-',
                   dodge=True, scale=1.3, errwidth=0)
#restyle the pointplot connector lines: first hide them all ...
for l in ax.lines:
    l.set_linestyle('--')
    l.set_linewidth(0)
    l.set_color('red')
    l.set_alpha(0.5)
#... then re-enable every third line as a thin dashed black guide
for l in ax.lines[1::3]:
    l.set_linestyle('--')
    l.set_linewidth(1)
    l.set_color('black')
    l.set_alpha(0.5)
plt.legend([], [], frameon=False)
ax.set_ylim(0.6, 1.3)
ax.set_ylabel('Mean gene expression log10 (CPM+1)')
#%%save the plot
plt.savefig(r'path/my_plot.eps')
#%%STATISTICS
#%%do statistics for every donor separately
donors = np.unique(final_data_23_types.donor)
#FIX: the first mask was built on final_data_23 instead of final_data_23_types,
#which misaligns the boolean index against the filtered frame
H16_24_010 = final_data_23_types[final_data_23_types['donor'] == 'H16.24.010']
H200_1023 = final_data_23_types[final_data_23_types['donor'] == 'H200.1023']
H200_1025 = final_data_23_types[final_data_23_types['donor'] == 'H200.1025']
H200_1030 = final_data_23_types[final_data_23_types['donor'] == 'H200.1030']
#%%do statistics, HERE SELECT WHICH DATAFRAME YOU WANT TO USE: DONOR DATA OR COMBINED DATA
#FIX: df was undefined in the original; default to the combined data
df = final_data_23_types
#import statistical packages
import scikit_posthocs as sp
import pingouin as pg
#do Kruskal-Wallis and Dunn for single cell data
krus = pg.kruskal(data=df, dv='avg', between='cluster_new', detailed=True)
dunn = pg.pairwise_tests(dv='avg', between='cluster_new', data=df, padjust='holm', parametric=False)
#save the statistics as csv
krus.to_csv(r'path/kruskal_result.csv')
dunn.to_csv(r'path/dunn_result.csv')
#perform Friedman test for data per donor
#NOTE(review): pingouin.friedman takes within= and subject=, not between=, and
#the original referenced a 'region' column that does not exist in this frame;
#the per-donor medians give one value per (donor, t-type) as Friedman requires
fried = pg.friedman(data=median_df, dv='avg', within='cluster_new', subject='donor')
dunn_fried = pg.pairwise_tests(dv='avg', between='cluster_new', data=median_df, padjust='holm', parametric=False)
#save the statistics as csv
fried.to_csv(r'path/friedman_result.csv')
dunn_fried.to_csv(r'path/dunn_fried_result.csv')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 7 09:43:56 2022

Process large matrices of raw counts into log10(CPM+1) values and write them
to a new matrix, one chunk at a time so the full matrix never has to fit in
memory.

@author: stand & annagalakhova
"""
#NOTE(review): the original also imported dask.dataframe and os, but neither
#was used anywhere in this file, so they were dropped
import pandas as pd
import numpy as np
from pathlib import Path


def counts_to_log10_cpm(chunk):
    """Convert one chunk of a raw count matrix to log10(CPM+1).

    chunk: DataFrame whose first column is an identifier (re-emitted under
    the name 'sample_name') and whose remaining columns are raw counts;
    each row is normalized by its own total (library size).
    Returns a float32 DataFrame with the identifier column restored.
    """
    #library size = total counts per row, excluding the identifier column
    libsize = np.sum(chunk.iloc[:, 1:].values, axis=1)
    #counts per million, +1 so the log never sees zero
    logcpm = np.log10(((chunk.iloc[:, 1:].T / libsize).T * 1_000_000) + 1)
    logcpm = logcpm.astype('float32')  # reduce memory from float64
    logcpm.insert(0, 'sample_name', chunk.iloc[:, 0])
    return logcpm


def main():
    """Stream the intron matrix through the CPM transform into a new csv."""
    #FIX: the original path was missing the drive colon ('C/Users/...');
    #NOTE(review): this variable is currently unused below — the absolute
    #paths are kept byte-for-byte from the original
    filepath = Path(r'C:/Users/annagalakhova/PhD INF CNCR VU/DATA/transcriptomics/RNAseq/VISp_mouse')

    reader = pd.read_csv(r"/Users/annagalakhova/PhD INF CNCR VU/DATA/transcriptomics/RNAseq/VISp_mouse/mouse_VISp_2018-06-14_intron-matrix.csv", chunksize=2000)
    filename = "/Users/annagalakhova/PhD INF CNCR VU/DATA/transcriptomics/RNAseq/VISp_mouse/mouse_VISp_2018-06-14_intron-matrix_CPMlog10.csv"
    print('iterator made')
    for c, chunk in enumerate(reader):
        print(c)
        logcpm = counts_to_log10_cpm(chunk)
        #first chunk creates the file with a header; later chunks append headerless
        logcpm.to_csv(filename, mode='w' if c == 0 else 'a', header=(c == 0), index=False)


if __name__ == '__main__':
    main()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 1 10:16:26 2023

This script is used to analyze and plot subsets of the AIBS RNA-seq database
(https://portal.brain-map.org/atlases-and-data/rnaseq).
MULTIPLE CORTICAL AREAS - SMART-SEQ (2019) full matrix (exons+introns) is
used to sub-select expression values for specific genes of interest per
region or cell class.

Script structure (cells, in order):
    1. select the data based on the gene set of interest
    2. create datasets based on the gene set (can be saved and reused)
    3. load selected data and add values to the metadata
    4. group data by donor, allowing separate per-donor plots
    5. plot the data
    6. statistical evaluation

note. the normalization of this dataset is done in FINAL_RNA_seq_data_nromalization.py

@author: Stan Driessens & Anna Galakhova
"""
#load packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#read and import data from regions
#import metadata
metadata = pd.read_csv(r'path/metadata.csv')
#clear the metadata: keep only cells that were assigned a class.
#This happens BEFORE set_index so the positional row numbers are still
#available for the skiprows computation below.
metadata_cleared = metadata[metadata['class_label'].notnull()]
#rows of the expression matrix to skip when loading: the cells dropped above.
#+1 converts the 0-based positional index to 1-based file rows (row 0 = header).
#FIX: the original compared metadata_cleared against itself, which always
#produced an empty skip list, and accessed sample_name after it became the index
skiprows_set1 = metadata.index[metadata['class_label'].isnull()] + 1
metadata = metadata.set_index('sample_name')
metadata_cleared = metadata_cleared.set_index('sample_name')
#%% create gene-set data from normalized dataframe
geneset = pd.read_csv(r'path/geneset_IQ.txt', names=['gene'], header=None, quotechar="'")
#in case there are duplicates present in the gene set, remove and update the gene set
geneset = geneset.gene.drop_duplicates()
#convert geneset to a dataframe to improve compatibility with pandas
geneset = pd.DataFrame(geneset)
#%% load and filter data based on gene-set (ONLY NEED TO DO THIS ONCE, THEN SAVE PRE MADE DATA AS .CSV)
#load in the data; NOTE: here we use pre-normalized log10 CPM data.
#FIX: 'sample_name' is added to usecols so the cells can later be aligned to
#the metadata — without it the set_index('sample_name') below raises.
#NOTE(review): confirm the matrix csv actually carries a sample_name column
data_log10CPM = pd.read_csv(r'path/matrix_log10_CPM.csv',
                            skiprows=skiprows_set1, usecols=['sample_name'] + list(geneset.gene))
#save the pre made data
data_log10CPM.to_csv(r'path/your_data.csv')
#%%load in the pre-made normalized and pre-assigned gene-set data
data = pd.read_csv(r'path/your_data.csv')
#create avg expression and add to metadata, align index on sample_name
data = data.set_index('sample_name')
#drop the integer index column that to_csv stored as 'Unnamed: 0'
data = data.drop('Unnamed: 0', axis=1)
data['avg'] = data.mean(axis=1)
#select only the average cpm values
data_cpm = data.avg
#concatenate metadata to avg values from this geneset (sort=True aligns on the index)
b = pd.concat([metadata_cleared, data_cpm], axis=1, sort=True)
#create new column for region values where M1 and S1 sub-regions are merged to new M1 and S1
b.loc[b['region_label'].str.contains('M1'), 'region'] = 'M1'
b.loc[b['region_label'].str.contains('MTG'), 'region'] = 'MTG'
b.loc[b['region_label'].str.contains('A1'), 'region'] = 'A1'
b.loc[b['region_label'].str.contains('CgG'), 'region'] = 'CgG'
b.loc[b['region_label'].str.contains('S1'), 'region'] = 'S1'
b.loc[b['region_label'].str.contains('V1'), 'region'] = 'V1'
#%% Sub-select data per donor (to plot median expression per donor)
#here, select which cell class to plot (i.e. Glutamatergic, GABAergic or Non-neuronal)
#FIX: the original filtered data_plot on the misspelled 'Glutamaterigc',
#which silently produced an empty dataframe
data_plot = b.loc[b['class_label'] == 'Glutamatergic']
data_donor = b.loc[b['class_label'] == 'Glutamatergic']
#assign median expression per donor
median_vals = data_donor.groupby(['region', 'external_donor_name_order', 'class_label'])['avg'].median()
median_df = pd.DataFrame(median_vals)
median_df.reset_index(inplace=True)
#%%plotting the data
#plot a violin for gene expression for all cells
ax = sns.violinplot(data=data_plot, x='class_label', y='avg', legend=None, inner=None,
                    order=['Glutamatergic', 'GABAergic', 'Non-neuronal'], color='lightgrey', linewidth=0)

#overlay the per-donor medians as points
ax = sns.pointplot(data=median_df, x='class_label', y='avg', order=['Glutamatergic', 'GABAergic', 'Non-neuronal'],
                   hue='external_donor_name_order', markers=['o', 's', '^'], color='k', join=False, linestyles='--',
                   dodge=True, scale=1.3, errwidth=0)

#center the y-axis window on the mean of the donor medians
ax.set_ylim((median_df.avg.mean() - 0.25), median_df.avg.mean() + 0.25)
ax.set_ylabel('Mean gene expression log10 (CPM+1)')

#restyle the pointplot connector lines: first hide them all ...
for l in ax.lines:
    l.set_linestyle('--')
    l.set_linewidth(0)
    l.set_color('red')
    l.set_alpha(0.5)
#... then re-enable every third line as a thin dashed black guide
for l in ax.lines[1::3]:
    l.set_linestyle('--')
    l.set_linewidth(1)
    l.set_color('black')
    l.set_alpha(0.5)
plt.legend([], [], frameon=False)
#save the plot
ax.set_ylim((median_df.avg.mean() - 0.25), median_df.avg.mean() + 0.25)
ax.set_ylabel('Mean gene expression log10 (CPM+1)')
plt.savefig(r'path/your_plot.eps')
#%%statistical evaluation
#Friedman test on the separate donors
#split the group into separate donors
# get the data per patient
data_donor_1 = b.loc[b['external_donor_name_order'] == 1]
data_donor_2 = b.loc[b['external_donor_name_order'] == 2]
data_donor_3 = b.loc[b['external_donor_name_order'] == 3]
#get_class
#hard code which class to use for statistics: Glutamatergic, GABAergic or Non-neuronal
data_donor_1_df = data_donor_1.loc[data_donor_1['class_label'] == 'Glutamatergic']
data_donor_2_df = data_donor_2.loc[data_donor_2['class_label'] == 'Glutamatergic']
data_donor_3_df = data_donor_3.loc[data_donor_3['class_label'] == 'Glutamatergic']

#%%do statistics, HERE SELECT WHICH DATAFRAME YOU WANT TO USE: DONOR DATA OR COMBINED DATA
df = data_donor_1_df
#import statistical packages
import scikit_posthocs as sp
import pingouin as pg
#perform Kruskal-Wallis and Dunn tests
krus = pg.kruskal(data=df, dv='avg', between='region', detailed=True)
dunn = pg.pairwise_tests(dv='avg', between='region', data=df, padjust='holm', parametric=False)
#save the statistics as csv
krus.to_csv(r'path/kruskal_result.csv')
dunn.to_csv(r'path/dunn_result.csv')
#perform Friedman test
#NOTE(review): pingouin.friedman takes within= and subject=, not between=,
#and needs one value per (subject, condition) — so it runs on the per-donor
#medians, not on a single-donor frame
fried = pg.friedman(data=median_df, dv='avg', within='region', subject='external_donor_name_order')
dunn_fried = pg.pairwise_tests(dv='avg', between='region', data=median_df, padjust='holm', parametric=False)
#save the statistics as csv
fried.to_csv(r'path/friedman_result.csv')
dunn_fried.to_csv(r'path/dunn_fried_result.csv')
|