diff --git a/RNAseq_analysis/MTG_revision_V4_publication.py b/RNAseq_analysis/MTG_revision_V4_publication.py
new file mode 100644
index 0000000..4e254b9
--- /dev/null
+++ b/RNAseq_analysis/MTG_revision_V4_publication.py
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Mar 6 10:43:50 2023
+This script is used to analyze and plot subsets of the AIBS RNA-seq database (https://portal.brain-map.org/atlases-and-data/rnaseq).
+MTG - SMART-SEQ (2019) separate matrices (exons and introns) are used to sub-select expression values
+for specific genes of interest per transcriptomic type within MTG.
+
+Script structure:
+17 - 33 selecting metadata
+35 - 60 load and obtain data for genesets of interest, including CPM+1 log10 transformation and saving the data
+61 - 84 structuring data for further plotting
+85 - 108 plotting
+109 - end Statistics
+@authors: Stan Driessens & Anna Galakhova
+"""
+
+#load packages
+import pandas as pd
+import seaborn as sns
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+#read and import data for MTG RNA-seq data
+#import metadata
+metadata = pd.read_csv(r'path/human_MTG_2018-06-14_samples-columns.csv')
+metadata_cleared = metadata.set_index('sample_name')
+#import gene annotation (gene symbol and entrez_id per matrix row);
+#the filename is assumed to follow the AIBS naming convention of this release
+gene_metadata = pd.read_csv(r'path/human_MTG_2018-06-14_genes-rows.csv')
+#%%make data set per geneset
+gene_selection = pd.read_csv(r'path/geneset.txt',
+                             names=['gene'], header=None, quotechar="'")
+
+usecols = list(metadata_cleared.index)
+usecols.append('Unnamed: 0')
+
+#load in reads from exon and intron matrices (intron filename assumed to mirror the exon matrix naming)
+exons = pd.read_csv(r"path/human_MTG_2018-06-14_exon-matrix.csv",
+                    usecols=usecols, index_col=0)
+introns = pd.read_csv(r"path/human_MTG_2018-06-14_intron-matrix.csv",
+                      usecols=usecols, index_col=0)
+#add exon + intron reads
+data = exons + introns
+#transpose the data to make genes columns
+data_T = data.T
+#create a column with total reads per cell
+data_T['total_reads'] = data_T.sum(axis=1)
+#divide all reads by total reads
+data_T_c = data_T.div(data_T['total_reads'], axis=0)
+#multiply by 1e6 to get 'counts per million'
+data_T_cpm = data_T_c*1e6
+#add 1 to prevent inf values when log transforming
+data_T_cpm1 = data_T_cpm+1
+#log10 transform
+data_log10cpm = np.log10(data_T_cpm1)
+#check if the correct genes are in the dataframe
+selection_i = gene_metadata[gene_metadata.gene.isin(gene_selection.gene)]
+selection = list(selection_i.entrez_id)
+#subselect the data to only contain genes of interest
+output = data_log10cpm[selection]
+#%% save the pre made data
+output.to_csv(r'path/my_data.csv')
+#%%load pre made data set
+data = pd.read_csv(r'path/my_data.csv', index_col='Unnamed: 0')
+#get an average expression value for each cell
+data_cpm = data.mean(axis=1)
+data_cpm = data_cpm.rename('avg')
+#add average data to metadata, creating one long format; sort=True to align to the correct index
+final_data = pd.concat([metadata_cleared, data_cpm], axis=1, sort=True)
+#only select layer 2/3 from the data
+final_data_23 = final_data.loc[(final_data['brain_subregion'] == 'L2') | (final_data['brain_subregion'] == 'L3')]
+#select the t-types of interest
+final_data_23_types = final_data_23[(final_data_23['cluster'].str.contains('FREM3')) | (final_data_23['cluster'].str.contains('GLP2'))
+                                    | (final_data_23['cluster'].str.contains('LTK')) | (final_data_23['cluster'].str.contains('CARM1P1'))
+                                    | (final_data_23['cluster'].str.contains('COL22'))].copy()
+#subselect FREM3 into deep and superficial
+final_data_23_types['cluster_new'] = list(map(lambda x: x.split()[3], final_data_23_types.cluster))
+final_data_23_types.loc[(final_data_23_types.cluster_new.values == 'FREM3') & (final_data_23_types.brain_subregion.values == 'L2'), 'cluster_new'] = 'L2 FREM3'
+final_data_23_types.loc[(final_data_23_types.cluster_new.values == 'FREM3') & (final_data_23_types.brain_subregion.values == 'L3'), 'cluster_new'] = 'L3 FREM3'
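+#optional sanity check (not part of the original script): inspect the number of cells per
+#t-type label after the subsetting and FREM3 deep/superficial split; assumes
+#final_data_23_types from the steps above
+print(final_data_23_types.groupby('cluster_new').size())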
+
+#%%create median values for stripplot purpose (per donor)
+median_vals = final_data_23_types.groupby(['donor', 'cluster_new'])['avg'].median()
+median_df = pd.DataFrame(median_vals)
+median_df.reset_index(inplace=True)
+
+#%%PLOT
+sns.set_palette(sns.color_palette('pastel'))
+
+ax = sns.violinplot(data=final_data_23_types, x='cluster_new', y='avg', legend=None, inner='quartile',
+                    order=['LTK', 'L2 FREM3', 'L3 FREM3', 'GLP2R', 'CARM1P1', 'COL22A1'], color='lightgrey')
+ax = sns.pointplot(data=median_df, x='cluster_new', y='avg', order=['LTK', 'L2 FREM3', 'L3 FREM3', 'GLP2R', 'CARM1P1', 'COL22A1'],
+                   hue='donor', markers=['o', 's', '^', 'p'], color='k', join=False, linestyles='-',
+                   dodge=True, scale=1.3, errwidth=0)
+ax.set_ylim(0, 1.75)
+#style the point-plot connector lines: hide them all first, then re-draw every third one
+for l in ax.lines:
+    l.set_linestyle('--')
+    l.set_linewidth(0)
+    l.set_color('red')
+    l.set_alpha(0.5)
+for l in ax.lines[1::3]:
+    l.set_linestyle('--')
+    l.set_linewidth(1)
+    l.set_color('black')
+    l.set_alpha(0.5)
+plt.legend([], [], frameon=False)
+ax.set_ylim(0.6, 1.3)
+ax.set_ylabel('Mean gene expression log10 (CPM+1)')
+#%%save the plot
+plt.savefig(r'path/my_plot.eps')
+#%%STATISTICS
+#%%do statistics for every donor separately
+donors = np.unique(final_data_23_types.donor)
+H16_24_010 = final_data_23_types[final_data_23_types['donor'] == 'H16.24.010']
+H200_1023 = final_data_23_types[final_data_23_types['donor'] == 'H200.1023']
+H200_1025 = final_data_23_types[final_data_23_types['donor'] == 'H200.1025']
+H200_1030 = final_data_23_types[final_data_23_types['donor'] == 'H200.1030']
+#%%do statistics, HERE SELECT WHICH DATAFRAME YOU WANT TO USE: DONOR DATA OR COMBINED DATA
+#use the combined data by default; swap in one of the per-donor dataframes above to test a single donor
+df = final_data_23_types
+#import statistical packages
+import scikit_posthocs as sp
+import pingouin as pg
+#do Kruskal-Wallis and Dunn for single cell data
+krus = pg.kruskal(data=df, dv='avg', between='cluster_new', detailed=True)
+dunn = pg.pairwise_tests(dv='avg', between='cluster_new', data=df, padjust='holm', parametric=False)
+#save the statistics as csv
+krus.to_csv(r'path/kruskal_result.csv')
+dunn.to_csv(r'path/dunn_result.csv')
+#perform Friedman test on the per-donor medians (one value per donor and t-type)
+fried = pg.friedman(data=median_df, dv='avg', within='cluster_new', subject='donor')
+dunn = pg.pairwise_tests(dv='avg', between='cluster_new', data=df, padjust='holm', parametric=False)
+#save the statistics as csv
+fried.to_csv(r'path/friedman_result.csv')
+dunn.to_csv(r'path/dunn_fried_result.csv')
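+#optional cross-check (not part of the original analysis): the same per-donor medians can be
+#passed to scipy's Friedman implementation after pivoting to a donor x t-type table;
+#assumes median_df from the stripplot section above
+from scipy.stats import friedmanchisquare
+wide = median_df.pivot(index='donor', columns='cluster_new', values='avg').dropna()
+print(friedmanchisquare(*[wide[col] for col in wide.columns]))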
diff --git a/RNAseq_analysis/RNA_seq_data_normalization.py b/RNAseq_analysis/RNA_seq_data_normalization.py
new file mode 100644
index 0000000..8d0b6b0
--- /dev/null
+++ b/RNAseq_analysis/RNA_seq_data_normalization.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jul 7 09:43:56 2022
+
+process large matrices of raw counts into log10(CPM+1) values
+and write them to a new matrix
+
+@author: stand & annagalakhova
+"""
+
+import pandas as pd
+import numpy as np
+import dask.dataframe as dd
+from pathlib import Path
+import os
+
+filepath = Path(r'C:/Users/annagalakhova/PhD INF CNCR VU/DATA/transcriptomics/RNAseq/VISp_mouse')
+
+#read the raw count matrix in chunks of 2000 rows to keep memory use low
+df = pd.read_csv(r"/Users/annagalakhova/PhD INF CNCR VU/DATA/transcriptomics/RNAseq/VISp_mouse/mouse_VISp_2018-06-14_intron-matrix.csv", chunksize=2000)
+filename = "/Users/annagalakhova/PhD INF CNCR VU/DATA/transcriptomics/RNAseq/VISp_mouse/mouse_VISp_2018-06-14_intron-matrix_CPMlog10.csv"
+print('iterator made')
+c = 0
+header = True
+for chunk in df:
+    print(c)
+    #first column holds the sample names, the remaining columns the raw counts
+    libsize = np.sum(chunk.iloc[:, 1:].values, axis=1)
+    logcpm = np.log10(((chunk.iloc[:, 1:].T / libsize).T * 1_000_000) + 1)
+    logcpm = logcpm.astype('float32')  #reduce memory from float64
+    logcpm.insert(0, 'sample_name', chunk.iloc[:, 0])
+    #write the first chunk with a header, then append the remaining chunks without one
+    if c == 0:
+        logcpm.to_csv(filename, index=False)
+        c += 1
+    else:
+        logcpm.to_csv(filename, mode='a', header=False, index=False)
+        c += 1
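+#optional spot check (not part of the original script): read back a few rows of the
+#written matrix and confirm the log10(CPM+1) values are finite and non-negative
+check = pd.read_csv(filename, nrows=5)
+assert np.isfinite(check.iloc[:, 1:].values).all()
+assert (check.iloc[:, 1:].values >= 0).all()
+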
diff --git a/RNAseq_analysis/regions_revision_v3_script_publish.py b/RNAseq_analysis/regions_revision_v3_script_publish.py
new file mode 100644
index 0000000..6ed3d34
--- /dev/null
+++ b/RNAseq_analysis/regions_revision_v3_script_publish.py
@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Mar 1 10:16:26 2023
+
+This script is used to analyze and plot subsets of the AIBS RNA-seq database (https://portal.brain-map.org/atlases-and-data/rnaseq).
+MULTIPLE CORTICAL AREAS - SMART-SEQ (2019) full matrix (exons+introns) is used to sub-select expression values
+for specific genes of interest per region or cell class.
+Script structure:
+24 - 38 select the data based on the gene set of interest
+39 - 43 create datasets based on the gene set of interest (can be saved and used later)
+44 - 61 load in the selected data and add values to the metadata
+62 - 69 group data by donor, allowing for separate plots of the donor data
+70 - 97 plot the data
+98 - end statistical evaluation
+
+note: the normalization script for this dataset can be found in RNA_seq_data_normalization.py
+
+@author: Stan Driessens & Anna Galakhova
+"""
+#load packages
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+#read and import data from regions
+#import metadata
+metadata = pd.read_csv(r'path/metadata.csv')
+metadata = metadata.set_index('sample_name')
+#clear the metadata
+metadata_cleared = metadata[metadata['class_label'].notnull()]
+#%% create gene-set data from normalized dataframe
+geneset = pd.read_csv(r'path/geneset_IQ.txt', names=['gene'], header=None, quotechar="'")
+#in case there are duplicates present in the gene set, remove and update the gene set
+geneset = geneset.gene.drop_duplicates()
+#convert geneset to a dataframe to improve compatibility with pandas
+geneset = pd.DataFrame(geneset)
+#%% load and filter data based on gene-set (ONLY NEED TO DO THIS ONCE, THEN SAVE PRE MADE DATA AS .CSV)
+# selecting expression data of the genes present in the gene set of interest (geneset)
+#skip the matrix rows of samples that were removed from the metadata (+1 accounts for the header row)
+skiprows_set1 = metadata.reset_index().loc[~metadata.index.isin(metadata_cleared.index)].index + 1
+#load in the data, NOTE. here we used pre-normalized log10 CPM data
+#('sample_name' is kept so the expression values can be aligned to the metadata)
+data_log10CPM = pd.read_csv(r'path/matrix_log10_CPM.csv',
+                            skiprows=skiprows_set1, usecols=['sample_name'] + list(geneset.gene))
+#save the pre made data
+data_log10CPM.to_csv(r'path/your_data.csv')
+#%%load in the pre-made normalized and pre-assigned gene-set data
+data = pd.read_csv(r'path/your_data.csv')
+#create avg expression and add to metadata, align index on sample_name
+data = data.set_index('sample_name')
+#drop unnamed 0
+data = data.drop('Unnamed: 0', axis=1)
+data['avg'] = data.mean(axis=1)
+#select only cpm values
+data_cpm = data.avg
+#concatenate metadata to avg values from this geneset
+b = pd.concat([metadata_cleared, data_cpm], axis=1, sort=True)
+#create new column for region values where M1 and S1 sub-regions are merged to new M1 and S1
+b.loc[b['region_label'].str.contains('M1'), 'region'] = 'M1'
+b.loc[b['region_label'].str.contains('MTG'), 'region'] = 'MTG'
+b.loc[b['region_label'].str.contains('A1'), 'region'] = 'A1'
+b.loc[b['region_label'].str.contains('CgG'), 'region'] = 'CgG'
+b.loc[b['region_label'].str.contains('S1'), 'region'] = 'S1'
+b.loc[b['region_label'].str.contains('V1'), 'region'] = 'V1'
+#%% Sub-select data per donor (to plot median expression per donor)
+#here, select which cell class to plot (i.e. Glutamatergic, GABAergic or Non-neuronal)
+data_plot = b.loc[b['class_label'] == 'Glutamatergic']
+data_donor = b.loc[b['class_label'] == 'Glutamatergic']
+#assign median expression per donor
+median_vals = data_donor.groupby(['region', 'external_donor_name_order', 'class_label'])['avg'].median()
+median_df = pd.DataFrame(median_vals)
+median_df.reset_index(inplace=True)
+#%%plotting the data
+#plot a violin for gene expression for all cells
+ax = sns.violinplot(data=data_plot, x='class_label', y='avg', legend=None, inner=None,
+                    order=['Glutamatergic', 'GABAergic', 'Non-neuronal'], color='lightgrey', linewidth=0)
+
+
+ax = sns.pointplot(data=median_df, x='class_label', y='avg', order=['Glutamatergic', 'GABAergic', 'Non-neuronal'],
+                   hue='external_donor_name_order', markers=['o', 's', '^'], color='k', join=False, linestyles='--',
+                   dodge=True, scale=1.3, errwidth=0)
+
+ax.set_ylim((median_df.avg.mean()-0.25), median_df.avg.mean()+0.25)
+ax.set_ylabel('Mean gene expression log10 (CPM+1)')
+
+#style the point-plot connector lines: hide them all first, then re-draw every third one
+for l in ax.lines:
+    l.set_linestyle('--')
+    l.set_linewidth(0)
+    l.set_color('red')
+    l.set_alpha(0.5)
+for l in ax.lines[1::3]:
+    l.set_linestyle('--')
+    l.set_linewidth(1)
+    l.set_color('black')
+    l.set_alpha(0.5)
+plt.legend([], [], frameon=False)
+#save the plot
+ax.set_ylim((median_df.avg.mean()-0.25), median_df.avg.mean()+0.25)
+ax.set_ylabel('Mean gene expression log10 (CPM+1)')
+plt.savefig(r'path/your_plot.eps')
+#%%statistical evaluation
+#Friedman test on the separate donors
+#split the group in separate donors
+#do Friedman test for geneset - class - regions
+# get the data per patient
+data_donor_1 = b.loc[b['external_donor_name_order'] == 1]
+data_donor_2 = b.loc[b['external_donor_name_order'] == 2]
+data_donor_3 = b.loc[b['external_donor_name_order'] == 3]
+#get_class
+#hard code which class to use for statistics: Glutamatergic, GABAergic or Non-neuronal
+data_donor_1_df = data_donor_1.loc[data_donor_1['class_label'] == 'Glutamatergic']
+data_donor_2_df = data_donor_2.loc[data_donor_2['class_label'] == 'Glutamatergic']
+data_donor_3_df = data_donor_3.loc[data_donor_3['class_label'] == 'Glutamatergic']
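+#optional check (not part of the original script): report cells per region for each donor
+#and class before running the statistics; assumes the per-donor dataframes defined above
+for name, donor_df in [('donor 1', data_donor_1_df), ('donor 2', data_donor_2_df), ('donor 3', data_donor_3_df)]:
+    print(name, donor_df.groupby('region')['avg'].size().to_dict())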
+
+#%%do statistics, HERE SELECT WHICH DATAFRAME YOU WANT TO USE: DONOR DATA OR COMBINED DATA
+df = data_donor_1_df
+#import statistical packages
+import scikit_posthocs as sp
+import pingouin as pg
+#perform Kruskal-Wallis and Dunn tests
+krus = pg.kruskal(data=df, dv='avg', between='region', detailed=True)
+dunn = pg.pairwise_tests(dv='avg', between='region', data=df, padjust='holm', parametric=False)
+#save the statistics as csv
+krus.to_csv(r'path/kruskal_result.csv')
+dunn.to_csv(r'path/dunn_result.csv')
+#perform Friedman test across regions, with donors as repeated measures (per-donor medians)
+fried = pg.friedman(data=median_df, dv='avg', within='region', subject='external_donor_name_order')
+dunn = pg.pairwise_tests(dv='avg', between='region', data=df, padjust='holm', parametric=False)
+#save the statistics as csv
+fried.to_csv(r'path/friedman_result.csv')
+dunn.to_csv(r'path/dunn_fried_result.csv')
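+#optional cross-check (not part of the original analysis): the scikit_posthocs package
+#imported above as sp can run the same Dunn post-hoc test directly on the selected dataframe
+dunn_sp = sp.posthoc_dunn(df, val_col='avg', group_col='region', p_adjust='holm')
+print(dunn_sp)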