Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
AnnaGalakhova committed Jun 5, 2023
1 parent 089335b commit 2dfd8d3
Show file tree
Hide file tree
Showing 3 changed files with 296 additions and 0 deletions.
131 changes: 131 additions & 0 deletions RNAseq_analysis/MTG_revision_V4_publication.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 6 10:43:50 2023
This script is used to analysze and plot subsets of the AIBS RNA-seq database (https://portal.brain-map.org/atlases-and-data/rnaseq).
MTG - SMART-SEQ (2019) seperate matrices (exons and introns) are used to sub-select expression values
for specific genes of interest per transcriptomic type within MTG.
Script structure:
17 - 33 selecting metadata
35 - 60 load and obtain data for genesets of interest, including CPM+1 log10 transformation and saving the data
61 - 84 sturcturing data for further plotting
85 - 108 plotting
109 - end Statistics
@authors: Stan Driessens & Anna Galakhova
"""

#load packages
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt


#read and import data for MTG RNA-seq data
#import metadata
metadata = pd.read_csv(r'path/human_MTG_2018-06-14_samples-columns.csv')
metadata_cleared = metadata.set_index('sample_name')
#%%make data set per geneset
#genes of interest, one symbol per line (quoted with single quotes)
gene_selection = pd.read_csv(r'path/geneset.txt',
                             names=['gene'], header=None, quotechar="'")

# Columns to read from the big expression matrices: one column per sample,
# plus the unnamed first column holding the gene (entrez) ids.
# BUG FIX: the original referenced the undefined name `samples_metadata_cleared`
# and a `sample_name` column — after set_index above, sample_name is the index.
usecols = list(metadata_cleared.index)
usecols.append('Unnamed: 0')

#load in reads from exon and intron
exons = pd.read_csv(r"path/human_MTG_2018-06-14_exon-matrix.csv",
                    usecols=usecols, index_col = 0)
# BUG FIX: the original read the exon matrix a second time here; intron reads
# must come from the intron matrix file
introns = pd.read_csv(r"path/human_MTG_2018-06-14_intron-matrix.csv",
                      usecols=usecols, index_col = 0)
#add exon + intron reads
data = exons + introns
#transpose the data to make genes columns (rows become cells)
data_T = data.T
#create a column with total reads per cell (library size)
data_T['total_reads'] = data_T.sum(axis=1)
#divide all reads by total reads; the 'total_reads' column itself becomes 1
#and is discarded below when sub-selecting the genes of interest
data_T_c = data_T.div(data_T['total_reads'], axis=0)
#multiply by 1e6 to get 'counts per million'
data_T_cpm = data_T_c*1e6
#add 1 to prevent -inf values when log transforming
data_T_cpm1 = data_T_cpm+1
#transform to log10
data_log10cpm = np.log10(data_T_cpm1)
# BUG FIX: `gene_metadata` was never loaded in the original script; the
# genes-rows file of the same release maps gene symbols to the entrez ids
# used as matrix columns.
# NOTE(review): confirm the exact file name against the data download.
gene_metadata = pd.read_csv(r'path/human_MTG_2018-06-14_genes-rows.csv')
#check if the correct genes are in the dataframe
selection_i = gene_metadata[gene_metadata.gene.isin(gene_selection.gene)]
selection = list(selection_i.entrez_id)
#subselect the data to only contain genes of interest
output = data_log10cpm[selection]
#%% save the pre made data
output.to_csv(r'path/my_data.csv')
#%%load pre made data set
data = pd.read_csv(r'path/my_data.csv', index_col = 'Unnamed: 0')
#get an average expression value for each cell (mean over the gene set)
data_cpm = data.mean(axis=1)
data_cpm = data_cpm.rename('avg')
#add average data to metadata creating one big long format; sort=True aligns on index
final_data = pd.concat([metadata_cleared, data_cpm], axis = 1, sort=True)
#only select layer 2/3 from the data
final_data_23 = final_data.loc[(final_data['brain_subregion'] == 'L2') | (final_data['brain_subregion'] == 'L3')]
#select the t-types of interest
# BUG FIX: .copy() so that the cluster_new assignments below write to a real
# frame instead of a view of final_data_23 (avoids SettingWithCopyWarning and
# potentially silently lost writes)
final_data_23_types = final_data_23[(final_data_23['cluster'].str.contains('FREM3')) | (final_data_23['cluster'].str.contains('GLP2'))
                                    | (final_data_23['cluster'].str.contains('LTK')) | (final_data_23['cluster'].str.contains('CARM1P1'))
                                    | (final_data_23['cluster'].str.contains('COL22'))].copy()
#subselect FREM3 into deep and superficial by layer
#the marker gene is the 4th whitespace token of the cluster label
# NOTE(review): assumes every selected cluster label has >= 4 tokens — confirm
final_data_23_types['cluster_new'] = final_data_23_types.cluster.str.split().str[3]
final_data_23_types.loc[(final_data_23_types.cluster_new.values == 'FREM3') & (final_data_23_types.brain_subregion.values == 'L2'), 'cluster_new'] = 'L2 FREM3'
final_data_23_types.loc[(final_data_23_types.cluster_new.values == 'FREM3') & (final_data_23_types.brain_subregion.values == 'L3'), 'cluster_new'] = 'L3 FREM3'


#%%create median values for stripplot purpose (per donor)
median_vals = final_data_23_types.groupby(['donor', 'cluster_new'])['avg'].median()
median_df = pd.DataFrame(median_vals)
median_df.reset_index(inplace=True)

#%%PLOT
# Violin plot of per-cell mean gene-set expression per t-type, overlaid with
# one median marker per donor.
sns.set_palette(sns.color_palette('pastel'))

# grey violins: distribution over single cells, with inner quartile lines
ax = sns.violinplot(data=final_data_23_types, x='cluster_new', y='avg', legend=None, inner = 'quartile',
                    order=['LTK', 'L2 FREM3', 'L3 FREM3', 'GLP2R', 'CARM1P1', 'COL22A1'], color='lightgrey')
# black markers: per-donor medians; marker shape encodes the donor,
# join=False and errwidth=0 so only the median points are drawn
ax = sns.pointplot(data = median_df, x='cluster_new', y='avg', order=['LTK', 'L2 FREM3', 'L3 FREM3' , 'GLP2R', 'CARM1P1', 'COL22A1'],
                   hue = 'donor', markers=['o','s','^', 'p'], color='k', join=False, linestyles='-',
                   dodge=True,scale=1.3, errwidth=0)
# NOTE(review): this limit is overridden by set_ylim(0.6, 1.3) below
ax.set_ylim(0, 1.75)
# restyle the line artists on the axes: first hide all of them...
for l in ax.lines:
    l.set_linestyle('--')
    l.set_linewidth(0)
    l.set_color('red')
    l.set_alpha(0.5)
# ...then re-show every third one in black
# NOTE(review): presumably these are the violin quartile median lines —
# confirm visually which artists [1::3] selects
for l in ax.lines[1::3]:
    l.set_linestyle('--')
    l.set_linewidth(1)
    l.set_color('black')
    l.set_alpha(0.5)
# drop the donor legend
plt.legend([],[], frameon=False)
ax.set_ylim(0.6, 1.3)
ax.set_ylabel('Mean gene expression log10 (CPM+1)')
#%%save the plot
plt.savefig(r'path/my_plot.eps')
#%%STATISTICS
#%%do statistics for every donor separately
donors = np.unique(final_data_23_types.donor)
# BUG FIX: the first subset used a boolean mask built from the larger
# final_data_23 frame (misaligned mask); use final_data_23_types like the others
H16_24_010 = final_data_23_types[final_data_23_types['donor'] == 'H16.24.010']
H200_1023 = final_data_23_types[final_data_23_types['donor'] == 'H200.1023']
H200_1025 = final_data_23_types[final_data_23_types['donor'] == 'H200.1025']
H200_1030 = final_data_23_types[final_data_23_types['donor'] == 'H200.1030']
#%%do statistics, HERE SELECT WHICH DATAFRAME YOU WANT TO USE DONOR DATA OR COMBINED DATA
# BUG FIX: `df` was never defined in the original; default to the combined
# data — replace with one of the per-donor frames above to test a single donor
df = final_data_23_types
#import statistical packages
import scikit_posthocs as sp
import pingouin as pg
#do Kruskal-Wallis and Dunn for single cell data, comparing t-types
krus = pg.kruskal(data=df, dv='avg', between='cluster_new', detailed=True)
dunn = pg.pairwise_tests(dv='avg', between='cluster_new', data=df, padjust='holm', parametric=False)
#save the statistics as csv
krus.to_csv(r'path/kruskal_result.csv')
dunn.to_csv(r'path/dunn_result.csv')
#perform Friedman test for data per donor
# BUG FIX: pingouin.friedman takes `within` (the repeated factor) and
# `subject`, not `between`, and this frame has no 'region' column — the
# grouping here is the t-type in 'cluster_new', repeated within each donor.
fried = pg.friedman(data=df, dv='avg', within='cluster_new', subject='donor')
dunn = pg.pairwise_tests(dv='avg', between='cluster_new', data=df, padjust='holm', parametric=False)
#save the statistics as csv
fried.to_csv(r'path/friedman_result.csv')
dunn.to_csv(r'path/dunn_fried_result.csv')
36 changes: 36 additions & 0 deletions RNAseq_analysis/RNA_seq_data_normalization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 7 09:43:56 2022
proces large matrices of raw counts into log10CPM values
and write to a new matrix
@author: stand & annagalakhova
"""

import pandas as pd
import numpy as np
import dask.dataframe as dd
from pathlib import Path
import os

filepath = Path(r'C/Users/annagalakhova/PhD INF CNCR VU/DATA/transcriptomics/RNAseq/VISp_mouse')

df = pd.read_csv(r"/Users/annagalakhova/PhD INF CNCR VU/DATA/transcriptomics/RNAseq/VISp_mouse/mouse_VISp_2018-06-14_intron-matrix.csv", chunksize=2000)
filename = "/Users/annagalakhova/PhD INF CNCR VU/DATA/transcriptomics/RNAseq/VISp_mouse/mouse_VISp_2018-06-14_intron-matrix_CPMlog10.csv"
print('iterator made')
c=0
header = True
for chunk in df:
print(c)
libsize = np.sum(chunk.iloc[:,1:].values, axis=1)
logcpm = np.log10(((chunk.iloc[:,1:].T / libsize).T *1_000_000) + 1)
logcpm = logcpm.astype('float32') #reduce memory from float64
logcpm.insert(0, 'sample_name', chunk.iloc[:,0])
if c == 0:
logcpm.to_csv(filename, index = False)
c+=1
else:
logcpm.to_csv(filename, mode='a', header=False, index = False)
c+=1

129 changes: 129 additions & 0 deletions RNAseq_analysis/regions_revision_v3_script_publish.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 1 10:16:26 2023
This script is used to analyze and plot subsets of the AIBS RNA-seq database (https://portal.brain-map.org/atlases-and-data/rnaseq).
MULTIPLE CORTICAL AREAS - SMART-SEQ (2019) full matrix (exons+introns) is used to sub-select expression values
for specific genes of interest per region or cell classes.
Script structure:
24 - 38 selecting the data based on gene set of interest.
39 - 43 create datasets based on gene-set of interest (can be save and used later)
44 - 61 Load in selected data and add values to metadata
62 - 69 Group data by donor, allowing for plots of donordata seperate
70 - 97 plot the data
98 - end statistical evalutation
note. script on normaliztion of this dataset can be found in FINAL_RNA_seq_data_nromalization.py
@author: Stan Driessens & Anna Galakhova
"""
#load packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#read and import data from regions
#import metadata, indexed by the unique sample name
metadata = pd.read_csv(r'path/metadata.csv').set_index('sample_name')
#clear the metadata: keep only cells that received a class label
metadata_cleared = metadata.loc[metadata['class_label'].notnull()]
#%% create gene-set data from normalized dataframe
#read the gene set, drop any duplicate symbols, and keep it as a one-column
#DataFrame for compatibility with the pandas calls below
geneset = pd.DataFrame(
    pd.read_csv(r'path/geneset_IQ.txt', names=['gene'], header=None, quotechar="'")
    .gene.drop_duplicates()
)
#%% load and filter data based on gene-set (ONLY NEED TO DO THIS ONCE, THEN SAVE PRE MADE DATA AS .CSV)
# selecting expression data of the genes present in the gene set of interest (geneset)
# BUG FIX: the original compared metadata_cleared against itself
# (`~metadata_cleared.sample_name.isin(metadata_cleared.sample_name)`), which is
# always empty — so nothing was ever skipped — and `sample_name` is the index,
# not a column, after set_index. Skip the matrix rows of samples that were
# dropped for having no class label.
# NOTE(review): assumes matrix rows follow the metadata row order with one
# header row — confirm against the data release.
dropped = ~metadata.index.isin(metadata_cleared.index)
skiprows_set1 = [i + 1 for i, gone in enumerate(dropped) if gone]
#load in the data, NOTE. here we used pre-normalized log10 CPM data.
data_log10CPM = pd.read_csv(r'path/matrix_log10_CPM.csv',
                            skiprows=skiprows_set1, usecols=geneset.gene)
#save the pre made data
data_log10CPM.to_csv(r'path/your_data.csv')
#%%load in the pre-made normalized and pre-assigned gene-set data
data = pd.read_csv(r'path/your_data.csv')
#create avg expression and add to metadata, align index on sample_name
data = data.set_index('sample_name')
#drop unnamed 0 (row counter written by to_csv when the data was saved)
data = data.drop('Unnamed: 0', axis=1)
#mean expression over the gene set, per cell
data['avg'] = data.mean(axis=1)
#select only cpm values
data_cpm = data.avg
#concatenate metadata to avg values from this geneset; sort=True aligns indexes
b = pd.concat([metadata_cleared, data_cpm], axis = 1 , sort = True)
#create new column for region values where M1 and S1 sub-regions are merged to new M1 and S1
for tag in ('M1', 'MTG', 'A1', 'CgG', 'S1', 'V1'):
    b.loc[b['region_label'].str.contains(tag), 'region'] = tag
#%% Sub-select data per donor (to plot median expression per donor)
#here, select which cell class to plot (i.e. Glutamatergic, GABAergic or Non-neuronal)
# BUG FIX: the original filtered data_plot on the misspelled label
# 'Glutamaterigc', which selects zero rows
data_plot = b.loc[b['class_label'] == 'Glutamatergic']
data_donor = b.loc[b['class_label'] == 'Glutamatergic']
#assign median expression per donor
median_vals = data_donor.groupby(['region', 'external_donor_name_order', 'class_label'])['avg'].median()
median_df = pd.DataFrame(median_vals)
median_df.reset_index(inplace=True)
#%%plotting the data
#plot a violin for gene expression for all cells
# grey violins: per-cell mean expression per class, no inner markings
ax = sns.violinplot(data=data_plot, x='class_label', y='avg', legend=None, inner = None,
                    order=['Glutamatergic', 'GABAergic', 'Non-neuronal'], color='lightgrey', linewidth = 0 )


# black markers: per-donor medians; marker shape encodes the donor,
# join=False and errwidth=0 so only the median points are drawn
ax = sns.pointplot(data = median_df, x='class_label', y='avg', order=['Glutamatergic', 'GABAergic', 'Non-neuronal'],
                   hue = 'external_donor_name_order', markers=['o','s','^'], color='k', join=False, linestyles='--',
                   dodge=True,scale=1.3, errwidth=0)

# centre the y-axis on the mean of the per-donor medians, +/- 0.25 log10 units
ax.set_ylim((median_df.avg.mean()-0.25), median_df.avg.mean()+0.25)
ax.set_ylabel('Mean gene expression log10 (CPM+1)')

# restyle the line artists on the axes: first hide all of them...
for l in ax.lines:
    l.set_linestyle('--')
    l.set_linewidth(0)
    l.set_color('red')
    l.set_alpha(0.5)
# ...then re-show every third one in black
# NOTE(review): with inner=None the surviving lines are presumably pointplot
# artists — confirm visually which artists [1::3] selects
for l in ax.lines[1::3]:
    l.set_linestyle('--')
    l.set_linewidth(1)
    l.set_color('black')
    l.set_alpha(0.5)
# drop the donor legend
plt.legend([],[], frameon=False)
#save the plot
ax.set_ylim((median_df.avg.mean()-0.25), median_df.avg.mean()+0.25)
ax.set_ylabel('Mean gene expression log10 (CPM+1)')
plt.savefig(r'path/your_plot.eps')
#%%statistical evaluation
#Friedman test on the separate donors
#split the group in separate donors
#do Friedman test for geneset - class - regions
# get the data per patient
data_donor_1 = b.loc[b['external_donor_name_order'] == 1]
data_donor_2 = b.loc[b['external_donor_name_order'] == 2]
data_donor_3 = b.loc[b['external_donor_name_order'] == 3]
#get_class
#hard code which class to use for statistics: Glutamatergic, GABAergic or Non-neuronal
data_donor_1_df = data_donor_1.loc[data_donor_1['class_label'] == 'Glutamatergic']
data_donor_2_df = data_donor_2.loc[data_donor_2['class_label'] == 'Glutamatergic']
data_donor_3_df = data_donor_3.loc[data_donor_3['class_label'] == 'Glutamatergic']

#%%do statistics, HERE SELECT WHICH DATAFRAME YOU WANT TO USE DONOR DATA OR COMBINED DATA
df = data_donor_1_df
#import statistical packages
import scikit_posthocs as sp
import pingouin as pg
#perform Kruskal-Wallis and Dunn tests across regions
krus = pg.kruskal(data=df, dv='avg', between='region', detailed=True)
dunn = pg.pairwise_tests(dv='avg', between='region', data=df, padjust='holm', parametric=False)
#save the statistics as csv
krus.to_csv(r'path/kruskal_result.csv')
dunn.to_csv(r'path/dunn_result.csv')
#perform Friedman test
# BUG FIX: pingouin.friedman takes `within` (the repeated factor) and
# `subject`, not `between`, and this dataset has no 'donor' column — the donor
# id lives in 'external_donor_name_order'.
# NOTE(review): Friedman needs several subjects; run it on the combined
# (all-donor) data, not on a single-donor frame such as data_donor_1_df.
fried = pg.friedman(data=df, dv='avg', within='region', subject='external_donor_name_order')
dunn = pg.pairwise_tests(dv='avg', between='region', data=df, padjust='holm', parametric=False)
#save the statistics as csv
fried.to_csv(r'path/friedman_result.csv')
dunn.to_csv(r'path/dunn_fried_result.csv')

0 comments on commit 2dfd8d3

Please sign in to comment.