**Name: Rabin BK <br/>
Matriculation number: 23272000**

# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy

# Step 1: Load and filter data

In [2]:
gene_exp_df = pd.read_csv('TCGA-BRCA.htseq_fpkm.tsv',
                            sep='\t')
phenotype_df = pd.read_csv('TCGA-BRCA.GDC_phenotype.tsv',
                            sep='\t', index_col=0)

Since we want to map the `gene_exp_df` with sample types in `phenotype_df` frame we transpose the `gene_exp_df` dataframe. In order to do that,
- Firstly we transpose the `gene_exp_df` and reset the index. This will transpose the data frame but will have integers as index and column headers
- So, w re-assign the columns with values in the 0 integer location using `.iloc[]`
- Then drop the 0th axis
- Set `Enembl_ID` as our index 

In [3]:
# transposing the column and reseting the index
gene_exp_df = gene_exp_df.T.reset_index()

# Rename the columns
gene_exp_df.columns = gene_exp_df.iloc[0]
gene_exp_df = gene_exp_df.drop(0)
gene_exp_df = gene_exp_df.set_index('Ensembl_ID')

In [4]:
gene_exp_df.head()

Unnamed: 0_level_0,ENSG00000242268.2,ENSG00000270112.3,ENSG00000167578.15,ENSG00000273842.1,ENSG00000078237.5,ENSG00000146083.10,ENSG00000225275.4,ENSG00000158486.12,ENSG00000198242.12,ENSG00000259883.1,...,ENSG00000238244.3,ENSG00000186115.11,ENSG00000216352.1,ENSG00000267117.1,ENSG00000273233.1,ENSG00000105063.17,ENSG00000231119.2,ENSG00000280861.1,ENSG00000123685.7,ENSG00000181518.3
Ensembl_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-E9-A1NI-01A,0.091708,0.019573,2.235898,0.0,2.321945,3.620056,0.0,0.337087,7.705589,0.084661,...,0.0,0.073008,0.0,0.0,0.0,3.680055,0.28564,0.0,0.599579,0.0
TCGA-A1-A0SP-01A,0.0,0.004701,1.863334,0.0,4.226699,3.546117,0.0,0.016016,6.835508,0.0,...,0.0,0.0,0.0,0.105328,0.055477,3.969785,0.115149,0.0,1.382192,0.0
TCGA-BH-A1EU-11A,0.057899,0.016302,1.704753,0.0,1.975755,3.396943,0.0,0.041455,7.12531,0.461624,...,0.0,0.039503,0.0,0.092108,0.0,3.011921,0.384451,0.0,0.629043,0.0
TCGA-A8-A06X-01A,0.0,0.0,1.947481,0.0,2.808757,4.72327,0.0,0.002361,7.259318,0.088912,...,0.0,0.118749,0.0,0.0,0.0,4.059347,0.345883,0.0,0.396315,0.0
TCGA-E2-A14T-01A,0.0,0.0,2.73469,0.0,1.964479,3.770091,0.0,0.111647,7.643035,0.066036,...,0.0,0.0,0.0,0.113546,0.0,4.249147,0.065679,0.0,0.157504,0.0


We will read the primary tumor samples and healthy tissue samples from the `phenotype_df` 

In [5]:
# getting primary tumor samples and healthy tissue samples

primary_tumor_samples = phenotype_df[phenotype_df['sample_type.samples'] == 'Primary Tumor']
healthy_tissue_samples = phenotype_df[~(phenotype_df['sample_type.samples'] == 'Primary Tumor')]

First we'll find all the common samples in the `primary_tumor_samples` and `gene_exp_df` (both of these DataFrames have sample ID as index).<br>
Then will change that into a list. Then we will filter out all group of rows and columns that matches this list from the `gene_exp_df`

In [6]:
# Get the intersection of indices between primary_tumor_samples.index and gene_exp_df.index
common_samples = set(primary_tumor_samples.index).intersection(gene_exp_df.index)

# list of all common samples
common_samples_list = list(common_samples)

# Filter out the common indices from gene_exp_df
primary_tumor_df = gene_exp_df.loc[common_samples_list]
primary_tumor_df.head()

Unnamed: 0_level_0,ENSG00000242268.2,ENSG00000270112.3,ENSG00000167578.15,ENSG00000273842.1,ENSG00000078237.5,ENSG00000146083.10,ENSG00000225275.4,ENSG00000158486.12,ENSG00000198242.12,ENSG00000259883.1,...,ENSG00000238244.3,ENSG00000186115.11,ENSG00000216352.1,ENSG00000267117.1,ENSG00000273233.1,ENSG00000105063.17,ENSG00000231119.2,ENSG00000280861.1,ENSG00000123685.7,ENSG00000181518.3
Ensembl_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-E9-A2JT-01A,0.071484,0.010133,2.658384,0.0,2.109925,4.745907,0.0,0.064615,7.778855,0.129028,...,0.0,0.0,0.0,0.0,0.0,5.13629,0.355149,0.0,1.035771,0.0
TCGA-A2-A3XZ-01A,0.0,0.0,2.393297,0.0,2.608379,4.173291,0.0,0.026437,7.371801,0.11694,...,0.0,0.0,0.0,0.600884,0.106221,4.67755,0.088107,0.0,1.221382,0.0
TCGA-AN-A0AS-01A,0.0,0.0,1.321787,0.0,1.973319,3.390694,0.0,0.02331,7.162393,0.068497,...,0.0,0.008581,0.0,0.0,0.178879,3.71792,0.142241,0.0,0.809907,0.0
TCGA-A8-A08C-01A,0.0,0.0,1.620849,0.0,1.720361,4.288279,0.0,0.006328,7.713296,0.0,...,0.0,0.00753,0.0,0.052809,0.0,3.572521,0.273864,0.0,0.485406,0.0
TCGA-D8-A27E-01A,0.0,0.0,2.011465,0.0,1.824982,3.485797,0.0,0.007545,7.427858,0.183529,...,0.0,0.005991,0.0,0.083102,0.043611,3.823495,0.76642,0.0,0.529424,0.0


We perform similar steps for healthy tissue samples as well.

In [7]:
# Get the intersection of indices between healthy_tissue_samples.index and gene_exp_df.index
common_tissue_samples = set(healthy_tissue_samples.index).intersection(gene_exp_df.index)

# list of all common tissue samples
common_tissues_list = list(common_tissue_samples)

# Filter out the common indices from gene_exp_df
healthy_tissue_df = gene_exp_df.loc[common_tissues_list]
healthy_tissue_df.head()

Unnamed: 0_level_0,ENSG00000242268.2,ENSG00000270112.3,ENSG00000167578.15,ENSG00000273842.1,ENSG00000078237.5,ENSG00000146083.10,ENSG00000225275.4,ENSG00000158486.12,ENSG00000198242.12,ENSG00000259883.1,...,ENSG00000238244.3,ENSG00000186115.11,ENSG00000216352.1,ENSG00000267117.1,ENSG00000273233.1,ENSG00000105063.17,ENSG00000231119.2,ENSG00000280861.1,ENSG00000123685.7,ENSG00000181518.3
Ensembl_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-AC-A2FF-11A,0.026274,0.010995,1.876315,0.0,2.39278,3.469784,0.0,0.050692,7.848308,0.225574,...,0.0,0.02965,0.0,0.122694,0.043527,3.543504,0.357622,0.0,0.589776,0.0
TCGA-BH-A0DT-11A,0.234169,0.0,1.928209,0.0,2.024599,3.422728,0.088664,0.090204,7.209058,0.354119,...,0.0,0.140649,0.0,0.112452,0.059299,3.348868,0.329875,0.0,0.487383,0.0
TCGA-BH-A1FD-11B,0.797153,0.008413,1.466558,0.0,1.701458,3.086214,0.0,0.015787,7.310601,0.411371,...,0.0,0.006855,0.0,0.094729,0.0,2.827587,0.15809,0.0,0.476761,0.0
TCGA-BH-A1EV-11A,0.516086,0.009534,1.309972,0.0,2.055704,3.162926,0.0,0.153986,7.328111,0.511347,...,0.0,0.0,0.0,0.140881,0.0378,2.751942,0.329202,0.0,0.678316,0.0
TCGA-E9-A1RI-11A,0.859543,0.00407,1.101428,0.0,2.154707,2.081888,0.0,0.019387,6.711886,0.378523,...,0.0,0.0,0.0,0.046535,0.0,2.17782,0.213915,0.0,0.213952,0.0


# Step 2: Identify differentially expressed genes

- Find genes that are differentially expressed between primary tumor and healthy tissue
samples using the Mann-Whitney U test provided by SciPy:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html
- Correct for multiple testing by using an appropriate function provided by the statsmodels
package: https://www.statsmodels.org/stable/index.html.
- For each gene, compute the log2 fold change between primary tumor and healthy tissue
samples.
- Generate a pandas DataFrame with the results which, for each gene, contains the
P-value (from the Mann-Whitney U test), the adjusted P-value (from the multiple testing
correction) and the log2 fold change.
- Write the results to a csv file.

In [8]:
from scipy.stats import mannwhitneyu