In [37]:
import pandas as pd
from scipy.stats import zscore
import seaborn as sns
from scipy.cluster.hierarchy import fcluster
import numpy as np

In [38]:
# Sample column names: four replicates for each of the four cell states.
header = [
    'Naive 1', 'Naive 2', 'Naive 3', 'Naive 4',
    'Early Pre-TFH 1', 'Early Pre-TFH 2', 'Early Pre-TFH 3', 'Early Pre-TFH 4',
    'Late Pre-TFH 1', 'Late Pre-TFH 2', 'Late Pre-TFH 3', 'Late Pre-TFH 4',
    'GC 1', 'GC 2', 'GC 3', 'GC 4',
]

# skiprows=[0] discards the file's first line and `header` supplies the column
# names instead. The gene label ends up in the index (presumably because each
# row has one more field than `header` has names -- confirm against the file),
# so it is moved back into a regular column and called "Gene".
rna_seq = (
    pd.read_csv(
        '/ix/djishnu/Alisa/Tfh/Run_313.atac-seq-correl.rsem.pe.hg38.ensembl.counts.matrix.results.normalized_data_matrix',
        sep='\t',
        skiprows=[0],
        names=header,
    )
    .reset_index()
    .rename(columns={'index': "Gene"})
)

# Split the label at the first underscore only (n=1): the part before it goes
# to Gene_ID and everything after it to Genes, so later underscores are kept.
gene_parts = rna_seq.Gene.str.split("_", n=1, expand=True)
rna_seq[['Gene_ID', 'Genes']] = gene_parts

In [39]:
# Per-state summary: mean of the four replicate columns plus a tiny pseudocount,
# so the ratio and log2 taken further down never see an exact zero.
# NOTE(review): the summary columns carry an "ATAC_" prefix although they are
# computed from `rna_seq` -- confirm the naming is intentional.
replicates_by_state = {
    'ATAC_Naive': ['Naive 1', 'Naive 2', 'Naive 3', 'Naive 4'],
    'ATAC_Early_Pre-TFH': ['Early Pre-TFH 1', 'Early Pre-TFH 2', 'Early Pre-TFH 3', 'Early Pre-TFH 4'],
    'ATAC_Late_Pre-TFH': ['Late Pre-TFH 1', 'Late Pre-TFH 2', 'Late Pre-TFH 3', 'Late Pre-TFH 4'],
    'ATAC_GC': ['GC 1', 'GC 2', 'GC 3', 'GC 4'],
}
for mean_column, replicate_columns in replicates_by_state.items():
    rna_seq[mean_column] = rna_seq[replicate_columns].mean(axis=1) + 0.000001

# Keep only the gene symbol and the four per-state means.
rna_seq_mean = rna_seq[['Genes', *replicates_by_state]]

In [40]:
# Move the gene symbol into the index (the column itself is dropped), leaving
# only the numeric per-state columns. Re-running this cell on its own fails,
# because 'Genes' is no longer a column afterwards.
rna_seq_mean = rna_seq_mean.set_index(keys='Genes', drop=True)

In [41]:
# log2 fold change of GC relative to Early Pre-TFH: positive means higher in GC.
# Rows whose two means are both just the pseudocount give a ratio of 1 -> 0.0.
gc_over_early = rna_seq_mean["ATAC_GC"] / rna_seq_mean["ATAC_Early_Pre-TFH"]
rna_seq_mean['LogChange'] = np.log2(gc_over_early)

# Bring the gene symbol back as a regular 'Genes' column.
rna_seq_mean = rna_seq_mean.reset_index()

# No expression filter is applied at this point; the 'total' column and the
# low-expression threshold are only computed in a later cell.
rna_seq_sign_change = rna_seq_mean[['Genes', 'LogChange']]
print(rna_seq_sign_change)

               Genes  LogChange
0             TSPAN6 -17.554026
1               TNMD   0.000000
2               DPM1  -0.131166
3              SCYL3   2.485725
4           C1orf112  -2.487404
...              ...        ...
60670  CTD-2060L22.1 -16.784421
60671   RP11-107E5.4   0.000000
60672          HYMAI  20.806715
60673     RARRES2P11   0.000000
60674   RP11-299P2.2   0.000000

[60675 rows x 2 columns]


In [42]:
# Sanity-check the sign convention of LogChange on a single gene (BCL6):
# print its per-state means next to its LogChange.
is_bcl6 = rna_seq_mean['Genes'] == 'BCL6'
print(rna_seq_mean.loc[is_bcl6])

     Genes  ATAC_Naive  ATAC_Early_Pre-TFH  ATAC_Late_Pre-TFH     ATAC_GC  \
4239  BCL6   66.066535           91.347667         358.027455  403.335255   

      LogChange  
4239    2.14254  


In [None]:
# Export the per-gene log2 fold change table; the default index=True also
# writes the row index as an unnamed first column.
rna_seq_sign_change.to_csv(
    path_or_buf='/ix3/djishnu/Alisa/Tfh/correct_direction_sign_change_rna_earlyvsGC_log2FC.csv'
)

In [43]:
#Cap the LogFC values at abs(2):
# Series.clip bounds every value to [-2, 2] in one vectorized pass and keeps
# NaN as NaN, so the result always has exactly one entry per row. An
# element-wise chain of range checks matches none of its branches for NaN and
# would yield fewer values than rows, making the column assignment fail.
log_change_thresh = rna_seq_sign_change['LogChange'].clip(lower=-2, upper=2)

# .assign returns a new frame instead of writing a column into
# rna_seq_sign_change in place; that frame is a column selection of
# rna_seq_mean, and in-place writes to it raise SettingWithCopyWarning.
rna_seq_sign_change = rna_seq_sign_change.assign(LogChange_Thresh=log_change_thresh)



In [None]:
# Export the fold-change table again, now including the capped
# LogChange_Thresh column; the row index is written as an unnamed first column.
rna_seq_sign_change.to_csv(
    path_or_buf='/ix3/djishnu/Alisa/Tfh/correct_direction_sign_change_rna_earlyvsGC_Thresh_log2FC.csv'
)

In [None]:
#Remove the lowest 10%:
# Average only the four per-state mean columns. At this point rna_seq_mean also
# holds the string 'Genes' column and the signed 'LogChange' column; a row-wise
# mean over the whole frame either raises on the strings (pandas >= 2.0) or
# silently folds LogChange into the expression level.
state_columns = ['ATAC_Naive', 'ATAC_Early_Pre-TFH', 'ATAC_Late_Pre-TFH', 'ATAC_GC']
rna_seq_mean["total"] = rna_seq_mean[state_columns].mean(axis=1)
threshold = rna_seq_mean['total'].quantile(0.10)

# Strict '>' also drops every gene tied with the 10th-percentile value, so more
# than 10% of rows can be removed when many genes share the minimum.
# .copy() makes filtered_df an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning.
filtered_df = rna_seq_mean[rna_seq_mean['total'] > threshold].copy()

In [None]:
# Function to map column names to scores
def map_to_scores(column_name):
    """Return the integer score assigned to a per-state mean column name.

    'ATAC_Naive' -> 0, 'ATAC_Early_Pre-TFH' -> 1, 'ATAC_Late_Pre-TFH' -> 2,
    'ATAC_GC' -> 3. Any other name raises KeyError.
    """
    ordered_states = ('ATAC_Naive', 'ATAC_Early_Pre-TFH', 'ATAC_Late_Pre-TFH', 'ATAC_GC')
    score_by_column = {name: score for score, name in enumerate(ordered_states)}
    return score_by_column[column_name]

# Find, per gene, which of the four per-state mean columns is largest.
# The arg-max is restricted to those columns: filtered_df also carries 'Genes'
# (strings), 'LogChange' and 'total', and letting them take part either raises
# on the mixed dtypes or returns a column name that map_to_scores does not
# know (KeyError), e.g. when LogChange exceeds every state mean.
state_columns = ['ATAC_Naive', 'ATAC_Early_Pre-TFH', 'ATAC_Late_Pre-TFH', 'ATAC_GC']
max_columns = filtered_df[state_columns].idxmax(axis=1)

# Map the column names to scores, and make a state specific label:
# .assign builds a new frame rather than writing into filtered_df in place,
# which avoids SettingWithCopyWarning when filtered_df is a filtered view.
filtered_df = filtered_df.assign(State=max_columns.apply(map_to_scores))
filtered_df.to_csv('/ix/djishnu/Alisa/Tfh/ForPaper/rna_state_specific.csv')

print(filtered_df)