## Read the one-hot encoded dataset

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

In [None]:
import pyspark.sql.functions as F
import os

# Root folder for the input CSV and the output files. Paths are built from it
# by plain string concatenation, so the trailing slash is required.
data_folder = '/mnt/2024-team1/'


In [None]:
# read the one hot encoded file
from pyspark.sql.types import *

one_hot_path = f"{data_folder}JanBDRcount_transpose_onehot.csv"

# The first CSV line supplies the column names. No schema is inferred here,
# so the columns still need an explicit cast before numeric use.
df = spark.read.option("header", True).csv(one_hot_path)

In [None]:
# Preview the freshly loaded DataFrame in the notebook.
display(df)


In [None]:
# Show the column names and types as read from the CSV (before any cast).
df.printSchema()

In [None]:
# Number of rows in the loaded file; this is a Spark action, so it runs a job.
df.count()

## Cast String to int

In [None]:
# Keep the two leading identifier columns ("column", "index") unchanged and
# cast every remaining column to integer.
# NOTE(review): assumes "column" and "index" are the first two columns - confirm.
int_casted = [F.col(name).cast("int") for name in df.columns[2:]]
df = df.select(F.col("column"), F.col("index"), *int_casted)

In [None]:
# Check the schema again to confirm the cast was applied.
df.printSchema()

## Chi2 test

### Take out the target and broadcast it to all partitions
- The target is the "PHENOTYPE" feature, i.e. the row whose `column` field equals "PHENOTYPE"

In [None]:
# Everything except the PHENOTYPE row is a feature to be tested.
df_no_target = df.where("column != 'PHENOTYPE'")

# Sanity check (should return 0):
# df_no_target.filter('column = "PHENOTYPE"').count()

In [None]:
# Isolate the PHENOTYPE row so it can be collected to the driver.
df_target = df.where('column = "PHENOTYPE"')

display(df_target)

In [None]:
# Collect the target row(s) to the driver, take the first one, drop its two
# leading identifier fields and convert the remaining values to integers.
target_np = np.asarray(df_target.collect())[0][2:].astype(int)

In [None]:
# Broadcast the target vector so code running on the executors can read it
# through target_bc.value.
target_bc = sc.broadcast(target_np)
print(target_bc.value)

### Perform Chi2 test

In [None]:
# Column names of the feature DataFrame; used as column labels when its rows
# are loaded into pandas DataFrames.
df_cols = df_no_target.columns
from sklearn.feature_selection import chi2

def do_chi2_test(partition_iter, target=None):
    """Run a chi-squared independence test for every row of one partition.

    Each row is a sequence whose first two fields identify the feature and
    whose remaining fields are the feature's values, aligned position by
    position with the target vector.

    Args:
        partition_iter: Iterable of rows (tuples or pyspark Rows), as passed
            by ``RDD.mapPartitions``.
        target: Optional array-like of target values. When omitted, the
            broadcast variable ``target_bc`` is used.

    Returns:
        A list of ``(feature, pvalue)`` tuples, where ``feature`` is
        ``"<field0>_<field1>"`` and ``pvalue`` is a Python float. An empty
        partition yields an empty list.
    """
    if target is None:
        target = target_bc.value
    # pd.crosstab treats a plain list/tuple as "several keys", so make sure
    # the target is a single array.
    target = np.asarray(target)

    results = []
    for row in partition_iter:
        # Index the row tuple directly: positional access on a
        # string-labelled pandas Series is deprecated, and DataFrame.apply
        # on an empty partition does not return a Series at all.
        feature = f"{row[0]}_{row[1]}"
        observed = pd.crosstab(pd.Series(list(row[2:])), target)
        p_value = chi2_contingency(observed)[1]
        results.append((feature, float(p_value)))
    return results

In [None]:
# Switch to the RDD API (needed for mapPartitions) and mark the rows for
# caching, since more than one action below reads this RDD.
df_no_target_rdd = df_no_target.rdd.cache()

In [None]:
# Apply do_chi2_test to each partition. This is a lazy transformation:
# nothing is computed until an action is called on pvalues_rdd.
pvalues_rdd = df_no_target_rdd.mapPartitions(do_chi2_test)

In [None]:
# Recorded runtime: about 8 min.

# Smoke test: take(1) forces the computation of at least one result, which
# confirms that do_chi2_test runs on the executors.

pvalues_rdd.take(1)

In [None]:
from pyspark.sql.types import *

# Result schema: one (feature, pvalue) row per tested feature.
# DoubleType keeps the full 64-bit precision of the Python floats produced by
# do_chi2_test; a 32-bit FloatType would round them and turn very small
# p-values (below roughly 1e-45) into 0.0.
schema = StructType([
    StructField("feature", StringType(), True),
    StructField("pvalue", DoubleType(), True),
])

# pvalues_rdd is lazy and uncached, so without cache() every later action on
# this DataFrame (display, take, collect) would re-run the chi-squared tests.
pvalues_df = spark.createDataFrame(pvalues_rdd, schema).cache()


In [None]:
# Preview the (feature, pvalue) results in the notebook.
display(pvalues_df)

In [None]:
# Keep only the features whose p-value is below the 0.01 threshold.
pvalues_df_filtered_001 = pvalues_df.where(pvalues_df["pvalue"] < 0.01)

pvalues_df_filtered_001.take(5)

In [None]:
# Recorded runtime: about 25 min.
# Bring every filtered (feature, pvalue) row to the driver as a Python list.
p_values = pvalues_df_filtered_001.collect()

In [None]:
# Inspect the collected rows.
p_values

In [None]:
# collect() returns pyspark Row objects, which are tuples and have no
# .items(); build the feature -> p-value mapping from the named fields.
p_values_dict = {row["feature"]: row["pvalue"] for row in p_values}

# Order the features by ascending p-value (most significant first).
sorted_p_value = dict(sorted(p_values_dict.items(), key=lambda item: item[1]))
sorted_p_value

In [None]:
# Driver-side re-check of the threshold: keep only entries whose p-value is
# strictly below 0.01.
p_value_001 = dict(
    entry for entry in p_values_dict.items() if entry[1] < 0.01
)
p_value_001

In [None]:
# Number of features that passed the 0.01 threshold.
len(p_value_001)

## Write p-values to file for future use

In [None]:
# Turn the feature -> p-value mapping into a two-column Spark DataFrame. No
# column names are given, so pandas assigns its default labels 0 and 1.
df_p_value_001 = spark.createDataFrame(pd.DataFrame(p_value_001.items()))

# Replace any previous output at the same location.
df_p_value_001.write.mode("overwrite").json(data_folder + "p_value_001.json")

In [None]:
# Show the selected features and their p-values as a two-column table.
display(pd.DataFrame(p_value_001.items()))

In [None]:
import seaborn as sns

# Bar chart of the selected p-values: column 0 holds the feature names and
# column 1 the p-values.
bar = sns.barplot(data=pd.DataFrame(p_value_001.items()), x=0, y=1)

# Replace the feature names on the x axis with running indices.
bar.set(xticklabels=list(range(len(p_value_001))))

### (Additional) Investigating the chi2 test locally
- This part walks through, on a small local sample, how the chi2 test is performed on the transposed data

In [None]:
# Pull two feature rows to the driver for local experimentation.
df_2_only = df_no_target_rdd.take(2)

In [None]:
# Load the sampled rows into a pandas DataFrame labelled with the Spark
# column names.
partition_df = pd.DataFrame(df_2_only, columns=df_cols)
partition_df

In [None]:
# Distinct values present in the target vector.
np.unique(target_bc.value)

In [None]:
# Contingency table of the first sampled feature's values (everything after
# its two identifier fields) against the target values.
pd.crosstab(partition_df.iloc[0, 2:], target_bc.value)  # index, column

In [None]:
from scipy.stats import chi2_contingency

def _pvalue_entry(x):
    """Return {"<field0>_<field1>": p-value} for one feature row."""
    contingency = pd.crosstab(x[2:], target_bc.value)
    return {f"{x[0]}_{x[1]}": chi2_contingency(contingency)[1]}


# Row-wise application of the test on the local sample.
partition_df.apply(_pvalue_entry, axis=1).tolist()