## Read one-hot CSV

In [0]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

In [0]:
import pyspark.sql.functions as F
import os

# File name of the transposed count table (the variant without the
# "_onehot" suffix).
csv_data = 'JanBDRcount_transpose.csv'

# Folder that holds the input files.
data_folder = '/mnt/2024-team1/'

# Full location of that table. The bare name on the last line makes the
# notebook echo the value.
raw_path = os.path.join(data_folder, csv_data)
raw_path

In [0]:
# read the one hot encoded file
from pyspark.sql.types import *

# Build the path with os.path.join (as is done for raw_path) so the result
# does not depend on data_folder ending with a path separator.
one_hot_path = os.path.join(data_folder, "JanBDRcount_transpose_onehot.csv")

# header=True takes the column names from the first line. No schema is
# supplied, so the reader returns every column as a string.
df = spark.read.csv(one_hot_path, header=True)

In [0]:
# Show the freshly loaded DataFrame in the notebook output for a visual check.
display(df)


## String to int

In [0]:
# Keep the two identifier columns untouched and cast every column from the
# third one onwards to an integer; the result is marked for caching because
# it is reused by the cells that follow.
df = df.select(
    *(F.col(name) for name in ("column", "index")),
    *(F.col(name).cast(IntegerType()) for name in df.columns[2:]),
).cache()

In [0]:
# count() is an action: it forces the select/cast above to execute (which
# also populates the cache requested there) and reports the number of rows.
df.count()

In [0]:
# Number of partitions of the DataFrame's underlying RDD.
df.rdd.getNumPartitions()

## Size-up measure decorator

In [0]:
def measure_size_up(func):
  """Decorate ``func`` so it is timed on growing random samples of a DataFrame.

  The decorated callable has the signature
  ``inner(df, split_count=10, num_of_partition=None)``:

  * ``df``: DataFrame to sample from.
  * ``split_count``: number of measurements; measurement ``i`` (1-based)
    runs ``func`` on ``df.sample(i / split_count)``.
  * ``num_of_partition``: partition count each sample should have. ``None``
    (the default) keeps whatever partitioning the sample already has.

  It returns a list with one elapsed time in seconds per measurement. The
  value returned by ``func`` itself is discarded.

  Args:
    func: Callable that takes a single DataFrame argument.

  Returns:
    The timing wrapper described above.
  """
  import functools
  import time

  # wraps() keeps func's __name__/__doc__ on the wrapper, so the decorated
  # function is still identifiable in logs and tracebacks.
  @functools.wraps(func)
  def inner(df, split_count=10, num_of_partition=None):
    current_partitions = df.rdd.getNumPartitions()
    if num_of_partition is None:
      num_of_partition = current_partitions

    # One elapsed time per sample size.
    size_up_li = []

    for i in range(1, split_count + 1):
      df_small = df.sample(i / split_count)

      # Only touch the partitioning when the caller asked for a count that
      # differs from the input's; the default path is left as-is.
      if num_of_partition < current_partitions:
        df_small = df_small.coalesce(num_of_partition)
      elif num_of_partition > current_partitions:
        df_small = df_small.repartition(num_of_partition)

      # Report the actual sample size and partition count for this step.
      print(df_small.count())
      print(df_small.rdd.getNumPartitions())

      # perf_counter is monotonic, so the interval cannot be distorted by
      # wall-clock adjustments.
      start_time = time.perf_counter()
      func(df_small)
      size_up_li.append(time.perf_counter() - start_time)

    return size_up_li

  return inner


## Speed-up measure decorator

In [0]:
def measure_speed_up(func):
    """Decorate ``func`` so it is timed over a range of partition counts.

    The decorated callable has the signature
    ``inner(df, start_num_of_partition=5, target_num_of_partition=16)``.
    For every partition count ``i`` in the inclusive range it re-partitions
    ``df`` to ``i`` partitions, runs ``func`` on the result and records the
    elapsed time. The defaults reproduce the previously hard-coded 5..16.

    It returns a list with one elapsed time in seconds per partition count.
    The value returned by ``func`` itself is discarded.

    Args:
        func: Callable that takes a single DataFrame argument.

    Returns:
        The timing wrapper described above.
    """
    import functools
    import time

    # wraps() keeps func's __name__/__doc__ on the wrapper.
    @functools.wraps(func)
    def inner(df, start_num_of_partition=5, target_num_of_partition=16):
        num_of_partition = df.rdd.getNumPartitions()
        print("total number of partition: ", num_of_partition)

        # One elapsed time per partition count.
        speed_up_li = []

        for i in range(start_num_of_partition, target_num_of_partition + 1):
            # coalesce can only lower the partition count; growing it
            # requires repartition.
            if i <= num_of_partition:
                df_small_partition = df.coalesce(i)
            else:
                df_small_partition = df.repartition(i)

            # Confirm the row count is unchanged and the partition count
            # is the one requested.
            print(df_small_partition.count())
            print(df_small_partition.rdd.getNumPartitions())

            # Take a single reading so the printed value is exactly the
            # one that gets recorded.
            start_time = time.perf_counter()
            func(df_small_partition)
            elapsed = time.perf_counter() - start_time

            speed_up_li.append(elapsed)
            print(elapsed)

        return speed_up_li

    return inner

## Measure Chi2 Test

### Broadcast target

In [0]:
# Feature rows: every row whose "column" value is not PHENOTYPE.
df_no_target = df.filter("column != 'PHENOTYPE'")

# Target row. The literal is single-quoted, as in the filter above, so it is
# always parsed as a SQL string rather than possibly as a quoted identifier.
df_target = df.filter("column = 'PHENOTYPE'")

# Only one row is needed, so fetch just that one instead of collecting every
# match, and fail with a clear message when it is missing.
target_row = df_target.first()
if target_row is None:
  raise ValueError("no row with column = 'PHENOTYPE' found in df")

# Positions 0 and 1 are the "column"/"index" identifiers; the remaining
# positions are the values that every feature row is tested against.
target_np = np.asarray(target_row[2:]).astype(int)

# Broadcast the target so each executor receives a single read-only copy.
target_bc = sc.broadcast(target_np)
print(target_bc.value)

### Size-up

In [0]:
@measure_size_up
def chi2_feature_selection(df_no_target):
  """Compute a chi-square p-value for every feature row against the target.

  Each row is expected to carry two identifier values followed by the
  feature values. The feature values are cross-tabulated against the
  broadcast target (``target_bc``) and the p-value of
  ``chi2_contingency`` on that table is kept.

  Because of the ``measure_size_up`` decorator, calling the module-level
  name runs this body once per sample and yields timings; the list
  returned here is what a single undecorated run produces.

  Args:
    df_no_target: DataFrame of feature rows (target row excluded).

  Returns:
    A list of single-entry dicts ``{"<column>_<index>": p_value}``, one
    per row.
  """

  def do_chi2_test(partition_iter):
    # Rows are handled one at a time, so an empty partition simply yields
    # nothing and no per-partition pandas frame has to be built.
    target = target_bc.value
    for row in partition_iter:
      contingency = pd.crosstab(np.asarray(row[2:]), target)
      yield {f"{row[0]}_{row[1]}": chi2_contingency(contingency)[1]}

  df_no_target_rdd = df_no_target.rdd.cache()
  try:
    return df_no_target_rdd.mapPartitions(do_chi2_test).collect()
  finally:
    # Release the cached blocks; otherwise every timed run would leave its
    # RDD pinned in executor memory.
    df_no_target_rdd.unpersist()


In [0]:
# Run the size-up benchmark. The decorated function returns one elapsed time
# (in seconds) per sample fraction; the bare name echoes the list.
size_up_li = chi2_feature_selection(df_no_target)
size_up_li

In [0]:
# Hard-coded size-up timings in seconds, one per sample fraction.
# NOTE(review): presumably pasted from an earlier run so the plot can be
# redrawn without repeating the benchmark; confirm.
size_up_li = [
    96.6624825000763,
    153.14934420585632,
    165.44122505187988,
    226.1877691745758,
    284.008672952652,
    290.5138041973114,
    404.09003734588623,
    572.6911044120789,
    395.5881383419037,
    637.9775326251984,
]

import matplotlib.pyplot as plt

# Measurement i (1-based) used the fraction i / len(size_up_li) of the data.
# Plot it as a percentage so the axis values agree with the axis label, and
# derive the step from the list length instead of assuming ten steps.
n_steps = len(size_up_li)
percent_of_data = [100 * step / n_steps for step in range(1, n_steps + 1)]

plt.plot(percent_of_data, size_up_li)
plt.xlabel("percent of data")
plt.ylabel("Run Time (sec)")
plt.show()

### Speed-up

In [0]:
@measure_speed_up
def chi2_feature_selection(df_no_target):
  """Compute a chi-square p-value for every feature row against the target.

  Each row is expected to carry two identifier values followed by the
  feature values. The feature values are cross-tabulated against the
  broadcast target (``target_bc``) and the p-value of
  ``chi2_contingency`` on that table is kept.

  Because of the ``measure_speed_up`` decorator, calling the module-level
  name runs this body once per partition count and yields timings; the
  list returned here is what a single undecorated run produces. The
  original author noted roughly 25 minutes for the full dataset.

  Args:
    df_no_target: DataFrame of feature rows (target row excluded).

  Returns:
    A list of single-entry dicts ``{"<column>_<index>": p_value}``, one
    per row.
  """

  def do_chi2_test(partition_iter):
    # Rows are handled one at a time, so an empty partition simply yields
    # nothing and no per-partition pandas frame has to be built.
    target = target_bc.value
    for row in partition_iter:
      contingency = pd.crosstab(np.asarray(row[2:]), target)
      yield {f"{row[0]}_{row[1]}": chi2_contingency(contingency)[1]}

  df_no_target_rdd = df_no_target.rdd.cache()
  try:
    return df_no_target_rdd.mapPartitions(do_chi2_test).collect()
  finally:
    # Release the cached blocks; otherwise every timed run would leave its
    # RDD pinned in executor memory.
    df_no_target_rdd.unpersist()


In [0]:
# Run the speed-up benchmark; the decorated function returns one elapsed time
# (in seconds) per partition count tried.
speed_up_li = chi2_feature_selection(df_no_target)

In [0]:
# Echo the recorded timings in the notebook output.
speed_up_li

In [0]:
import matplotlib.pyplot as plt

# Hard-coded speed-up timings in seconds.
# NOTE(review): presumably pasted from an earlier run; confirm.
speed_up_li = [
    1509.6186542510986,
    1596.1182825565338,
    1595.4099326133728,
    1959.622414112091,
    1091.4868171215057,
    1521.2955095767975,
    1244.9953787326813,
    1147.015658378601,
    1411.430251121521,
    1183.6906626224518,
    1299.2799725532532,
    1130.5113909244537,
]

# NOTE(review): the x values start at 3, while the measure_speed_up loop in
# this notebook iterates partition counts 5..16, and the axis label says
# "Clusters" although the benchmark varies partitions. Confirm which run
# these numbers came from before trusting the axis.
fig, ax = plt.subplots()
ax.plot(range(3, 3 + len(speed_up_li)), speed_up_li)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Run Time (sec)")
plt.show()