# Fit models with transposed data

## Read one-hot dataset

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency


In [None]:
import pyspark.sql.functions as F
import os

# Base folder that holds every data file read by this notebook.
data_folder = '/mnt/2024-team1/'

# File name of the transposed CSV and its full path inside the data folder.
csv_data = 'JanBDRcount_transpose.csv'
raw_path = os.path.join(data_folder, csv_data)
# Bare expression: echoes the path as the cell output.
raw_path

In [None]:
# read the one hot encoded file
from pyspark.sql.types import *

# Full path of the one-hot encoded CSV inside the data folder.
one_hot_file = "JanBDRcount_transpose_onehot.csv"
one_hot_path = os.path.join(data_folder, one_hot_file)

# header=True takes column names from the first line; no schema is given,
# so every column is read as a string (see the later cast to int).
csv_reader = spark.read
df = csv_reader.csv(one_hot_path, header=True)

In [None]:
# Render the one-hot DataFrame in the notebook output
# (display is presumably the notebook-provided helper).
display(df)


In [None]:
# Number of rows in the one-hot DataFrame.
df.count()

## Read features with p-value < 0.01

In [None]:
# Load the p-value table stored as JSON in the data folder.
# NOTE(review): the file name suggests it only holds features with p < 0.01 — confirm against the job that wrote it.
df_p_value_001 = spark.read.json(data_folder + "p_value_001.json")

In [None]:
# Order the rows by column "1" (ascending is the default for sort).
# Presumably column "1" is the p-value and column "0" the feature name — confirm against the JSON schema.
df_p_value_001 = df_p_value_001.sort("1")

In [None]:
# Add a "feat" column: column "0" with its last two characters removed.
# NOTE(review): presumably this strips a two-character suffix from the feature name; "feat" is not
# referenced again in the visible cells (later cells select column "0").
df_p_value_001_with_feat = df_p_value_001.withColumn("feat", F.expr("substring(`0`, 1, length(`0`)-2)"))


In [None]:
# Render the p-value table, including the derived "feat" column.
display(df_p_value_001_with_feat)

In [None]:
# NOTE(review): bare literal that is only echoed as the cell output; it is never assigned or used
# in the visible code (the filter in the next cell uses 1e-4). Presumably a candidate threshold — confirm or remove.
5e-8

In [None]:
# Keep the rows whose column "1" value is below 1e-4 and bring their column "0" values to the driver.
# NOTE(review): the cut-off used here is 1e-4, although the variable name and the section heading say 0.01 — confirm which is intended.
below_cutoff = F.col("1") < 1e-4
significant_rows = df_p_value_001_with_feat.filter(below_cutoff)
feat_pvalue_001 = significant_rows.select("0").collect()

# Echo the collected rows as the cell output.
feat_pvalue_001

In [None]:
# Number of rows that passed the p-value cut-off.
len(feat_pvalue_001)

## Cast String to Int

In [None]:
# "column" and "index" are kept unchanged; every column from position 2 onwards is cast
# from string to integer. This assumes "column" and "index" are the first two columns.
key_cols = [F.col("column"), F.col("index")]
int_cols = [F.col(name).cast(IntegerType()) for name in df.columns[2:]]
df = df.select(*key_cols, *int_cols).cache()

In [None]:
# Print the schema to check the column types after the cast.
df.printSchema()

## Combine column and index

In [None]:
# Build a single row identifier "feat_index":
#   - "<column>_<index>" for every row whose "column" value is not "PHENOTYPE"
#   - the literal "PHENOTYPE" otherwise (this also covers rows where "column" is null)
combined_name = F.concat(F.col('column'), F.lit("_"), F.col('index'))
feat_index_col = (
    F.when(F.col('column') != "PHENOTYPE", combined_name)
    .otherwise("PHENOTYPE")
    .alias("feat_index")
)

# Put the identifier first, keep all other columns, then drop the two source columns.
df_with_feat_index = df.select(feat_index_col, "*")
df_combine_feat_index = df_with_feat_index.drop('column', 'index')

In [None]:
# Render the DataFrame keyed by the combined feat_index column.
display(df_combine_feat_index)

## Filter features with p-value

In [None]:
# Unique feature names: the first field of every collected row, de-duplicated through a set.
# The resulting list order is therefore arbitrary.
feat_li = list({row[0] for row in feat_pvalue_001})
feat_li


In [None]:
# Add the label row name to the feature list so the PHENOTYPE row survives the filter below.
# Note: append mutates feat_li in place, so re-running this cell adds another "PHENOTYPE" entry
# (harmless for the isin() membership test).
feat_li.append("PHENOTYPE")

In [None]:
# Keep only the rows whose feat_index appears in feat_li.
df_filtered = df_combine_feat_index.filter(F.col('feat_index').isin(feat_li))

In [None]:
# Render the filtered rows.
display(df_filtered)


In [None]:
# Number of rows kept by the filter (one per matching feat_index value); echoed as the cell output.
num_of_feat = df_filtered.count()
num_of_feat

## Transpose data
- Two methods: one converts to Pandas before transposing; the other transposes within PySpark

In [None]:
"""
Method 1:
Convert PySpark df to Pandas to transpose
"""

import pandas as pd

header_col = 'feat_index'

# Bring the filtered rows to the driver as a pandas frame
# (one row per feat_index value; presumably one column per sample).
partition_df = df_filtered.toPandas()

# Transpose so that the feat_index values become the column names.
# After .T the old column labels form the index; reset_index(drop=True) discards them directly,
# replacing the former reset_index -> rename -> drop round trip with the same result.
partition_df_trans = (partition_df
                        .set_index(header_col)
                        .T
                        .reset_index(drop=True))

# Back to a Spark DataFrame for the modelling cells.
df_trans_back = spark.createDataFrame(partition_df_trans)



In [None]:
# Recode the label to 0/1: rows where PHENOTYPE equals 2 become 0, every other row
# (including rows with a null PHENOTYPE) becomes 1.
# NOTE(review): if the input uses the 1 = control / 2 = case convention, this makes the
# control group the positive class and folds any other code into it — confirm this is intended.
binary_label = F.when(F.col('PHENOTYPE') == 2, 0).otherwise(1)
df_trans_back = df_trans_back.withColumn('PHENOTYPE', binary_label).cache()

In [None]:
# Echo the transposed pandas frame. It still carries the original PHENOTYPE values:
# the 0/1 recoding was applied to the Spark DataFrame df_trans_back only.
partition_df_trans

In [None]:
"""
Method 2:
Transpose within PySpark using groupby and pivot
Remove the comment of following code to run
"""

# header_col = 'feat_index'
# cols_minus_header = df_filtered.columns
# cols_minus_header.remove(header_col)

# spark.conf.set('spark.sql.pivotMaxValues', num_of_feat)

# df_temp = (df_filtered
#        .groupBy()
#        .pivot(header_col)
#        .agg(F.first(F.array(cols_minus_header)))
#        .withColumn(header_col, F.array(*map(F.lit, cols_minus_header)))
#       )

# feat_col = df_temp.columns
# feat_col.remove(header_col)

# df_trans_back = df_temp.select(F.arrays_zip(*feat_col).alias('az')).selectExpr('inline(az)').cache()

In [None]:
# Render the transposed Spark DataFrame that the models are trained on.
display(df_trans_back)

## Fit Model

### Split train test data

In [None]:
# Class balance check: count the rows for each PHENOTYPE value and print the table.
rows_per_label = F.count('PHENOTYPE')
df_trans = df_trans_back.groupBy('PHENOTYPE').agg(rows_per_label)
df_trans.show()

In [None]:
# Seeded random 90/10 hold-out split; randomSplit weights are proportions, so sizes are approximate.
split_weights = [0.9, 0.1]
train_data, test_data = df_trans_back.randomSplit(weights=split_weights, seed=555)

# Report the size of each split.
print(f"Train: {train_data.count()}")
print(f"Test: {test_data.count()}")

In [None]:
# Number of columns in the training split (all features plus the PHENOTYPE label).
len(train_data.columns)

### Cross Validation Global

In [None]:
# Number of cross-validation folds.
fold = 10

# Equal weights give `fold` random pieces of approximately equal size; the seed keeps the split reproducible.
equal_weights = [1.0 for _ in range(fold)]
data_splited_li = df_trans_back.randomSplit(weights=equal_weights, seed=555)

# Echo the number of pieces as the cell output.
len(data_splited_li)

In [None]:
from functools import reduce

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LinearSVC

import numpy as np

# Every column except the label is a model input.
input_features = df_trans_back.columns
input_features.remove("PHENOTYPE")

# The assembler, the estimator and the evaluator do not depend on the fold, so build them once.
assembler = VectorAssembler(inputCols=input_features, outputCol='features')

# Uncomment the estimator to use and pass it to .fit() inside the loop.
# rf = RandomForestClassifier(featuresCol = 'features', labelCol= 'PHENOTYPE', numTrees=100, maxDepth=5, seed=42)
lg = LogisticRegression(featuresCol = 'features', labelCol= 'PHENOTYPE')
# svm = LinearSVC(featuresCol = 'features', labelCol= 'PHENOTYPE')

# Area under ROC has to be computed from the model's continuous scores. The fitted models emit them
# in the "rawPrediction" column; feeding the hard 0/1 "prediction" column instead collapses the ROC
# curve to a single operating point and misreports the AUC.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="PHENOTYPE", rawPredictionCol="rawPrediction")

test_score_li = []
train_score_li = []

for i, test_data in enumerate(data_splited_li):
    # Fold i is held out for testing; all the other folds are unioned into the training set.
    train_data = reduce(
        lambda left, right: left.union(right),
        (part for j, part in enumerate(data_splited_li) if j != i),
    )

    model = lg.fit(assembler.transform(train_data).select('PHENOTYPE','features'))

    # get testing result
    predictions = model.transform(assembler.transform(test_data))

    # get training result
    predictions_train = model.transform(assembler.transform(train_data))

    train_score_li.append(evaluator.evaluate(predictions_train))
    test_score_li.append(evaluator.evaluate(predictions))

print("training score: ", train_score_li, sep='\n')
print("testing score: ", test_score_li, sep='\n')

print("training mean: ", np.mean(train_score_li), " std: ", np.std(train_score_li))
print("testing mean: ", np.mean(test_score_li), " std: ", np.std(test_score_li))


### Cross Validation Local

In [None]:
from pyspark.ml.feature import VectorAssembler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Number of partitions the training data is coalesced into. One model is trained per
# non-empty partition, so this is also the (maximum) number of models in the ensemble.
num_of_partition = 3


# Column names, in DataFrame order, used to rebuild a pandas frame from raw partition rows.
df_cols = df_trans_back.columns

def build_model(partition_iter):
    """Fit one scikit-learn classifier on the rows of a single partition.

    Args:
        partition_iter: iterable of rows whose fields follow the order of the
            module-level ``df_cols`` list.

    Returns:
        A one-element list holding the fitted model, or an empty list when
        the partition contains no rows.
    """
    frame = pd.DataFrame(partition_iter, columns=df_cols)

    # Nothing to learn from an empty partition.
    if len(frame.index) == 0:
        return []

    # Features are every column except the label; the label is PHENOTYPE.
    is_feature = frame.columns != "PHENOTYPE"
    features = frame.loc[:, is_feature]
    labels = frame["PHENOTYPE"]

    # To train a linear SVM instead, swap in LinearSVC(random_state=555) here.
    estimator = LogisticRegression(random_state=555)

    return [estimator.fit(features, labels)]


def predict(instance):
    """Return the label predicted by every trained model for one sample.

    Args:
        instance: one row of the transposed data, with its fields in the
            same order as the module-level ``df_cols`` list.

    Returns:
        A list with one predicted label per entry of the module-level
        ``models`` list.
    """
    # Pick the features by column name instead of assuming the label is the
    # last field: build_model() also drops PHENOTYPE by name, so this keeps
    # the feature order identical to training wherever the label column sits.
    inst_features = [
        value for name, value in zip(df_cols, instance) if name != "PHENOTYPE"
    ]

    # make a prediction with each model
    return [m.predict([inst_features])[0] for m in models]


def major_vote(lst):
    """Return the value that occurs most often in ``lst``.

    Raises ValueError when ``lst`` is empty (max() of an empty set).
    """
    distinct_votes = set(lst)
    return max(distinct_votes, key=lambda vote: lst.count(vote))


from pyspark.sql import Row

def transform(instance):
    """Return a copy of ``instance`` extended with a float ``prediction`` field.

    The prediction is the majority vote over the labels returned by
    ``predict`` for this row.
    """
    fields = instance.asDict()
    voted_label = major_vote(predict(instance))
    return Row(**fields, prediction=float(voted_label))


from functools import reduce

test_score_li = []
train_score_li = []

# The evaluator does not depend on the fold, so build it once.
# The ensemble only produces a voted 0/1 label, so that label is what gets scored here.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="PHENOTYPE", rawPredictionCol="prediction")

for i, test_data in enumerate(data_splited_li):
    # Fold i is held out for testing; all the other folds are unioned into the training set.
    train_data = reduce(
        lambda left, right: left.union(right),
        (part for j, part in enumerate(data_splited_li) if j != i),
    )

    # One model is trained per partition, so the partition count sets the ensemble size.
    train_data_rdd = train_data.coalesce(num_of_partition).rdd.cache()
    print("number of partition for training: ", train_data_rdd.getNumPartitions())

    # `models` must stay a module-level name: predict() reads it when transform() runs on the workers.
    models = train_data_rdd.mapPartitions(build_model).collect()

    # test
    test_data_rdd = test_data.rdd.cache()

    # make testing result into form
    pred_df = test_data_rdd.map(transform).toDF()

    # get training roc
    pred_df_train = train_data_rdd.map(transform).toDF()

    train_score_li.append(evaluator.evaluate(pred_df_train))
    test_score_li.append(evaluator.evaluate(pred_df))

    # Both scores are plain numbers by now; release this fold's cached RDDs so that
    # cached data does not pile up across the folds.
    train_data_rdd.unpersist()
    test_data_rdd.unpersist()


print("training score: ", train_score_li, sep='\n')
print("testing score: ", test_score_li, sep='\n')

print("training mean: ", np.mean(train_score_li), " std: ", np.std(train_score_li))
print("testing mean: ", np.mean(test_score_li), " std: ", np.std(test_score_li))