In [None]:
# Display the active SparkSession (assumes the kernel pre-defines `spark` — TODO confirm).
spark

In [None]:
# Widen pandas display limits so previews are not truncated.
import pandas as pd
pd.set_option('display.max_colwidth', 250)  # use None for no limit
pd.set_option('display.max_columns', None)  # or 500
pd.set_option('display.max_rows', None)  # or 500

# Stretch the notebook container to 92% of the browser width.
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
from pyspark.sql.functions import when, col
from pyspark.sql.functions import avg
display(HTML("<style>.container { width:92% !important; }</style>"))
from pyspark.sql.functions import col, sum as spark_sum
import matplotlib.pyplot as plt

In [None]:
from pyspark.sql.functions import col, max
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col, sum as spark_sum
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, log, exp
from pyspark.sql.functions import rand
from pyspark.sql import Window
from pyspark.sql.functions import row_number
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col, expr

In [None]:
# Make CUA_db the current database so later unqualified table names resolve against it.
spark.sql('use CUA_db')

In [None]:
# Load the table that holds the consolidated features into a Spark DataFrame,
# then echo the DataFrame so its column summary is shown.
cua_non = spark.sql(
    """
    SELECT *
    FROM consolidated_cua_non
"""
)
cua_non

In [None]:
# Preview the first 10 rows as a pandas frame for rich notebook display.
cua_non.limit(10).toPandas()

## Alter numerics in Dataframe

In [None]:
# Age scaling: add `scaled_age` = age / 100.
# The author chose this scaling because it gave the highest average feature importance.
scaled_divided = cua_non.withColumn("scaled_age", cua_non["age"] / 100)



In [None]:
# BMI scaling: add `scaled_BMI` = BMI / 40.
# 40 is used as a typical upper range even though higher BMIs occur, so values can exceed 1.
# Built on `scaled_divided` (the frame that already carries `scaled_age`); the
# previously referenced `scaled_exp` is not defined anywhere in this notebook.
scaled_BMI = scaled_divided.withColumn("scaled_BMI", col("BMI") / 40)

In [None]:
# Drop the features that are no longer used, per the previous feature analysis.
# The raw BMI column is dropped too; `scaled_BMI` remains.
rdxn = scaled_BMI.drop(
    'AIAN', 'NHPI', 'Asian', 'MENA', 'Other', 'API_ethn', 'Mixed',
    'Metropol', 'Non_metro', 'no_metro',
    'R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'RU',
    'PPROM', 'csect', 'ccsect', 'lsect',
    'BMI',
)

## Create DFs with equal group sizes to deal with class imbalance

In [None]:
# Split the reduced frame on the CUA_ANY flag. The predicate is resolved against
# `rdxn` itself rather than through a Column object bound to `scaled_BMI`.
grouped_CUA = rdxn.filter(col('CUA_ANY') == 1)  # treatment group
grouped_non = rdxn.filter(col('CUA_ANY') == 0)  # control group

# Class sizes: treatment first, then control.
print(grouped_CUA.count())
print(grouped_non.count())

In [None]:
# Shuffle the control rows. The seed makes the shuffle repeatable on a re-run.
# NOTE(review): row order of a DataFrame is not guaranteed to survive later
# operations such as windows; any index must sort on an explicit random key.
ordered_and_randomized_df = grouped_non.orderBy(rand(seed=1234))


In [None]:
# Add a 1-based `_index` (row_number) assigned in *random* order.
# Ordering the window by `personid` alone would discard the shuffle and make
# every chunk a contiguous personid range, so the window sorts on a seeded
# random key instead (`personid` only breaks ties). The key is dropped afterwards.
window_spec = Window.orderBy("_shuffle_key", "personid")
df_with_index = (
    ordered_and_randomized_df
    .withColumn("_shuffle_key", rand(seed=1234))
    .withColumn("_index", row_number().over(window_spec))
    .drop("_shuffle_key")
)


In [None]:
# Total number of indexed control rows.
total_rows = df_with_index.count()

# Number of rows in each control chunk.
# NOTE(review): 28462 presumably equals the treatment-group size — confirm against grouped_CUA.count().
rows_per_dataframe = 28462

# `_index` comes from row_number(), which starts at 1. Starting the half-open
# range [start, end) at 0 would leave the first chunk one row short.
start_index_df1 = 1
end_index_df1 = start_index_df1 + rows_per_dataframe

# First control chunk.
df1 = df_with_index.filter((col("_index") >= start_index_df1) & (col("_index") < end_index_df1))


In [None]:
# Sanity check: number of rows that landed in the first control chunk.
df1.count()

In [None]:
# Second control chunk: half-open `_index` range [start, end) that begins where chunk 1 stopped.
start_index_df2 = end_index_df1
end_index_df2 = rows_per_dataframe + start_index_df2

df2 = df_with_index.where(
    (col("_index") < end_index_df2) & (col("_index") >= start_index_df2)
)


In [None]:
# Third control chunk: half-open `_index` range [start, end) that begins where chunk 2 stopped.
start_index_df3 = end_index_df2
end_index_df3 = rows_per_dataframe + start_index_df3

df3 = df_with_index.where(
    (col("_index") < end_index_df3) & (col("_index") >= start_index_df3)
)


In [None]:
# Fourth control chunk: half-open `_index` range [start, end) that begins where chunk 3 stopped.
start_index_df4 = end_index_df3
end_index_df4 = rows_per_dataframe + start_index_df4

df4 = df_with_index.where(
    (col("_index") < end_index_df4) & (col("_index") >= start_index_df4)
)


In [None]:
# Fifth control chunk: half-open `_index` range [start, end) that begins where chunk 4 stopped.
start_index_df5 = end_index_df4
end_index_df5 = rows_per_dataframe + start_index_df5

df5 = df_with_index.where(
    (col("_index") < end_index_df5) & (col("_index") >= start_index_df5)
)


In [None]:
# Remove the helper `_index` column from every control chunk so the schema
# matches the treatment frame again.
df1a, df2a, df3a, df4a, df5a = (
    chunk.drop('_index') for chunk in (df1, df2, df3, df4, df5)
)

In [None]:
# Build five balanced frames: the full treatment group stacked on top of one
# control chunk each. `union` matches columns by position, not by name.
(
    random_non_cua1,
    random_non_cua2,
    random_non_cua3,
    random_non_cua4,
    random_non_cua5,
) = (grouped_CUA.union(chunk) for chunk in (df1a, df2a, df3a, df4a, df5a))

## RF Consolidated Rerun

## First run

In [None]:
# Model inputs, chosen from the previous feature analysis.
# Order matters: it fixes each feature's position in the assembled vector.
binary_cols = [
    'Black', 'White', 'Hisp_Latino', 'Other_plus', 'Unknown',
    'urbn', 'rural', 'no_urban',
    'dead', 'endo', 'infertility', 'RA', 'dysmen', 'Irregular',
    'spinal', 'scoliosis', 'hearing_loss', 'mc', 'EOM', 'hemato',
    'HPV', 'HIV', 'STI', 'smoker', 'AA', 'CVD',
    'meno', 'preg', 'ectop', 'lynch', 'PCOS', 'Db2',
    'any_csect', 'MCCLD', 'HRP', 'FT_loss',
]
numerical_cols = ['scaled_BMI', 'scaled_age']


In [None]:
# Pack the binary flags followed by the scaled numerics into one "features" vector column.
assembler = VectorAssembler(inputCols=[*binary_cols, *numerical_cols], outputCol="features")

## First Run with df1

In [None]:
# Random forest classifier for CUA_ANY; 500 trees, chosen by the author based on an article.
rf = RandomForestClassifier(labelCol="CUA_ANY", featuresCol="features", numTrees=500)

# Two-stage pipeline: assemble the feature vector, then fit the forest.
# (A StringIndexer / OneHotEncoder stage would go here if categorical inputs were added.)
pipeline = Pipeline(stages=[assembler, rf])

# 80/20 split of balanced frame 1 with a fixed seed.
(training_data, testing_data) = random_non_cua1.randomSplit([0.8, 0.2], seed=1234)

# Fit on the training split only. Fitting on the whole frame would put the
# held-out rows into the model and inflate any metric computed on them.
model = pipeline.fit(training_data)

# Score the held-out split.
predictions = model.transform(testing_data)

In [None]:
# Score the whole reduced cohort (`rdxn`) with the run-1 pipeline; used later for the PSA exploration and export.
predictions_full1 = model.transform(rdxn)

In [None]:
# Area under the ROC curve on the held-out predictions.
evaluator = BinaryClassificationEvaluator(labelCol="CUA_ANY", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)

print(f"ROC AUC: {auc}")

# Accuracy. BinaryClassificationEvaluator has no accuracy metric (its default is
# areaUnderROC), so re-using it here would just print the AUC a second time.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="CUA_ANY", predictionCol="prediction", metricName="accuracy"
)
accuracy = accuracy_evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

# Output recorded by the original author (a re-run may differ):
#ROC AUC: 0.8494792175389134
#Accuracy: 0.7712779745960425

In [None]:
# Print every feature with its importance, in assembler input order.

# Index -> feature name, read from the fitted pipeline's VectorAssembler
# (stage 0) so the labels cannot drift from the columns the forest was trained on.
feature_dict = dict(enumerate(model.stages[0].getInputCols()))

# The last pipeline stage is the fitted random forest.
feature_importance = model.stages[-1].featureImportances
print("Original Feature Importance (minus previous zeros):")
for i, imp in enumerate(feature_importance.toArray()):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"{feature_name}: {imp}")

In [None]:
# Print the features ranked by importance, skipping those with importance 0.

# Index -> feature name, read from the fitted pipeline's VectorAssembler (stage 0).
feature_dict = dict(enumerate(model.stages[0].getInputCols()))

feature_importance = model.stages[-1].featureImportances
print("Ranked Feature Importance:")
# Keep only features the forest actually used.
non_zero_importance = [(i, imp) for i, imp in enumerate(feature_importance.toArray()) if imp > 0]
# Most important first.
sorted_features = sorted(non_zero_importance, key=lambda x: x[1], reverse=True)

# `position` (not `rank`) so the imported pyspark `rank` function is not overwritten.
for position, (i, imp) in enumerate(sorted_features, start=1):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"Rank {position}: {feature_name} - Importance: {imp}")

## Second run with df2

In [None]:
# Random forest classifier for CUA_ANY; 500 trees, chosen by the author based on an article.
rf = RandomForestClassifier(labelCol="CUA_ANY", featuresCol="features", numTrees=500)

# Two-stage pipeline: assemble the feature vector, then fit the forest.
pipeline = Pipeline(stages=[assembler, rf])

# 80/20 split of balanced frame 2 with a fixed seed.
(training_data, testing_data) = random_non_cua2.randomSplit([0.8, 0.2], seed=1234)

# Fit on the training split only. Fitting on the whole frame would put the
# held-out rows into the model and inflate any metric computed on them.
model = pipeline.fit(training_data)

# Score the held-out split.
predictions = model.transform(testing_data)

In [None]:
# Score the whole reduced cohort (`rdxn`) with the run-2 pipeline; used later for the PSA exploration.
predictions_full2 = model.transform(rdxn)

In [None]:
# Area under the ROC curve on the held-out predictions.
evaluator = BinaryClassificationEvaluator(labelCol="CUA_ANY", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"ROC AUC: {auc}")

# Accuracy. BinaryClassificationEvaluator has no accuracy metric (its default is
# areaUnderROC), so re-using it here would just print the AUC a second time.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="CUA_ANY", predictionCol="prediction", metricName="accuracy"
)
accuracy = accuracy_evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

# Output recorded by the original author (a re-run may differ):
#ROC AUC: 0.8485757413645247
#Accuracy: 0.7718828307266914

In [None]:
# Print every feature with its importance, in assembler input order.

# Index -> feature name, read from the fitted pipeline's VectorAssembler
# (stage 0) so the labels cannot drift from the columns the forest was trained on.
feature_dict = dict(enumerate(model.stages[0].getInputCols()))

# The last pipeline stage is the fitted random forest.
feature_importance = model.stages[-1].featureImportances
print("Original Feature Importance (minus previous zeros):")
for i, imp in enumerate(feature_importance.toArray()):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"{feature_name}: {imp}")

In [None]:
# Print the features ranked by importance, skipping those with importance 0.

# Index -> feature name, read from the fitted pipeline's VectorAssembler (stage 0).
feature_dict = dict(enumerate(model.stages[0].getInputCols()))

feature_importance = model.stages[-1].featureImportances
print("Ranked Feature Importance:")
# Keep only features the forest actually used.
non_zero_importance = [(i, imp) for i, imp in enumerate(feature_importance.toArray()) if imp > 0]
# Most important first.
sorted_features = sorted(non_zero_importance, key=lambda x: x[1], reverse=True)

# `position` (not `rank`) so the imported pyspark `rank` function is not overwritten.
for position, (i, imp) in enumerate(sorted_features, start=1):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"Rank {position}: {feature_name} - Importance: {imp}")

## Third Run with df3

In [None]:
# Random forest classifier for CUA_ANY; 500 trees, chosen by the author based on an article.
rf = RandomForestClassifier(labelCol="CUA_ANY", featuresCol="features", numTrees=500)

# Two-stage pipeline: assemble the feature vector, then fit the forest.
pipeline = Pipeline(stages=[assembler, rf])

# 80/20 split of balanced frame 3 with a fixed seed.
(training_data, testing_data) = random_non_cua3.randomSplit([0.8, 0.2], seed=1234)

# Fit on the training split only. Fitting on the whole frame would put the
# held-out rows into the model and inflate any metric computed on them.
model = pipeline.fit(training_data)

# Score the held-out split.
predictions = model.transform(testing_data)

In [None]:
# Score the whole reduced cohort (`rdxn`) with the run-3 pipeline; used later for the PSA exploration.
predictions_full3 = model.transform(rdxn)

In [None]:
# Area under the ROC curve on the held-out predictions.
evaluator = BinaryClassificationEvaluator(labelCol="CUA_ANY", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"ROC AUC: {auc}")

# Accuracy. BinaryClassificationEvaluator has no accuracy metric (its default is
# areaUnderROC), so re-using it here would just print the AUC a second time.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="CUA_ANY", predictionCol="prediction", metricName="accuracy"
)
accuracy = accuracy_evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")
# Output recorded by the original author (a re-run may differ):
#ROC AUC: 0.8464912972426755
#Accuracy: 0.7700682623347447

In [None]:
# Print every feature with its importance, in assembler input order.

# Index -> feature name, read from the fitted pipeline's VectorAssembler
# (stage 0) so the labels cannot drift from the columns the forest was trained on.
feature_dict = dict(enumerate(model.stages[0].getInputCols()))

# The last pipeline stage is the fitted random forest.
feature_importance = model.stages[-1].featureImportances
print("Original Feature Importance (minus previous zeros):")
for i, imp in enumerate(feature_importance.toArray()):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"{feature_name}: {imp}")

In [None]:
# Print the features ranked by importance, skipping those with importance 0.

# Index -> feature name, read from the fitted pipeline's VectorAssembler (stage 0).
feature_dict = dict(enumerate(model.stages[0].getInputCols()))

feature_importance = model.stages[-1].featureImportances
print("Ranked Feature Importance:")
# Keep only features the forest actually used.
non_zero_importance = [(i, imp) for i, imp in enumerate(feature_importance.toArray()) if imp > 0]
# Most important first.
sorted_features = sorted(non_zero_importance, key=lambda x: x[1], reverse=True)

# `position` (not `rank`) so the imported pyspark `rank` function is not overwritten.
for position, (i, imp) in enumerate(sorted_features, start=1):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"Rank {position}: {feature_name} - Importance: {imp}")

## Fourth run with df4

In [None]:
# Random forest classifier for CUA_ANY; 500 trees, chosen by the author based on an article.
rf = RandomForestClassifier(labelCol="CUA_ANY", featuresCol="features", numTrees=500)

# Two-stage pipeline: assemble the feature vector, then fit the forest.
# (A StringIndexer / OneHotEncoder stage would go here if categorical inputs were added.)
pipeline = Pipeline(stages=[assembler, rf])

# 80/20 split of balanced frame 4 with a fixed seed.
(training_data, testing_data) = random_non_cua4.randomSplit([0.8, 0.2], seed=1234)

# Fit on run 4's own training split. The cell previously fitted on
# `random_non_cua3`, i.e. the wrong control chunk, and on data that included
# held-out rows.
model = pipeline.fit(training_data)

# Score the held-out split.
predictions = model.transform(testing_data)

In [None]:
# Area under the ROC curve on the held-out predictions.
evaluator = BinaryClassificationEvaluator(labelCol="CUA_ANY", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"ROC AUC: {auc}")

# Accuracy. BinaryClassificationEvaluator has no accuracy metric (its default is
# areaUnderROC), so re-using it here would just print the AUC a second time.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="CUA_ANY", predictionCol="prediction", metricName="accuracy"
)
accuracy = accuracy_evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

# Output recorded by the original author (a re-run may differ):
#ROC AUC: 0.8424340329770147
#Accuracy: 0.765056597252225

In [None]:
# Score the whole reduced cohort (`rdxn`) with the run-4 pipeline; used later for the PSA exploration.
predictions_full4 = model.transform(rdxn)

In [None]:
# Print every feature with its importance, in assembler input order.

# Index -> feature name, read from the fitted pipeline's VectorAssembler
# (stage 0) so the labels cannot drift from the columns the forest was trained on.
feature_dict = dict(enumerate(model.stages[0].getInputCols()))

# The last pipeline stage is the fitted random forest.
feature_importance = model.stages[-1].featureImportances
print("Original Feature Importance (minus previous zeros):")
for i, imp in enumerate(feature_importance.toArray()):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"{feature_name}: {imp}")
    


In [None]:
# Print the features ranked by importance, skipping those with importance 0.

# Index -> feature name, read from the fitted pipeline's VectorAssembler (stage 0).
feature_dict = dict(enumerate(model.stages[0].getInputCols()))

feature_importance = model.stages[-1].featureImportances
print("Ranked Feature Importance:")
# Keep only features the forest actually used.
non_zero_importance = [(i, imp) for i, imp in enumerate(feature_importance.toArray()) if imp > 0]
# Most important first.
sorted_features = sorted(non_zero_importance, key=lambda x: x[1], reverse=True)

# `position` (not `rank`) so the imported pyspark `rank` function is not overwritten.
for position, (i, imp) in enumerate(sorted_features, start=1):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"Rank {position}: {feature_name} - Importance: {imp}")

## Fifth run with df5

In [None]:
# Random forest classifier for CUA_ANY; 500 trees, chosen by the author based on an article.
rf = RandomForestClassifier(labelCol="CUA_ANY", featuresCol="features", numTrees=500)

# Two-stage pipeline: assemble the feature vector, then fit the forest.
pipeline = Pipeline(stages=[assembler, rf])

# 80/20 split of balanced frame 5 with a fixed seed.
(training_data, testing_data) = random_non_cua5.randomSplit([0.8, 0.2], seed=1234)

# Fit on run 5's own training split. The cell previously fitted on
# `random_non_cua3`, i.e. the wrong control chunk, and on data that included
# held-out rows.
model = pipeline.fit(training_data)

# Score the held-out split.
predictions = model.transform(testing_data)

# Score the whole reduced cohort with the run-5 pipeline. The PSA exploration
# for the fifth model reads `predictions_full5`, which was never assigned.
predictions_full5 = model.transform(rdxn)

In [None]:
# Area under the ROC curve on the held-out predictions.
evaluator = BinaryClassificationEvaluator(labelCol="CUA_ANY", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"ROC AUC: {auc}")

# Accuracy. BinaryClassificationEvaluator has no accuracy metric (its default is
# areaUnderROC), so re-using it here would just print the AUC a second time.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="CUA_ANY", predictionCol="prediction", metricName="accuracy"
)
accuracy = accuracy_evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

# Output recorded by the original author (a re-run may differ):
#ROC AUC: 0.8479994008089378
#Accuracy: 0.7711915665773784

In [None]:
# Print every feature with its importance, in assembler input order.

# Index -> feature name, read from the fitted pipeline's VectorAssembler
# (stage 0) so the labels cannot drift from the columns the forest was trained on.
feature_dict = dict(enumerate(model.stages[0].getInputCols()))

# The last pipeline stage is the fitted random forest.
feature_importance = model.stages[-1].featureImportances
print("Original Feature Importance (minus previous zeros):")
for i, imp in enumerate(feature_importance.toArray()):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"{feature_name}: {imp}")
    

In [None]:
# Print the features ranked by importance, skipping those with importance 0.

# Index -> feature name, read from the fitted pipeline's VectorAssembler (stage 0).
feature_dict = dict(enumerate(model.stages[0].getInputCols()))

feature_importance = model.stages[-1].featureImportances
print("Ranked Feature Importance:")
# Keep only features the forest actually used.
non_zero_importance = [(i, imp) for i, imp in enumerate(feature_importance.toArray()) if imp > 0]
# Most important first.
sorted_features = sorted(non_zero_importance, key=lambda x: x[1], reverse=True)

# `position` (not `rank`) so the imported pyspark `rank` function is not overwritten.
for position, (i, imp) in enumerate(sorted_features, start=1):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"Rank {position}: {feature_name} - Importance: {imp}")

## Explore PSA distribution

### First model

In [None]:
# Split the model-1 scores of the full cohort by group.
predictions_CUA1 = predictions_full1.filter(predictions_full1['CUA_ANY'] == 1)  # treatment group
predictions_non1 = predictions_full1.filter(predictions_full1['CUA_ANY'] == 0)  # control group

# Down-sample before collecting to pandas: 10% of treatment, 1% of control.
# Seeded so the histogram and summary below can be reproduced.
pred_sample_CUA1 = predictions_CUA1.sample(fraction=0.1, seed=1234)
pre_sample_non1 = predictions_non1.sample(fraction=0.01, seed=1234)
sample_concat1 = pred_sample_CUA1.union(pre_sample_non1)
sample_pdf1 = sample_concat1.toPandas()

In [None]:
# Take element 1 of each probability vector (the score for label 1) into a
# plain `prob` column, then draw one histogram per CUA_ANY group.
sample_pdf1['prob'] = sample_pdf1['probability'].map(lambda vec: vec[1])
sample_pdf1.hist(column='prob', by='CUA_ANY', bins=10)

In [None]:
# Summary statistics of the sampled scores, one row per CUA_ANY group.
grouped_description1 = (
    sample_pdf1
    .groupby("CUA_ANY")["prob"]
    .describe()
)
print(grouped_description1)

### Second model

In [None]:
# Split the model-2 scores of the full cohort by group.
predictions_CUA2 = predictions_full2.filter(predictions_full2['CUA_ANY'] == 1)  # treatment group
predictions_non2 = predictions_full2.filter(predictions_full2['CUA_ANY'] == 0)  # control group

# Down-sample before collecting to pandas: 10% of treatment, 1% of control.
# Seeded so the histogram and summary below can be reproduced.
pred_sample_CUA2 = predictions_CUA2.sample(fraction=0.1, seed=1234)
pre_sample_non2 = predictions_non2.sample(fraction=0.01, seed=1234)
sample_concat2 = pred_sample_CUA2.union(pre_sample_non2)
sample_pdf2 = sample_concat2.toPandas()

In [None]:
# Take element 1 of each probability vector (the score for label 1) into a
# plain `prob` column, then draw one histogram per CUA_ANY group.
sample_pdf2['prob'] = sample_pdf2['probability'].map(lambda vec: vec[1])
sample_pdf2.hist(column='prob', by='CUA_ANY', bins=10)

In [None]:
# Summary statistics of the sampled scores, one row per CUA_ANY group.
grouped_description2 = (
    sample_pdf2
    .groupby("CUA_ANY")["prob"]
    .describe()
)
print(grouped_description2)

### Third model

In [None]:
# Split the model-3 scores of the full cohort by group.
predictions_CUA3 = predictions_full3.filter(predictions_full3['CUA_ANY'] == 1)  # treatment group
predictions_non3 = predictions_full3.filter(predictions_full3['CUA_ANY'] == 0)  # control group

# Down-sample before collecting to pandas: 10% of treatment, 1% of control.
# Seeded so the histogram and summary below can be reproduced.
pred_sample_CUA3 = predictions_CUA3.sample(fraction=0.1, seed=1234)
pre_sample_non3 = predictions_non3.sample(fraction=0.01, seed=1234)
sample_concat3 = pred_sample_CUA3.union(pre_sample_non3)
sample_pdf3 = sample_concat3.toPandas()

In [None]:
# Take element 1 of each probability vector (the score for label 1) into a
# plain `prob` column, then draw one histogram per CUA_ANY group.
sample_pdf3['prob'] = sample_pdf3['probability'].map(lambda vec: vec[1])
sample_pdf3.hist(column='prob', by='CUA_ANY', bins=10)

In [None]:
# Summary statistics of the sampled scores, one row per CUA_ANY group.
grouped_description3 = (
    sample_pdf3
    .groupby("CUA_ANY")["prob"]
    .describe()
)
print(grouped_description3)

### Fourth model

In [None]:
# Split the model-4 scores of the full cohort by group.
predictions_CUA4 = predictions_full4.filter(predictions_full4['CUA_ANY'] == 1)  # treatment group
predictions_non4 = predictions_full4.filter(predictions_full4['CUA_ANY'] == 0)  # control group

# Down-sample before collecting to pandas: 10% of treatment, 1% of control.
# Seeded so the histogram and summary below can be reproduced.
pred_sample_CUA4 = predictions_CUA4.sample(fraction=0.1, seed=1234)
pre_sample_non4 = predictions_non4.sample(fraction=0.01, seed=1234)
sample_concat4 = pred_sample_CUA4.union(pre_sample_non4)
sample_pdf4 = sample_concat4.toPandas()

In [None]:
# Take element 1 of each probability vector (the score for label 1) into a
# plain `prob` column, then draw one histogram per CUA_ANY group.
sample_pdf4['prob'] = sample_pdf4['probability'].map(lambda vec: vec[1])
sample_pdf4.hist(column='prob', by='CUA_ANY', bins=10)

In [None]:
# Summary statistics of the sampled scores, one row per CUA_ANY group.
grouped_description4 = (
    sample_pdf4
    .groupby("CUA_ANY")["prob"]
    .describe()
)
print(grouped_description4)

### Fifth model

In [None]:
# Split the model-5 scores of the full cohort by group.
predictions_CUA5 = predictions_full5.filter(predictions_full5['CUA_ANY'] == 1)  # treatment group
predictions_non5 = predictions_full5.filter(predictions_full5['CUA_ANY'] == 0)  # control group

# Down-sample before collecting to pandas: 10% of treatment, 1% of control.
# Seeded so the histogram and summary below can be reproduced.
pred_sample_CUA5 = predictions_CUA5.sample(fraction=0.1, seed=1234)
pre_sample_non5 = predictions_non5.sample(fraction=0.01, seed=1234)
# Union the model-5 treatment sample. The cell previously unioned
# `pred_sample_CUA2`, mixing model-2 treatment scores with model-5 control scores.
sample_concat5 = pred_sample_CUA5.union(pre_sample_non5)
sample_pdf5 = sample_concat5.toPandas()

In [None]:
# Take element 1 of each probability vector (the score for label 1) into a
# plain `prob` column, then draw one histogram per CUA_ANY group.
sample_pdf5['prob'] = sample_pdf5['probability'].map(lambda vec: vec[1])
sample_pdf5.hist(column='prob', by='CUA_ANY', bins=10)

In [None]:
# Summary statistics of the sampled scores, one row per CUA_ANY group.
grouped_description5 = (
    sample_pdf5
    .groupby("CUA_ANY")["prob"]
    .describe()
)
print(grouped_description5)

## Save Full dataframe with PSA for matching
#### Noted similarities between all models and PSA distributions; chose Model 1 due to highest ROC AUC

In [None]:
# UDF that returns element 0 of a probability vector as a Python float.
extract_prob_udf = udf(lambda vec: float(vec[0]), DoubleType())

# Add that value to the model-1 full-cohort scores as column `prob0`.
predictions_extracted = predictions_full1.withColumn(
    'prob0',
    extract_prob_udf(predictions_full1['probability']),
)

In [None]:
# Preview five rows to check the new `prob0` column.
predictions_extracted.limit(5).toPandas()

In [None]:
# UDF that returns element 1 of a probability vector as a Python float.
extract_prob_udf2 = udf(lambda vec: float(vec[1]), DoubleType())

# Add that value next to `prob0` as column `prob1`.
predictions_extracted2 = predictions_extracted.withColumn(
    'prob1',
    extract_prob_udf2(predictions_extracted['probability']),
)

In [None]:
# Preview five rows to check the new `prob1` column.
predictions_extracted2.limit(5).toPandas()

In [None]:
# Keep only the identifier, the group flag, age and the two extracted probabilities.
selected_columns = ['personid', 'CUA_ANY', 'age', 'prob0', 'prob1']
simplified_probs = predictions_extracted2.select(selected_columns)

In [None]:
# Preview five rows of the slimmed-down frame.
simplified_probs.limit(5).toPandas()

In [None]:
# Logit of the propensity score: logit(p) = log(p / (1 - p)).
# The earlier expression, 1 / (1 + exp(-log(p))), simplifies to p / (1 + p),
# which is not the logit.
# NOTE(review): a prob1 of exactly 0 or 1 has no finite logit; check for such
# rows before matching.
logit_df = simplified_probs.withColumn("logit_ps", expr("log(prob1 / (1 - prob1))"))

# Show the result
logit_df.limit(5).toPandas()

In [None]:
# Persist the scores and logit as a table for the matching step.
# NOTE(review): no write mode is set — presumably this fails if the table already exists; confirm before re-running.
logit_df.write.saveAsTable('CUA_db.cua_non_age_PSM_update')

In [None]:
# Read the PSM table back and echo the DataFrame.
# NOTE(review): this reads `cua_non_age_PSM`, whereas the save in the previous
# cell targets `cua_non_age_PSM_update` — confirm which table is intended.
psm = spark.sql(
    """
    SELECT *
    FROM cua_non_age_PSM
"""
)
psm

In [None]:
# Split the scored cohort by the CUA_ANY flag for separate export.
psm_CUA = logit_df.where(col('CUA_ANY') == 1)  # treatment group
psm_non = logit_df.where(col('CUA_ANY') == 0)  # control group

In [None]:
# Persist the treatment-group scores.
# NOTE(review): no write mode is set — presumably this fails if the table already exists; confirm before re-running.
psm_CUA.write.saveAsTable('CUA_db.psm_CUA')

In [None]:
# Persist the control-group scores.
# NOTE(review): no write mode is set — presumably this fails if the table already exists; confirm before re-running.
psm_non.write.saveAsTable('CUA_db.psm_non')