In [None]:
# Echo the SparkSession handle. `spark` is not created in the visible cells —
# presumably it is injected by the kernel/environment; confirm before running elsewhere.
spark

In [None]:
# Notebook-wide imports and display configuration.
import pandas as pd
import matplotlib.pyplot as plt

# `display` and `HTML` come from the public `IPython.display` module; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
from pyspark.sql.functions import avg, col, sum as spark_sum, when

# Stop pandas from truncating wide or long frames when they are rendered.
pd.set_option('display.max_colwidth', 250)  # None removes the limit entirely
pd.set_option('display.max_columns', None)  # or 500
pd.set_option('display.max_rows', None)  # or 500

# Widen the notebook container to 92% of the browser window.
# NOTE(review): the `.container` selector presumably targets the classic Notebook UI — confirm for JupyterLab.
display(HTML("<style>.container { width:92% !important; }</style>"))

In [None]:
from pyspark.sql.functions import col, max
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col, sum as spark_sum
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, log, exp
from pyspark.sql.functions import rand
from pyspark.sql import Window
from pyspark.sql.functions import row_number
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col, expr

In [None]:
# Switch the session's current database to CUA_db before any table is queried.
spark.sql('use CUA_db')

In [None]:
# Load every column of concat_cua_non_table as a Spark DataFrame.
load_query = """
    SELECT *
    FROM concat_cua_non_table
"""
cua_non = spark.sql(load_query)

# Echo the DataFrame so the cell output shows what was loaded.
cua_non

In [None]:
# Pull only the first 10 rows to the driver and render them as a pandas frame.
preview_rows = cua_non.limit(10)
preview_rows.toPandas()

## Transform numeric columns in the DataFrame

In [None]:
# Age features: each step adds one column on top of the previous frame.

# scaled_age = age / 100
age_fraction = col("age") / 100
scaled_divided = cua_non.withColumn("scaled_age", age_fraction)

scaled_age_col = col("scaled_age")

# age_log = natural logarithm of scaled_age
# NOTE(review): the logarithm is undefined for age <= 0 — confirm no such rows exist.
scaled_log = scaled_divided.withColumn("age_log", log(scaled_age_col))

# age_exp = e raised to scaled_age
scaled_exp = scaled_log.withColumn("age_exp", exp(scaled_age_col))



In [None]:
# BMI feature: BMI divided by 40. 40 is treated as a typical high value,
# so BMIs above it simply scale past 1.
bmi_fraction = col("BMI") / 40
scaled_BMI = scaled_exp.withColumn("scaled_BMI", bmi_fraction)

## Create DataFrames with equal-sized groups to deal with class imbalance

In [None]:
# Split the feature frame on the CUA_ANY flag.
is_treatment = col('CUA_ANY') == 1
is_control = col('CUA_ANY') == 0
grouped_CUA = scaled_BMI.where(is_treatment)  # treatment group
grouped_non = scaled_BMI.where(is_control)  # control group

# Print both group sizes; each count() runs a Spark job.
for group_frame in (grouped_CUA, grouped_non):
    print(group_frame.count())

In [None]:
# Shuffle the control group by sorting its rows on a random value.
ordered_and_randomized_df = grouped_non.sort(rand())


In [None]:
# Number the control rows 1..N in random order and store the number in `_index`.
#
# The window must be ordered by a random key: ordering it by "personid" throws away the
# shuffle done upstream and turns every later `_index` slice into a contiguous personid
# range instead of a random subset. "personid" is kept only as a tie-breaker.
# The seed fixes the random key sequence; full run-to-run reproducibility also depends on
# the order of the incoming rows.
# NOTE: a window without partitionBy moves all rows into a single partition.
shuffle_key = rand(seed=1234)
window_spec = Window.orderBy("_shuffle_key", "personid")
df_with_index = (
    ordered_and_randomized_df
    .withColumn("_shuffle_key", shuffle_key)
    .withColumn("_index", row_number().over(window_spec))
    .drop("_shuffle_key")  # helper column only; keeps the schema identical to the input plus `_index`
)


In [None]:
# Total number of indexed control rows available for slicing.
total_rows = df_with_index.count()

# Number of control rows in each random subset.
# NOTE(review): 28462 presumably equals the treatment-group size — confirm against grouped_CUA.count().
rows_per_dataframe = 28462

# `_index` comes from row_number(), which starts at 1. Starting the first slice at 0
# would leave df1 one row short of rows_per_dataframe, so the range is [1, 1 + rows_per_dataframe).
start_index_df1 = 1
end_index_df1 = start_index_df1 + rows_per_dataframe

# First control subset: half-open `_index` range [start_index_df1, end_index_df1).
df1 = df_with_index.filter((col("_index") >= start_index_df1) & (col("_index") < end_index_df1))



In [None]:
# Sanity check: number of rows that landed in the first control subset.
df1.count()

In [None]:
# Second control subset: the half-open `_index` range that begins where df1 stopped.
start_index_df2 = end_index_df1
end_index_df2 = rows_per_dataframe + start_index_df2

in_second_slice = (col("_index") >= start_index_df2) & (col("_index") < end_index_df2)
df2 = df_with_index.where(in_second_slice)



In [None]:
# Third control subset: the half-open `_index` range that begins where df2 stopped.
start_index_df3 = end_index_df2
end_index_df3 = rows_per_dataframe + start_index_df3

in_third_slice = (col("_index") >= start_index_df3) & (col("_index") < end_index_df3)
df3 = df_with_index.where(in_third_slice)



In [None]:
# Fourth control subset: the half-open `_index` range that begins where df3 stopped.
start_index_df4 = end_index_df3
end_index_df4 = rows_per_dataframe + start_index_df4

in_fourth_slice = (col("_index") >= start_index_df4) & (col("_index") < end_index_df4)
df4 = df_with_index.where(in_fourth_slice)



In [None]:
# Fifth control subset: the half-open `_index` range that begins where df4 stopped.
start_index_df5 = end_index_df4
end_index_df5 = rows_per_dataframe + start_index_df5

in_fifth_slice = (col("_index") >= start_index_df5) & (col("_index") < end_index_df5)
df5 = df_with_index.where(in_fifth_slice)



In [None]:
# Remove the helper `_index` column from each control subset now that slicing is done.
df1a, df2a, df3a, df4a, df5a = (
    indexed_slice.drop('_index') for indexed_slice in (df1, df2, df3, df4, df5)
)

In [None]:
## Recombine with the CUA df
# Each sample is the full treatment group followed by one control subset.
random_non_cua1, random_non_cua2, random_non_cua3, random_non_cua4, random_non_cua5 = (
    grouped_CUA.union(control_slice) for control_slice in (df1a, df2a, df3a, df4a, df5a)
)

## RF Exploration

In [None]:
# Model input columns. Order matters: it is the order in which the columns
# enter the assembled feature vector.
# NOTE(review): `binary_cols` are presumably 0/1 indicator columns — confirm against the table schema.
binary_cols = [
    "AIAN", "NHPI", "Asian", "Black", "White", "Hisp_Latino", "Other", "Mixed", "Unknown",
    "urbn", "rural", "no_urban", "Metropol", "Non_metro", "no_metro", "dead",
    "R0", "R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8", "R9", "RU",
    "endo", "infertility", "RA", "dysmen", "Irregular", "spinal", "scoliosis", "hearing_loss",
    "mc", "EOM", "hemato", "HPV", "HIV", "STI", "smoker", "AA", "CVD",
    "meno", "preg", "ectop", "lynch", "PCOS", "Db2", "csect", "ccsect", "lcsect",
    "MCCLD", "HRP", "MENA", "API_ethn", "PPROM", "FT_loss", "SA",
]

# Scaled/transformed numeric columns.
numerical_cols = ["scaled_BMI", "scaled_age", "age_log", "age_exp"]


In [None]:
# Pack all model inputs into one vector column named "features":
# binary columns first, then the numeric columns.
feature_columns = binary_cols + numerical_cols
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

## First Run with df1

In [None]:
# Random Forest classifier for the CUA_ANY label; 500 trees selected based off of article.
# An explicit seed is set so repeated runs build the same forest.
rf = RandomForestClassifier(labelCol="CUA_ANY", featuresCol="features", numTrees=500, seed=1234)

# Two-stage pipeline: assemble the feature vector, then fit the forest.
# (This can be more complex when you need string indexer and one-hot encoder.)
pipeline = Pipeline(stages=[assembler, rf])

# Hold out 20% of balanced sample 1 for evaluation.
(training_data, testing_data) = random_non_cua1.randomSplit([0.8, 0.2], seed=1234)

# Fit on the training split only. Fitting on all of random_non_cua1 would let the model
# see the held-out rows and inflate the evaluation metric.
model = pipeline.fit(training_data)

# Score the held-out rows.
predictions = model.transform(testing_data)

In [None]:
# Score every row of scaled_BMI with the model fitted in the cell above.
# NOTE(review): scaled_BMI contains the rows that random_non_cua1 was built from,
# so these predictions are not out-of-sample for those rows.
predictions_full = model.transform(scaled_BMI)

In [None]:
##Evaluate the model
# BinaryClassificationEvaluator reports area under the ROC curve by default, not accuracy.
# The metric is named explicitly and the printed label says what is actually measured.
evaluator = BinaryClassificationEvaluator(labelCol="CUA_ANY", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)
accuracy = area_under_roc  # legacy variable name kept for any cell that still reads it
print(f"Area under ROC: {area_under_roc}")

In [None]:
##Run Feature importance

# Vector index -> column name, taken straight from the assembler so the labels cannot
# drift from the feature order the model was trained on.
feature_dict = dict(enumerate(assembler.getInputCols()))

# Importances come from the last pipeline stage (the fitted random forest).
feature_importance = model.stages[-1].featureImportances
# NOTE(review): the heading mentions removed zeros, but this loop prints every feature,
# including those with zero importance.
print("Original Feature Importance (minus previous zeros):")
for i, imp in enumerate(feature_importance.toArray()):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"{feature_name}: {imp}")

In [None]:
##Obtain Rank of features by importance

# Vector index -> column name, taken straight from the assembler so the labels cannot
# drift from the feature order the model was trained on.
feature_dict = dict(enumerate(assembler.getInputCols()))

# Importances come from the last pipeline stage (the fitted random forest).
feature_importance = model.stages[-1].featureImportances
print("Ranked Feature Importance:")
# Keep only features the forest actually used (importance > 0).
non_zero_importance = [(i, imp) for i, imp in enumerate(feature_importance.toArray()) if imp > 0]
# Highest importance first.
sorted_features = sorted(non_zero_importance, key=lambda x: x[1], reverse=True)

# The loop variable is `position`, not `rank`: a variable named `rank` would overwrite
# the pyspark `rank` function imported at the top of the notebook.
for position, (i, imp) in enumerate(sorted_features, start=1):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"Rank {position}: {feature_name} - Importance: {imp}")

## Second run with df2

In [None]:
##Define the Random Forest model; 500 trees selected based off of article
# An explicit seed is set so repeated runs build the same forest.
rf = RandomForestClassifier(labelCol="CUA_ANY", featuresCol="features", numTrees=500, seed=1234)

# Two-stage pipeline: assemble the feature vector, then fit the forest.
pipeline = Pipeline(stages=[assembler, rf])

# Hold out 20% of balanced sample 2 for evaluation.
(training_data, testing_data) = random_non_cua2.randomSplit([0.8, 0.2], seed=1234)

# Fit on the training split only. Fitting on all of random_non_cua2 would let the model
# see the held-out rows and inflate the evaluation metric.
model = pipeline.fit(training_data)

# Score the held-out rows.
predictions = model.transform(testing_data)

In [None]:
##Evaluate the model
# BinaryClassificationEvaluator reports area under the ROC curve by default, not accuracy.
# The metric is named explicitly and the printed label says what is actually measured.
evaluator = BinaryClassificationEvaluator(labelCol="CUA_ANY", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)
accuracy = area_under_roc  # legacy variable name kept for any cell that still reads it
print(f"Area under ROC: {area_under_roc}")

In [None]:
##Feature importance

# Vector index -> column name, taken straight from the assembler so the labels cannot
# drift from the feature order the model was trained on.
feature_dict = dict(enumerate(assembler.getInputCols()))

# Importances come from the last pipeline stage (the fitted random forest).
feature_importance = model.stages[-1].featureImportances
# NOTE(review): the heading mentions removed zeros, but this loop prints every feature,
# including those with zero importance.
print("Original Feature Importance (minus previous zeros):")
for i, imp in enumerate(feature_importance.toArray()):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"{feature_name}: {imp}")



In [None]:
##Ranked feature importance

# Vector index -> column name, taken straight from the assembler so the labels cannot
# drift from the feature order the model was trained on.
feature_dict = dict(enumerate(assembler.getInputCols()))

# Importances come from the last pipeline stage (the fitted random forest).
feature_importance = model.stages[-1].featureImportances
print("Ranked Feature Importance:")
# Keep only features the forest actually used (importance > 0).
non_zero_importance = [(i, imp) for i, imp in enumerate(feature_importance.toArray()) if imp > 0]
# Highest importance first.
sorted_features = sorted(non_zero_importance, key=lambda x: x[1], reverse=True)

# The loop variable is `position`, not `rank`: a variable named `rank` would overwrite
# the pyspark `rank` function imported at the top of the notebook.
for position, (i, imp) in enumerate(sorted_features, start=1):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"Rank {position}: {feature_name} - Importance: {imp}")

## Third Run with df3

In [None]:
##Define the Random Forest model; 500 trees selected based off of article
# An explicit seed is set so repeated runs build the same forest.
rf = RandomForestClassifier(labelCol="CUA_ANY", featuresCol="features", numTrees=500, seed=1234)

# Two-stage pipeline: assemble the feature vector, then fit the forest.
pipeline = Pipeline(stages=[assembler, rf])

# Hold out 20% of balanced sample 3 for evaluation.
(training_data, testing_data) = random_non_cua3.randomSplit([0.8, 0.2], seed=1234)

# Fit on the training split only. Fitting on all of random_non_cua3 would let the model
# see the held-out rows and inflate the evaluation metric.
model = pipeline.fit(training_data)

# Score the held-out rows.
predictions = model.transform(testing_data)

In [None]:
##Evaluate the model
# BinaryClassificationEvaluator reports area under the ROC curve by default, not accuracy.
# The metric is named explicitly and the printed label says what is actually measured.
evaluator = BinaryClassificationEvaluator(labelCol="CUA_ANY", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)
accuracy = area_under_roc  # legacy variable name kept for any cell that still reads it
print(f"Area under ROC: {area_under_roc}")

In [None]:
##Feature importance rank

# Vector index -> column name, taken straight from the assembler so the labels cannot
# drift from the feature order the model was trained on.
feature_dict = dict(enumerate(assembler.getInputCols()))

# Importances come from the last pipeline stage (the fitted random forest).
feature_importance = model.stages[-1].featureImportances
print("Ranked Feature Importance:")
# Keep only features the forest actually used (importance > 0).
non_zero_importance = [(i, imp) for i, imp in enumerate(feature_importance.toArray()) if imp > 0]
# Highest importance first.
sorted_features = sorted(non_zero_importance, key=lambda x: x[1], reverse=True)

# The loop variable is `position`, not `rank`: a variable named `rank` would overwrite
# the pyspark `rank` function imported at the top of the notebook.
for position, (i, imp) in enumerate(sorted_features, start=1):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"Rank {position}: {feature_name} - Importance: {imp}")

## Fourth run with df4

In [None]:
# Define the Random Forest model; 500 trees selected based off of article
# An explicit seed is set so repeated runs build the same forest.
rf = RandomForestClassifier(labelCol="CUA_ANY", featuresCol="features", numTrees=500, seed=1234)

# Two-stage pipeline: assemble the feature vector, then fit the forest.
# (This can be more complex when you need string indexer and one-hot encoder.)
pipeline = Pipeline(stages=[assembler, rf])

# Hold out 20% of balanced sample 4 for evaluation.
(training_data, testing_data) = random_non_cua4.randomSplit([0.8, 0.2], seed=1234)

# Fit on the training split of sample 4. Fitting on random_non_cua3 here would train on
# a different sample than the one being evaluated, so this run would not reflect sample 4.
model = pipeline.fit(training_data)

# Score the held-out rows.
predictions = model.transform(testing_data)

In [None]:
##Evaluate the model
# BinaryClassificationEvaluator reports area under the ROC curve by default, not accuracy.
# The metric is named explicitly and the printed label says what is actually measured.
evaluator = BinaryClassificationEvaluator(labelCol="CUA_ANY", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)
accuracy = area_under_roc  # legacy variable name kept for any cell that still reads it
print(f"Area under ROC: {area_under_roc}")

In [None]:
##Feature importance rank

# Vector index -> column name, taken straight from the assembler so the labels cannot
# drift from the feature order the model was trained on.
feature_dict = dict(enumerate(assembler.getInputCols()))

# Importances come from the last pipeline stage (the fitted random forest).
feature_importance = model.stages[-1].featureImportances
print("Ranked Feature Importance:")
# Keep only features the forest actually used (importance > 0).
non_zero_importance = [(i, imp) for i, imp in enumerate(feature_importance.toArray()) if imp > 0]
# Highest importance first.
sorted_features = sorted(non_zero_importance, key=lambda x: x[1], reverse=True)

# The loop variable is `position`, not `rank`: a variable named `rank` would overwrite
# the pyspark `rank` function imported at the top of the notebook.
for position, (i, imp) in enumerate(sorted_features, start=1):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"Rank {position}: {feature_name} - Importance: {imp}")

## Fifth run with df5

In [None]:
##Define the Random Forest model; 500 trees selected based off of article
# An explicit seed is set so repeated runs build the same forest.
rf = RandomForestClassifier(labelCol="CUA_ANY", featuresCol="features", numTrees=500, seed=1234)

# Two-stage pipeline: assemble the feature vector, then fit the forest.
pipeline = Pipeline(stages=[assembler, rf])

# Hold out 20% of balanced sample 5 for evaluation.
(training_data, testing_data) = random_non_cua5.randomSplit([0.8, 0.2], seed=1234)

# Fit on the training split of sample 5. Fitting on random_non_cua3 here would train on
# a different sample than the one being evaluated, so this run would not reflect sample 5.
model = pipeline.fit(training_data)

# Score the held-out rows.
predictions = model.transform(testing_data)

In [None]:
##Evaluate the model
# BinaryClassificationEvaluator reports area under the ROC curve by default, not accuracy.
# The metric is named explicitly and the printed label says what is actually measured.
evaluator = BinaryClassificationEvaluator(labelCol="CUA_ANY", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)
accuracy = area_under_roc  # legacy variable name kept for any cell that still reads it
print(f"Area under ROC: {area_under_roc}")

In [None]:
##Feature importance rank

# Vector index -> column name, taken straight from the assembler so the labels cannot
# drift from the feature order the model was trained on.
feature_dict = dict(enumerate(assembler.getInputCols()))

# Importances come from the last pipeline stage (the fitted random forest).
feature_importance = model.stages[-1].featureImportances
print("Ranked Feature Importance:")
# Keep only features the forest actually used (importance > 0).
non_zero_importance = [(i, imp) for i, imp in enumerate(feature_importance.toArray()) if imp > 0]
# Highest importance first.
sorted_features = sorted(non_zero_importance, key=lambda x: x[1], reverse=True)

# The loop variable is `position`, not `rank`: a variable named `rank` would overwrite
# the pyspark `rank` function imported at the top of the notebook.
for position, (i, imp) in enumerate(sorted_features, start=1):
    feature_name = feature_dict.get(i, f"Feature {i + 1}")
    print(f"Rank {position}: {feature_name} - Importance: {imp}")