In [None]:
# Display the active Spark session object as a sanity check.
# NOTE(review): `spark` is never created in this notebook — presumably injected by the kernel; confirm.
spark

In [None]:
# Notebook-wide imports and display configuration.
# Every name the original cell imported is still imported here; duplicate
# import lines were merged, and `display` now comes from the public
# `IPython.display` module (importing it from `IPython.core.display` is deprecated).
import pandas as pd
from IPython.display import HTML, display
from pyspark.sql import functions as F
from pyspark.sql.functions import avg, col, expr, stddev, sum as spark_sum, when

# Set max column width, columns and rows in pandas so output is not truncated
pd.set_option('display.max_colwidth', 250)  # or None for unlimited (-1 is deprecated)
pd.set_option('display.max_columns', None)  # or 500
pd.set_option('display.max_rows', None)  # or 500

# Widen the notebook container to 92% of the browser window
display(HTML("<style>.container { width:92% !important; }</style>"))

In [None]:
# Switch the session's current database to CUA_db so the unqualified table
# names in the spark.sql() queries below resolve against it.
spark.sql('use CUA_db')

In [None]:
# Load the table that holds every feature into a Spark DataFrame

all_features_query = """
    SELECT *
    FROM cua_non_consolidated_cua_non
"""
cua_non = spark.sql(all_features_query)
cua_non

## Obtain Means and Stdev for all features BEFORE matching

In [None]:
# Average of every numeric column, computed separately for each CUA_ANY group
means = cua_non.groupBy('CUA_ANY').avg()

means.toPandas()

In [None]:
# Standard deviation of each feature per CUA_ANY group (full, pre-matching data)

from pyspark.sql.functions import col, stddev

# All columns other than the 'personid' identifier
numerical_columns1 = list(filter(lambda name: name != 'personid', cua_non.columns))

# Split on the 'CUA_ANY' column
grouped_stdev1 = cua_non.groupBy('CUA_ANY')

# One aliased stddev aggregate per remaining column
stdev_matched1 = grouped_stdev1.agg(
    *[stddev(col(name)).alias(f'stddev_{name}') for name in numerical_columns1]
)

stdev_matched1.toPandas()

## Obtain Means & Stdev for All Features After Matching

### First need to isolate matched personids from the DF

In [None]:
# Load the saved table of controls and treatments matched by PS;
# expected size is 28,462 each
matched_pairs_query = """
    SELECT *
    FROM ps_matches_updated
"""
matches = spark.sql(matched_pairs_query)
matches

matches.count()

In [None]:
## Double-check the matched ID columns for nulls

# Check both sides of each matched pair, not only the control side.
# `null_count` keeps its original meaning (nulls in Control_PersonID).
null_count = matches.filter(col('Control_PersonID').isNull()).count()
treatment_null_count = matches.filter(col('Treatment_PersonID').isNull()).count()

# Report each count against the real column name. Only SQL NULLs are counted
# (isNull does not detect NaN), so the message says "null" only.
print("Number of null values in column 'Control_PersonID':", null_count)
print("Number of null values in column 'Treatment_PersonID':", treatment_null_count)

In [None]:
## Gather every matched pair of person IDs (treatment and control) on the driver

matched_rows = matches.select('Treatment_PersonID', 'Control_PersonID').collect()

# Turn each collected Row into a plain dict keyed by column name
all_personid_list = [row.asDict() for row in matched_rows]

In [None]:
# Flatten the pairs into one list: all treatment IDs first, then all control IDs
treatment_ids = [pair['Treatment_PersonID'] for pair in all_personid_list]
control_ids = [pair['Control_PersonID'] for pair in all_personid_list]
personid_values = treatment_ids + control_ids

In [None]:
# Plain Python list of the Control_PersonID value from every row of matches
control_rows = matches.select('Control_PersonID').collect()
control_personids_list = [row['Control_PersonID'] for row in control_rows]

In [None]:
# Number of control person IDs collected from matches
len(control_personids_list)

In [None]:
# Preview only the first few IDs: echoing the whole list would write every
# matched person identifier into the saved notebook output.
personid_values[:10]

In [None]:
# Sanity check: an ID that appears more than once shrinks the set below the list length

total_ids = len(personid_values)
unique_ids = len(set(personid_values))
print("No duplicates" if total_ids == unique_ids else "Duplicates found")

In [None]:
# Half the length of the combined treatment + control ID list.
# True division, so the displayed value is a float.
len(personid_values)/2

In [None]:
# Keep only the rows of cua_non whose personid appears in a matched pair
# (as either the treatment or the control member).
# A left-semi join against the IDs in `matches` replaces `isin(personid_values)`:
# it avoids embedding tens of thousands of literals in the query plan.
# The join condition is written explicitly (not `on='personid'`) so the result
# keeps exactly the columns of cua_non, in their original order.
matched_ids = matches.select(col('Treatment_PersonID').alias('matched_personid')).union(
    matches.select(col('Control_PersonID').alias('matched_personid'))
)
matched_set = cua_non.join(
    matched_ids,
    cua_non['personid'] == matched_ids['matched_personid'],
    'left_semi',
)
matched_set.count()

#56924

In [None]:
## Average of every numeric column within the matched set, per CUA_ANY group
means_matched = matched_set.groupBy('CUA_ANY').avg()

means_matched.toPandas()

In [None]:
# Standard deviation of each feature within the matched set, per CUA_ANY group

from pyspark.sql.functions import col, stddev

# All columns other than the 'personid' identifier
numerical_columns = list(filter(lambda name: name != 'personid', matched_set.columns))

# Split on the 'CUA_ANY' column
grouped_matched = matched_set.groupBy('CUA_ANY')

# One aliased stddev aggregate per remaining column
stdev_matched = grouped_matched.agg(
    *[stddev(col(name)).alias(f'stddev_{name}') for name in numerical_columns]
)

stdev_matched.toPandas()

#### Means and standard deviations were extracted and used to calculate effect sizes in Excel

## Obtain Cancer Counts for each Group

In [None]:
# Matched-set rows with Any_Gyn == 1, counted per CUA_ANY group
filter_Any_Gyn = matched_set.where(col('Any_Gyn') == 1)

Any_Gyn_count = filter_Any_Gyn.groupBy('CUA_ANY').count()
Any_Gyn_count.toPandas()

In [None]:
# Matched-set rows with UTC == 1, counted per CUA_ANY group
filter_utc = matched_set.where(col('UTC') == 1)

utc_count = filter_utc.groupBy('CUA_ANY').count()
utc_count.toPandas()

In [None]:
# Matched-set rows with OVC2 == 1, counted per CUA_ANY group
filter_ovc = matched_set.where(col('OVC2') == 1)

ovc_count = filter_ovc.groupBy('CUA_ANY').count()
ovc_count.toPandas()

In [None]:
# Matched-set rows with OV_FT == 1, counted per CUA_ANY group
filter_OV_FT = matched_set.where(col('OV_FT') == 1)

OV_FT_count = filter_OV_FT.groupBy('CUA_ANY').count()
OV_FT_count.toPandas()

In [None]:
# Matched-set rows with CVX == 1, counted per CUA_ANY group
filter_cvxc = matched_set.where(col('CVX') == 1)

cvxc_count = filter_cvxc.groupBy('CUA_ANY').count()
cvxc_count.toPandas()

In [None]:
# Matched-set rows with Breastc == 1, counted per CUA_ANY group
filter_breastc = matched_set.where(col('Breastc') == 1)

breastc_count = filter_breastc.groupBy('CUA_ANY').count()
breastc_count.toPandas()