In [None]:
# Show the active Spark session object as the cell output.
# NOTE(review): `spark` is not created in the visible cells - presumably injected by the kernel/environment; confirm.
spark

In [None]:
# Notebook setup: all imports first, then display configuration.
import pandas as pd
# display/HTML come from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import HTML, display
from pyspark.sql.functions import coalesce, col, when
# Aliased so the Python builtin sum() is not shadowed.
from pyspark.sql.functions import sum as spark_sum

# Keep pandas from truncating rendered frames (column width, columns, rows).
pd.set_option('display.max_colwidth', 250)  # or None for no limit (-1 is deprecated)
pd.set_option('display.max_columns', None)  # or 500
pd.set_option('display.max_rows', None)  # or 500

# Widen elements with the `.container` CSS class to 92% of the available width.
display(HTML("<style>.container { width:92% !important; }</style>"))

## Explore data

In [None]:
# Make CUA_db the current database so unqualified table names in later queries resolve against it.
spark.sql('use CUA_db')

In [None]:
# Load every row and column of concat_cua_non_table into a Spark DataFrame.
cua_non = spark.sql("""
    select *
    from concat_cua_non_table
    """)
# Bare expression so the notebook displays the DataFrame's repr.
cua_non

In [None]:
# Split the full frame on the CUA_ANY indicator.
filter_CUA = cua_non.where(col('CUA_ANY') == 1)  # treatment group
filter_non = cua_non.where(col('CUA_ANY') == 0)  # control group

In [None]:
# Row count of each group: CUA_ANY == 1 first, then CUA_ANY == 0.
# Each count() is evaluated separately against cua_non.
print(filter_CUA.count())
print(filter_non.count())

##28,462
##23,805,007

In [None]:
# Rows with AIAN == 1, taken from each group in turn (filter_CUA, then filter_non).
AIAN_cua, AIAN_non = (
    group.where(col('AIAN') == 1) for group in (filter_CUA, filter_non)
)
print(AIAN_cua.count())
print(AIAN_non.count())

#775
#628920

In [None]:
# Rows with Black == 1, taken from each group in turn (filter_CUA, then filter_non).
Black_cua, Black_non = (
    group.where(col('Black') == 1) for group in (filter_CUA, filter_non)
)
print(Black_cua.count())
print(Black_non.count())

#1411
#2425515

In [None]:
# Rows with White == 1, taken from each group in turn (filter_CUA, then filter_non).
White_cua, White_non = (
    group.where(col('White') == 1) for group in (filter_CUA, filter_non)
)
print(White_cua.count())
print(White_non.count())

#20459
#15559239

In [None]:
# Rows with Hisp_latino == 1, taken from each group in turn (filter_CUA, then filter_non).
HL_cua, HL_non = (
    group.where(col('Hisp_latino') == 1) for group in (filter_CUA, filter_non)
)
print(HL_cua.count())
print(HL_non.count())

#5465
#4119444

In [None]:
# Rows with Unknown == 1, taken from each group in turn (filter_CUA, then filter_non).
Unknown_cua, Unknown_non = (
    group.where(col('Unknown') == 1) for group in (filter_CUA, filter_non)
)
print(Unknown_cua.count())
print(Unknown_non.count())

#2760
#1757264

In [None]:
# Rows with NHPI == 1, taken from each group in turn (filter_CUA, then filter_non).
NHPI_cua, NHPI_non = (
    group.where(col('NHPI') == 1) for group in (filter_CUA, filter_non)
)
print(NHPI_cua.count())
print(NHPI_non.count())

#106
#51419

In [None]:
# Rows with MENA == 1, taken from each group in turn (filter_CUA, then filter_non).
MENA_cua, MENA_non = (
    group.where(col('MENA') == 1) for group in (filter_CUA, filter_non)
)
print(MENA_cua.count())
print(MENA_non.count())

#4
#3251

In [None]:
# Rows with API_ethn == 1, taken from each group in turn (filter_CUA, then filter_non).
API_cua, API_non = (
    group.where(col('API_ethn') == 1) for group in (filter_CUA, filter_non)
)
print(API_cua.count())
print(API_non.count())

#2
#3962

In [None]:
# Rows with mixed == 1, taken from each group in turn (filter_CUA, then filter_non).
mixed_cua, mixed_non = (
    group.where(col('mixed') == 1) for group in (filter_CUA, filter_non)
)
print(mixed_cua.count())
print(mixed_non.count())

#177
#129424

In [None]:
# Rows with Metropol == 1, taken from each group in turn (filter_CUA, then filter_non).
metro_cua, metro_non = (
    group.where(col('Metropol') == 1) for group in (filter_CUA, filter_non)
)
print(metro_cua.count())
print(metro_non.count())

#23220
#18974699

In [None]:
# Rows with no_metro == 1, taken from each group in turn (filter_CUA, then filter_non).
nmetro_cua, nmetro_non = (
    group.where(col('no_metro') == 1) for group in (filter_CUA, filter_non)
)
print(nmetro_cua.count())
print(nmetro_non.count())

#1012
#1409764

In [None]:
# Rows with urbn == 1, taken from each group in turn (filter_CUA, then filter_non).
urban_cua, urban_non = (
    group.where(col('urbn') == 1) for group in (filter_CUA, filter_non)
)
print(urban_cua.count())
print(urban_non.count())

#22910
#18750236

In [None]:
# Rows with rural == 1, taken from each group in turn (filter_CUA, then filter_non).
rural_cua, rural_non = (
    group.where(col('rural') == 1) for group in (filter_CUA, filter_non)
)
print(rural_cua.count())
print(rural_non.count())

#4539
#3645962

In [None]:
# Rows with no_metro == 1, taken from each group in turn (filter_CUA, then filter_non).
# NOTE(review): this applies the same no_metro filter as the nmetro_cua / nmetro_non
# cell; kept because the nm_* names may be referenced elsewhere - confirm and dedupe.
nm_cua, nm_non = (
    group.where(col('no_metro') == 1) for group in (filter_CUA, filter_non)
)
print(nm_cua.count())
print(nm_non.count())

#1012
#1409764

In [None]:
# Rows with no_urban == 1, taken from each group in turn (filter_CUA, then filter_non).
nu_cua, nu_non = (
    group.where(col('no_urban') == 1) for group in (filter_CUA, filter_non)
)
print(nu_cua.count())
print(nu_non.count())

#1013
#1408479

## Merge / coalesce

In [None]:
# Add any_csect: 1 when at least one of csect, lcsect or ccsect equals 1, otherwise 0.
merged_csect = cua_non.withColumn(
    'any_csect',
    when(
        (col('csect') == 1) | (col('lcsect') == 1) | (col('ccsect') == 1),
        1,
    ).otherwise(0),
)

In [None]:
# Re-split on CUA_ANY, this time from the frame that carries any_csect.
filter_CUA1 = merged_csect.where(col('CUA_ANY') == 1)  # treatment group
filter_non1 = merged_csect.where(col('CUA_ANY') == 0)  # control group

In [None]:
# Rows with any_csect == 1, taken from each group in turn (filter_CUA1, then filter_non1).
any_csects_cua, any_csects_non = (
    group.where(col('any_csect') == 1) for group in (filter_CUA1, filter_non1)
)
print(any_csects_cua.count())
print(any_csects_non.count())

#7321
#876829

In [None]:
# Add Other_plus: 1 when any of mixed, API_ethn, MENA, AIAN, NHPI, Asian or
# Other equals 1, otherwise 0.
# The columns are referenced by name via col() so the condition resolves
# against merged_csect (the frame being extended) instead of through column
# handles taken from its parent frame cua_non.
merged_races = merged_csect.withColumn(
    'Other_plus',
    when(
        (col('mixed') == 1) | (col('API_ethn') == 1) | (col('MENA') == 1)
        | (col('AIAN') == 1) | (col('NHPI') == 1) | (col('Asian') == 1)
        | (col('Other') == 1),
        1,
    ).otherwise(0),
)

In [None]:
# Preview: convert only the first 5 rows to a pandas DataFrame for display.
merged_races.limit(5).toPandas()

In [None]:
# Re-split on CUA_ANY from the frame that carries Other_plus.
filter_CUA2 = merged_races.where(col('CUA_ANY') == 1)  # treatment group
filter_non2 = merged_races.where(col('CUA_ANY') == 0)

In [None]:
# Rows with Other_plus == 1, taken from each group in turn (filter_CUA2, then filter_non2).
any_race_cua, any_race_non = (
    group.where(col('Other_plus') == 1) for group in (filter_CUA2, filter_non2)
)
print(any_race_cua.count())
print(any_race_non.count())

#3993
#2789225

In [None]:
# Persist the frame with any_csect and Other_plus as CUA_db.consolidated_cua_non.
# The default save mode raises if the table already exists, so a second
# Restart & Run All would fail on this cell; an explicit overwrite keeps the
# notebook re-runnable.
# NOTE(review): this replaces any existing contents of the table - confirm
# nothing else writes to it.
merged_races.write.mode('overwrite').saveAsTable('CUA_db.consolidated_cua_non')