In [78]:
from pyspark import SparkContext
from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col
import json


In [79]:
import pandas as pd

In [80]:
# Upper-case the IC2020Mission.csv column names and save a renamed copy.
# The merge step reads the "_updated" file and selects "UNITID" and "MISSION"
# by exactly those names, so the lower/mixed-case originals would not resolve.
ic2020mission_df = pd.read_csv("./data/v2_additional_data/IC2020Mission.csv")
ic2020mission_df = ic2020mission_df.rename(
    columns={"unitid": "UNITID", "missionURL": "MISSIONURL", "mission": "MISSION"}
)
# index=False: without it pandas also writes its row index as an extra,
# unnamed first column that then shows up when the file is read back.
ic2020mission_df.to_csv(
    "./data/v2_additional_data/IC2020Mission_updated.csv", index=False
)

In [81]:
# Get the active SparkSession (creating one if none exists) and keep a handle
# to its underlying SparkContext; `sc` is only used to stop Spark at the end.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [82]:
# Load the base college-search dataset that the additional files are joined onto.
# No format is passed to load(), so Spark's default data source is used
# (parquet unless spark.sql.sources.default has been overridden).
df_filename = './data/college-search-data.parquet'
df = spark.read.load(df_filename)

In [83]:
# Shape of the base dataset as (row count, column count).
print((df.count(), len(df.columns)))

(6694, 446)


In [84]:
# JSON file mapping column name -> datatype label; concatenate_new_data()
# reads it, adds the new columns and writes it back.
datatypes_route = "./assets/datatypes.json"
# Directory holding the additional CSV files; file names are appended to this
# string directly, so the trailing slash is required.
v2_additional_route = "./data/v2_additional_data/"

## Datatypes for each additional dataset

103 columns overall: 105 are listed below, minus the 2 that are commented out in `ic2020_columns`.

In [85]:
# Column specifications, one dict per additional CSV file.
# Each dict maps a column name to its datatype label ("integer" or "string").
# Key order is kept as listed, since it determines the column order after
# the merge and the order of entries written to the datatypes JSON.

# 10 columns
hd2020_columns = {
    "ADDR": "string",
    "GENTELE": "string",
    **dict.fromkeys(
        ["INSTCAT", "LANDGRNT", "C18IPUG", "C18UGPRF", "C18ENPRF", "C18SZSET"],
        "integer",
    ),
    "IALIAS": "string",
    "INSTSIZE": "integer",
}

# 12 columns - 2 columns
# "RELAFFIL" and "OPENADMP" (both "integer") are currently disabled.
ic2020_columns = dict.fromkeys(
    [
        "SLO5", "ASSOC1",
        "SPORT1", "SPORT2", "SPORT3", "SPORT4",
        "CALSYS", "APPLFEEU", "FT_UG", "RMBRDAMT",
    ],
    "integer",
)

# 11 columns
adm2020_columns = dict.fromkeys(
    [
        "ADMCON1", "ADMCON2", "ADMCON3", "ADMCON4",
        "ADMCON5", "ADMCON6", "ADMCON7",
        "SATPCT", "ACTPCT", "ENRLM", "ENRLW",
    ],
    "integer",
)

# 5 columns
drvadm2020_columns = dict.fromkeys(
    ["DVADM02", "DVADM03", "DVADM08", "DVADM09", "DVADM04"],
    "integer",
)

# 1 column
ic2020mission_columns = {"MISSION": "string"}

# 5 columns
drvic2020_columns = dict.fromkeys(
    ["CINSON", "COTSON", "CINSOFF", "COTSOFF", "TUFEYR3"],
    "integer",
)

# 2 columns
ic2020_ay_columns = dict.fromkeys(["TUITION2", "TUITION3"], "integer")

# 1 column
ef2020a_columns = {"EFNRALT": "integer"}

# 2 columns
ef2020b_columns = dict.fromkeys(["EFAGE07", "EFAGE08"], "integer")

# 13 columns
drvef2020_columns = dict.fromkeys(
    [
        "ENRTOT", "EFUG", "EFGRAD",
        "RMOUSTTP", "RMINSTTP", "RMUNKNWP",
        "PCTENRWH", "PCTENRBK", "PCTENRHS", "PCTENRAP",
        "PCTENRAN", "PCTENRUN", "PCTENRNR",
    ],
    "integer",
)

# 1 column
ef2020d_columns = {"STUFACR": "integer"}

# new dataset:
# 3 columns
sal2020_is_columns = dict.fromkeys(["SAINSTT", "SAINSTW", "SAINSTM"], "integer")

# 1 column
sal2020_nis_columns = {"SANIN02": "integer"}

# 6 columns
f1920_f2_columns = dict.fromkeys(
    ["F2C01", "F2C02", "F2C03", "F2C04", "F2C07", "F2E081"],
    "integer",
)

# 3 columns
drvf2020_columns = dict.fromkeys(["F1STSVFT", "F1ACSPFT", "F1OTEXFT"], "integer")

# 15 columns
sfa1920_p2_columns = dict.fromkeys(
    [
        "GIS4OF1", "GIS4A41", "GIS4T51",
        "NPT430", "NPT440", "NPT450",
        "GRN4G11", "GRN4G21", "GRN4G31",
        "GIS4A12", "GIS4A22", "GIS4A32", "GIS4A42", "GIS4A52",
        "NPIST2",
    ],
    "integer",
)

# 2 columns
gr200_20_columns = dict.fromkeys(["BAGR100", "BAGR150"], "integer")

# 2 columns
gr2020_pell_ssl_columns = dict.fromkeys(["NRCMOBA", "NRCMTOT"], "integer")

# 10 columns
sfa1920_p1_columns = dict.fromkeys(
    [
        "SCFA2", "ANYAIDP", "PGRNT_P", "OFGRT_P", "FLOAN_P",
        "OLOAN_P", "UAGRNTP", "UPGRNTP", "UFLOANP", "AGRNT_A",
    ],
    "integer",
)

# 105 - 2 columns overall

## Preprocessing Function

In [86]:
def concatenate_new_data(datatype_dict, dataset_name, df):
    """Left-join selected columns of one additional CSV onto ``df`` by UNITID.

    Parameters
    ----------
    datatype_dict : dict
        Maps each column to take from the CSV to its datatype label
        (e.g. "integer", "string"). The labels are only recorded in the
        datatypes JSON file; they are not used to cast the Spark columns.
    dataset_name : str
        CSV file name; appended directly to ``v2_additional_route``.
    df : pyspark.sql.DataFrame
        Frame to extend. It is joined on its "UNITID" column.

    Returns
    -------
    pyspark.sql.DataFrame
        ``df`` left-joined with the requested columns of the CSV.

    Side effects
    ------------
    Merges ``datatype_dict`` into the JSON file at ``datatypes_route`` and
    prints the resulting ``(row count, column count)``. A warning is issued
    when the CSV has repeated UNITID values, because the left join then
    produces more than one output row per matching row of ``df``.
    """
    import warnings

    new_columns = list(datatype_dict)

    # Read and validate the CSV first: a wrong file name or a missing column
    # raises here, before the datatypes file has been touched.
    v2df = spark.read.csv(v2_additional_route + dataset_name, header=True, inferSchema=True)
    v2df = v2df.withColumn("UNITID", F.col("UNITID").cast("string"))
    v2df = v2df.select(new_columns + ["UNITID"])

    # A left join on a non-unique key multiplies rows, so flag that up front.
    csv_rows = v2df.count()
    csv_unit_ids = v2df.select("UNITID").distinct().count()
    if csv_rows != csv_unit_ids:
        warnings.warn(
            f"{dataset_name}: {csv_rows} rows for {csv_unit_ids} distinct UNITID values; "
            "the left join will duplicate rows of df"
        )

    df = df.join(v2df, "UNITID", "left")
    shape = (df.count(), len(df.columns))

    # Record the new columns only after the join has actually executed, so the
    # datatypes file never lists columns that failed to merge.
    with open(datatypes_route) as f:
        cur_datatypes = json.load(f)
    cur_datatypes.update(datatype_dict)
    with open(datatypes_route, 'w') as f:
        json.dump(cur_datatypes, f, indent=4)

    print(shape)
    return df
    

## Merge Dataset

In [87]:
# Merge each additional dataset into df, one left join on UNITID per file.
# Every call pairs a *_columns spec with the CSV file it describes.
df = concatenate_new_data(hd2020_columns, "HD2020.csv", df)
df = concatenate_new_data(ic2020_columns, "IC2020.csv", df)
df = concatenate_new_data(adm2020_columns, "ADM2020.csv", df)
df = concatenate_new_data(drvadm2020_columns, "DRVADM2020.csv", df)
df = concatenate_new_data(ic2020mission_columns, "IC2020Mission_updated.csv", df)
df = concatenate_new_data(drvic2020_columns, "DRVIC2020.csv", df)
df = concatenate_new_data(ic2020_ay_columns, "IC2020_AY.csv", df)
# EF2020A / EF2020B / EF2020D are kept out of the merge. EF2020B has several
# rows per UNITID: joining it in a later cell takes df from 6694 to 81498 rows.
# NOTE(review): EF2020A and EF2020D are presumably disabled for the same reason - confirm.
# df = concatenate_new_data(ef2020a_columns, "EF2020A.csv", df)
# df = concatenate_new_data(ef2020b_columns, "EF2020B.csv", df)
df = concatenate_new_data(drvef2020_columns, "DRVEF2020.csv", df)
# df = concatenate_new_data(ef2020d_columns, "EF2020D.csv", df)
# The calls below previously all read "DRVEF2020.csv", which does not match
# their column specs; each now reads the file its spec is named after.
# NOTE(review): file-name casing follows the other calls - confirm against the
# data directory, and check the printed row count stays at 6694 for each file.
df = concatenate_new_data(sal2020_is_columns, "SAL2020_IS.csv", df)
df = concatenate_new_data(sal2020_nis_columns, "SAL2020_NIS.csv", df)
df = concatenate_new_data(f1920_f2_columns, "F1920_F2.csv", df)
df = concatenate_new_data(drvf2020_columns, "DRVF2020.csv", df)
df = concatenate_new_data(sfa1920_p2_columns, "SFA1920_P2.csv", df)
df = concatenate_new_data(gr200_20_columns, "GR200_20.csv", df)
df = concatenate_new_data(gr2020_pell_ssl_columns, "GR2020_PELL_SSL.csv", df)
df = concatenate_new_data(sfa1920_p1_columns, "SFA1920_P1.csv", df)


(6694, 456)
(6694, 466)
(6694, 477)
(6694, 482)
(6694, 483)
(6694, 487)
(6694, 489)
(6694, 492)


In [70]:
# Sanity check: number of distinct UNITID values in the merged frame.
df.select("UNITID").distinct().count()

6694

In [71]:
# Experiment: join EF2020B as well. The recorded output below shows the row
# count jumping from 6694 to 81498, i.e. EF2020B.csv has several rows per
# UNITID and the left join duplicates rows of df. Note this rebinds df.
df = concatenate_new_data(ef2020b_columns, "EF2020B.csv", df)

(81498, 491)


In [76]:
# Count distinct (EFAGE07, EFAGE08, UNITID) combinations after the EF2020B join.
df.select("EFAGE07", "EFAGE08", "UNITID").distinct().count()

58277

## Add College Fit Metric

In [None]:
import pandas as pd

# Use pandas for this step: PySpark DataFrames are immutable, so individual values cannot be updated in place.

In [None]:
def target_tier(adm_rate):
    """Return the "target" tier number for an admission rate.

    The rate is compared against ascending upper bounds; the tier of the
    first bound that ``adm_rate`` is strictly below is returned. Rates of
    0.90 and above (and anything that is below no bound) map to tier 0.
    Tiers 11 and 12 are never returned: the table goes from 13 straight to 10.
    """
    # (exclusive upper bound, tier), in ascending order of bound.
    cutoffs = (
        (0.1, 13),
        (0.14, 10),
        (0.17, 9),
        (0.21, 8),
        (0.26, 7),
        (0.33, 6),
        (0.42, 5),
        (0.55, 4),
        (0.75, 3),
        (0.85, 2),
        (0.90, 1),
    )
    for upper_bound, tier in cutoffs:
        if adm_rate < upper_bound:
            return tier
    return 0

def safety_tier(adm_rate):
    """Return the "safety" tier number for an admission rate.

    The rate is compared against ascending upper bounds; the tier of the
    first bound that ``adm_rate`` is strictly below is returned. Rates of
    0.95 and above (and anything that is below no bound) map to tier 0.
    Tiers 12, 2 and 1 are never returned by this table.
    """
    # (exclusive upper bound, tier), in ascending order of bound.
    cutoffs = (
        (0.1, 13),
        (0.14, 11),
        (0.18, 10),
        (0.23, 9),
        (0.31, 8),
        (0.45, 7),
        (0.6, 6),
        (0.75, 5),
        (0.9, 4),
        (0.95, 3),
    )
    for upper_bound, tier in cutoffs:
        if adm_rate < upper_bound:
            return tier
    return 0

## Replace Null Values

In [56]:
# Replace the literal strings 'NULL' and 'null' with real nulls (None).
df = df.replace({'NULL': None, 'null': None})

## Test

## Wrap Up

In [77]:
# Shut down the SparkContext taken from the session (sc = spark.sparkContext).
sc.stop()