In [11]:
from pyspark import SparkContext
from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col
import json


In [21]:
import pandas as pd

In [22]:
# Rename the IC2020Mission.csv header names to the upper-case names
# (UNITID / MISSIONURL / MISSION) that the merge step selects and joins on.
ic2020mission_df = pd.read_csv("./data/v2_additional_data/IC2020Mission.csv")
ic2020mission_df = ic2020mission_df.rename(
    columns={"unitid": "UNITID", "missionURL": "MISSIONURL", "mission": "MISSION"}
)
# index=False: without it pandas also writes its row index as an extra,
# unnamed first column in the updated CSV.
ic2020mission_df.to_csv("./data/v2_additional_data/IC2020Mission_updated.csv", index=False)

In [2]:
# Initialize SparkContext
# getOrCreate() returns the already-running SparkSession if there is one,
# otherwise it builds a new session.
spark = SparkSession.builder.getOrCreate()
# The SparkContext that backs the session (stopped in the notebook's last cell).
sc = spark.sparkContext

In [68]:
# import dataset
# Base college-search dataset; every additional CSV is left-joined onto this frame.
df_filename = './data/college-search-data.parquet'
# No explicit format is given, so Spark's default data source is used to read the file.
df = spark.read.load(df_filename)

In [54]:
# Report the shape of the frame as a (rows, columns) tuple.
n_rows = df.count()
n_cols = len(df.columns)
print((n_rows, n_cols))

(6694, 446)


In [52]:
# JSON file holding the column-name -> datatype-name mapping; it is read and
# rewritten each time additional columns are merged.
datatypes_route = "./assets/datatypes.json"
# Directory prefix for the additional CSV files (the file name is appended directly).
v2_additional_route = "./data/v2_additional_data/"

## Datatypes for each additional dataset

In [38]:
# One dict per additional CSV: the keys are the columns to pull from that file,
# the values are the datatype names recorded for them in the datatypes JSON.

# 10 columns
hd2020_columns = {
    "ADDR": "string",
    "GENTELE": "string",
    "INSTCAT": "integer",
    "LANDGRNT": "integer",
    "C18IPUG": "integer",
    "C18UGPRF": "integer",
    "C18ENPRF": "integer",
    "C18SZSET": "integer",
    "IALIAS": "string",
    "INSTSIZE": "integer"
}

# 10 active columns (12 counting the two commented-out entries below)
ic2020_columns = {
    # "RELAFFIL": "integer",
    # "OPENADMP": "integer",
    "SLO5": "integer",
    "ASSOC1": "integer",
    "SPORT1": "integer",
    "SPORT2": "integer",
    "SPORT3": "integer",
    "SPORT4": "integer",
    "CALSYS": "integer",
    "APPLFEEU": "integer",
    "FT_UG": "integer",
    "RMBRDAMT": "integer"
}

# 11 columns
adm2020_columns = {
    "ADMCON1": "integer",
    "ADMCON2": "integer",
    "ADMCON3": "integer",
    "ADMCON4": "integer",
    "ADMCON5": "integer",
    "ADMCON6": "integer",
    "ADMCON7": "integer",
    "SATPCT": "integer",
    "ACTPCT": "integer",
    "ENRLM": "integer",
    "ENRLW": "integer"
}

# 5 columns
drvadm2020_columns = {
    "DVADM02": "integer",
    "DVADM03": "integer",
    "DVADM08": "integer",
    "DVADM09": "integer",
    "DVADM04": "integer"
}

# 1 column
ic2020mission_columns = {
    "MISSION": "string"
}

# 4 columns
drvic2020_columns = {
    "CINSON": "integer",
    "COTSON": "integer",
    "CINSOFF": "integer",
    "COTSOFF": "integer"
}

# 2 columns
ic2020_ay_columns = {
    "TUITION2": "integer",
    "TUITION3": "integer"
}

# 1 column (currently disabled: the whole dict is commented out)
# ef2020a_columns = {
#     "EFNRALT": "integer"
# }

# 2 columns
ef2020b_columns = {
    "EFAGE07": "integer",
    "EFAGE08": "integer"
}

# 3 columns
# The percent-enrolled-by-ethnicity columns are not added here; the original
# note says they are "currently included" (presumably already present in the
# base dataset; TODO confirm).
drvef2020_columns = {
    "ENRTOT": "integer",
    "EFUG": "integer",
    "EFGRAD": "integer"
}

# 1 column
ef2020d_columns = {
    "STUFACR": "integer"
}

# 49 active columns overall (52 counting the three commented-out entries above)

## Preprocessing Function

In [56]:
def concatenate_new_data(datatype_dict, dataset_name, df):
    """Left-join selected columns of an additional CSV onto ``df`` by ``UNITID``.

    The columns named in ``datatype_dict`` are read from the CSV at
    ``v2_additional_route + dataset_name`` and joined onto ``df``. Their
    datatype names are merged into the JSON file at ``datatypes_route``.

    Parameters
    ----------
    datatype_dict : dict
        Maps each column to pull from the CSV to the datatype name that is
        recorded for it in the datatypes JSON.
    dataset_name : str
        File name of the CSV, appended to ``v2_additional_route``.
    df : pyspark.sql.DataFrame
        Frame to extend; it is joined on its ``UNITID`` column.

    Returns
    -------
    pyspark.sql.DataFrame
        ``df`` with the requested columns added. The ``(row_count,
        column_count)`` of the result is printed as a side effect.

    Notes
    -----
    * The CSV is read and the requested columns are selected *before* the
      datatypes JSON is rewritten, so a missing file or column no longer
      leaves the JSON updated for data that was never merged.
    * Columns already present in ``df`` are dropped before the join, so
      re-running the same merge replaces them instead of producing duplicate
      (ambiguous) column names.
    * A warning is emitted when the CSV has more than one row for some
      ``UNITID``, because the left join then multiplies the rows of ``df``.
    """
    import warnings

    # "UNITID" is the join key, never a payload column.
    new_columns = [name for name in datatype_dict if name != "UNITID"]

    v2df = spark.read.csv(v2_additional_route + dataset_name, header=True, inferSchema=True)
    # inferSchema may type UNITID as a number; cast it to string for the join
    # (presumably df stores UNITID as a string; verify against the base dataset).
    v2df = v2df.withColumn("UNITID", F.col("UNITID").cast("string"))
    v2df = v2df.select(new_columns + ["UNITID"])

    # A right side with repeated keys turns the left join into a one-to-many join.
    has_repeated_keys = (
        v2df.groupBy("UNITID").count().filter(F.col("count") > 1).limit(1).count() > 0
    )
    if has_repeated_keys:
        warnings.warn(
            f"{dataset_name} has several rows per UNITID; "
            "the left join will multiply the rows of df"
        )

    # Only touch the datatypes file once the new data is known to be readable.
    with open(datatypes_route) as f:
        cur_datatypes = json.load(f)
    cur_datatypes.update(datatype_dict)
    with open(datatypes_route, 'w') as f:
        f.write(json.dumps(cur_datatypes, indent=4))

    # Make the merge re-runnable: replace earlier copies of the same columns.
    already_present = [name for name in new_columns if name in df.columns]
    if already_present:
        df = df.drop(*already_present)

    df = df.join(v2df, "UNITID", "left")
    print((df.count(), len(df.columns)))
    return df
    

## Merge Dataset

In [69]:
# Merge the additional datasets onto df one after another, in this order.
additional_sources = [
    (hd2020_columns, "HD2020.csv"),
    (ic2020_columns, "IC2020.csv"),
    (adm2020_columns, "ADM2020.csv"),
    (drvadm2020_columns, "DRVADM2020.csv"),
    (ic2020mission_columns, "IC2020Mission_updated.csv"),
    (drvic2020_columns, "DRVIC2020.csv"),
    (ic2020_ay_columns, "IC2020_AY.csv"),
]
for column_types, csv_name in additional_sources:
    df = concatenate_new_data(column_types, csv_name, df)


(6694, 456)
(6694, 466)
(6694, 477)
(6694, 482)
(6694, 483)
(6694, 487)
(6694, 489)


In [70]:
# Number of distinct UNITID values in the merged frame.
df.select("UNITID").dropDuplicates().count()

6694

In [71]:
# NOTE(review): in the recorded output this join grows the frame from 6694 to
# 81498 rows, so EF2020B.csv evidently has several rows per UNITID and the
# result is no longer one row per UNITID.
# TODO: reduce EF2020B to one row per UNITID before joining.
df = concatenate_new_data(ef2020b_columns, "EF2020B.csv", df)

(81498, 491)


In [76]:
# Number of distinct (EFAGE07, EFAGE08, UNITID) combinations.
key_columns = ["EFAGE07", "EFAGE08", "UNITID"]
df.select(key_columns).dropDuplicates().count()

58277

In [None]:
# df = concatenate_new_data(ef2020a_columns, "EF2020A.csv", df)
# EF2020B.csv is already merged in an earlier cell; joining it a second time
# would add a second copy of EFAGE07/EFAGE08 and multiply the rows again, so
# it is not repeated here.
df = concatenate_new_data(drvef2020_columns, "DRVEF2020.csv", df)
df = concatenate_new_data(ef2020d_columns, "EF2020D.csv", df)

In [None]:
# Preview the first row of the merged frame. The original referenced
# `new_df`, which is never defined in this notebook; the merged data is `df`.
df.head()

In [65]:


# hd2020_colName = list(hd2020_columns.keys())
# hd2020_colName.append("UNITID")

In [18]:
# Record the HD2020 column datatypes in the datatypes JSON.
with open(datatypes_route) as f:
    cur_datatypes = json.load(f)

# Merge straight from the dict instead of looping over hd2020_colName: that
# list is not defined on a fresh kernel, and once "UNITID" is appended to it
# the lookup hd2020_columns["UNITID"] would raise KeyError.
cur_datatypes.update(hd2020_columns)

with open(datatypes_route, 'w') as f:
    f.write(json.dumps(cur_datatypes, indent=4))

In [53]:
# Read HD2020.csv and keep only the columns to merge plus the join key.
hd2020 = spark.read.csv("./data/v2_additional_data/HD2020.csv", header=True, inferSchema=True)
# inferSchema may type UNITID as a number; cast it to string for the join.
hd2020 = hd2020.withColumn("UNITID", F.col("UNITID").cast("string"))
# Build the selection from hd2020_columns directly; hd2020_colName is not
# defined on a fresh kernel (its definition is commented out).
hd2020 = hd2020.select(*hd2020_columns, "UNITID")

In [54]:
# The HD2020 columns may already be in df (they are merged earlier in the
# notebook). Drop any existing copies first so this join replaces them instead
# of creating duplicate, ambiguous column names. Dropping is a no-op for
# columns that are not present.
hd2020_new_cols = [name for name in hd2020.columns if name != "UNITID"]
df = df.drop(*hd2020_new_cols).join(hd2020, "UNITID", "left")

In [55]:
# Report the shape of the frame as a (rows, columns) tuple.
n_rows = df.count()
n_cols = len(df.columns)
print((n_rows, n_cols))

(6694, 456)


In [56]:
# Turn the literal strings 'NULL' and 'null' into real missing values.
null_markers = {'NULL': None, 'null': None}
df = df.replace(null_markers)

In [None]:
# Shut down the SparkContext; `spark` and any DataFrame built from it cannot be
# used after this point without creating a new session.
sc.stop()