# Set up assets that are needed for data preprocessing

References:

https://github.com/Cledge-org/cledge/blob/dev/features/college_search_tool/assets/

https://github.com/Cledge-org/cledge/blob/dev/features/college_search_tool/src/college_search_index_builder.ipynb

## 1. Update necessary fields

In [60]:
import pandas as pd
import json
from pyspark import SparkContext
from pyspark.sql import *
import pyspark.sql.functions as F

In [62]:
# Initialize SparkContext
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [53]:
# read the data
data = pd.read_csv("./data/Most-Recent-Cohorts-All-Data-Elements.csv")
data_dict = pd.read_csv("./data/Institution_data_dictionary.csv")

data.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(6694, 2989)

In [54]:
# read previous fields.json file
with open("./assets/fields.json") as prevFields:
    prevfields_json = json.load(prevFields)

# Select columns
data_dict_name_map = data_dict[['VARIABLE NAME', 'developer-friendly name']]

# re-insert all variable-name & developer-friendly name pairs
for variable_name in prevfields_json.keys():
    prevfields_json[variable_name] = data_dict_name_map.loc[data_dict_name_map["VARIABLE NAME"] == variable_name, "developer-friendly name"].values[0]

# write the updated fields into fields.json
with open("./assets/fields.json", "w") as curFields:
    curFields.write(json.dumps(prevfields_json, indent=4))

# filter dataset with only fields in the fields.json
with open("./assets/fields.json") as curFields:
    dataFields = json.load(curFields)
    data = data[list(dataFields.keys())]
data.shape

(6694, 472)

## 2. Update and Check Data Types

In [44]:
# read previous datatypes.json file
with open("./assets/datatypes.json") as prevDTypes:
    prevDTypes_json = json.load(prevDTypes)

# Select API data type and VARIABLE NAME from data_dict
data_dict_data_type = data_dict[['VARIABLE NAME', 'API data type']]

for dtype_name in prevDTypes_json.keys():
    prevDTypes_json[dtype_name] = data_dict_data_type.loc[data_dict_data_type['VARIABLE NAME'] == dtype_name, "API data type"].values[0]

# write the updated fields into datatypes.json
with open("./assets/datatypes.json", "w") as curDTypes:
    curDTypes.write(json.dumps(prevDTypes_json, indent=4))

# manually edit "autocomplete" data type into "string" ("INSTNM" and "CITY")

In [61]:
# save current data into csv
data.to_csv("./data/Necessary-Fields-Data.csv")

In [71]:
# read dataset using PySpark
df = spark.read.csv("./data/Necessary-Fields-Data.csv", header=True, inferSchema=True)

In [65]:
# use PySpark to check and convert data types
with open('./assets/datatypes.json') as f:
    datatypes = json.load(f)

# cast columns to correct datatypes
for field, datatype in datatypes.items(): # takes a few mins to run
    if field not in df.columns:
        continue
    curr_type = dict(df.dtypes)[field]
    if curr_type != datatype and not datatype.startswith(curr_type):
        df = df.withColumn(field, F.col(field).cast(datatype))

In [72]:
print(len(df.columns))

473


## 3. Filter Empty (Null) Values

In [80]:
# Replace all "NULL" values in dataframe with literal null values so that isnull() can be used
df = df.replace({'NULL': None, 'null': None})

In [84]:
null_counts = df.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns]).collect()[0].asDict()

In [85]:
# Drop fields that are completely null
num_rows = df.count()

for field, null_count in null_counts.items():
    if null_count >= num_rows: # all values of field are null, drop the field
        df = df.drop(field)
print(len(df.columns)) # number of fields that are not completely null

447


In [86]:
# Save dictionary of null value counts for reference
with open('./assets/null_counts.json', 'w') as f:
    json.dump(null_counts, f)

In [87]:
# Test that casting worked: Howard University should be true for HBCU
df.where(df.INSTNM.contains('Howard')).select(["INSTNM", "HBCU"]).show(truncate=False)

+---------------------------------+----+
|INSTNM                           |HBCU|
+---------------------------------+----+
|Howard University                |1.0 |
|Howard Community College         |0.0 |
|Specs Howard School of Media Arts|0.0 |
|Howard College                   |0.0 |
|Howard Payne University          |0.0 |
+---------------------------------+----+



In [88]:
# replace "CCBASIC" with any field name (or add other field names to the select list) from fields.json to print out specific fields for UW campuses
df.where(df.INSTNM.contains('University of Washington')).select(["INSTNM", "CCBASIC"]).show(truncate=False)

+---------------------------------------+-------+
|INSTNM                                 |CCBASIC|
+---------------------------------------+-------+
|University of Washington-Seattle Campus|15.0   |
|University of Washington-Bothell Campus|18.0   |
|University of Washington-Tacoma Campus |18.0   |
+---------------------------------------+-------+



In [None]:
# save the dataframe
df_filename = './data/college-search-data.parquet'
df.write.save(df_filename)

In [None]:
sc.stop()

In [59]:
# with open("./assets/datatypes.json") as curDTypes:
#     cur_dataTypes = json.load(curDTypes)

# for column in data.columns:
#     col_dataType = str(data[column].dtypes)
#     expect_dataType = cur_dataTypes[column]
#     is_equal = False
#     if expect_dataType == "integer":
#         if not col_dataType == "int64":
#             data[column].astype("int", errors='ignore')
#     elif expect_dataType == "float":
#         if not col_dataType == "float64":
#             data[column].astype("float", errors='ignore')
#     else:
#         if not col_dataType == "object":
#             data[column].astype("string", errors='ignore')

In [58]:
# for column in data.columns:
#     col_dataType = str(data[column].dtypes)
#     expect_dataType = cur_dataTypes[column]
#     is_equal = False
#     if expect_dataType == "integer":
#         if col_dataType == "int64":
#             is_equal = True
#     elif expect_dataType == "float":
#         if col_dataType == "float64":
#             is_equal = True
#     else:
#         if col_dataType == "string" or col_dataType == "object":
#             is_equal = True
#     if not is_equal:
#         print(column)