In [1]:
from pyspark import SparkContext
from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col

In [2]:
import json

In [3]:
# Get the active SparkSession (one is created if none exists yet) and keep a
# handle to its underlying SparkContext.
spark = (
    SparkSession.builder
    .getOrCreate()
)
sc = spark.sparkContext

In [4]:
# Read the college search dataset into a Spark DataFrame.
# NOTE(review): this path points at "../data" while the JSON paths defined in
# the next cell use "./data" -- confirm the intended working directory.
df_filename = "../data/college-search-data-v3.parquet"
df = spark.read.load(path=df_filename)

In [5]:
# Locations of the JSON files used by this notebook, relative to the current
# working directory.

# Scraped inputs, each keyed by collegedata college name.
COLLEGEDATA_PATH = "./data/collegedata_general.json"
MAJORLIST_PATH = "./data/collegedata_major_list.json"
HIGHSCHOOL_UNIT_PATH = "./data/collegedata_highschool_unit.json"

# Mapping from collegedata college name to the name used for the integrated data.
MAP_PATH = "./data/college_map.json"

# Output: the integrated records written (and later re-read) by this notebook.
INTEGRATED_PATH = "./data/collegedata_integrated.json"

In [6]:
# Load the three lookup tables that the integration step joins against, and
# report how many collegedata names have a usable mapping.

# Filled in by the integration cell: mapped college name -> cleaned record.
collegedata_general_integrated = {}

# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding.
with open(MAJORLIST_PATH, encoding="utf-8") as f:
    major_list = json.load(f)

with open(HIGHSCHOOL_UNIT_PATH, encoding="utf-8") as f:
    highschool_unit = json.load(f)

with open(MAP_PATH, encoding="utf-8") as f:
    college_map = json.load(f)

# A mapping counts as valid only when it is neither null nor empty. This is
# the same test the integration loop applies before storing a record, so the
# printed number is comparable with the size of the integrated result.
valid_name_count = sum(
    1
    for mapped_name in college_map.values()
    if mapped_name is not None and mapped_name != ""
)
print(valid_name_count)

1772


## Integrate major list and clean up collegedata

In [7]:
# Scraped fields that are not used downstream, grouped by the section of the
# college record they live in.
UNUSED_FIELDS = {
    "admission": (
        "Address",
        "Phone",
        "Overall Admission Rate",
        "Students Enrolled",
        "High School Class Rank",
    ),
    "financials": (
        "Net Price Calculator",
    ),
    "campus_life": (
        "Campus Size",
        "Nearest Bus Station",
        "Nearest Train Station",
        "College Housing",
        "Types of Housing",
        "Housing Requirements",
        "Students Living Off Campus/Commuting",
        "Off-Campus Housing Assistance",
    ),
    "students": (
        "Average Starting Salary",
        "Disciplines Pursued",
    ),
}

# Read the whole file first so the handle is closed before the transformation
# loop runs. JSON text is UTF-8, so decode it explicitly.
with open(COLLEGEDATA_PATH, encoding="utf-8") as f:
    collegedata_general = json.load(f)

for college_name, college_data in collegedata_general.items():
    # remove unused variables (a field that is already absent is ignored)
    for section, field_names in UNUSED_FIELDS.items():
        for field_name in field_names:
            college_data[section].pop(field_name, None)

    # format population variable: the first key ending in " Population" is
    # renamed to "City Population". The key is located before the dict is
    # modified, so the dict is never changed while it is being iterated.
    campus_life = college_data["campus_life"]
    population_key = next(
        (key for key in campus_life if key.endswith(" Population")),
        None,
    )
    if population_key is not None:
        campus_life["City Population"] = campus_life.pop(population_key)

    # integrate major list information; every college must have an entry,
    # a missing one raises KeyError
    college_data["undergraduate majors"] = major_list[college_name]["undergraduate majors"]

    # integrate high school unit req/rec; every college must have an entry,
    # a missing one raises KeyError
    college_data["admission"]["high_school_req_rec_data"] = highschool_unit[college_name]

    # Keep only colleges with a usable mapped name. A college that is missing
    # from college_map is treated like one with a null/empty mapping.
    # If two colleges map to the same name, the later record replaces the
    # earlier one.
    college_integrate_name = college_map.get(college_name)
    if college_integrate_name is not None and college_integrate_name != "":
        collegedata_general_integrated[college_integrate_name] = college_data

In [8]:
# Save the integrated records as JSON with a 4-space indent.
with open(INTEGRATED_PATH, "w") as out_file:
    json.dump(collegedata_general_integrated, out_file, indent=4)

In [9]:
# Number of colleges in the integrated dataset.
len(collegedata_general_integrated)

1769

## Upload Scraped Data to MongoDB

In [10]:
import pymongo

In [13]:
import os

# Connection string: found in the cledge-db resource, then click Quick Start.
# It is a credential, so it is read from the MONGODB_URI environment variable
# rather than being pasted into the notebook.
uri = os.environ.get("MONGODB_URI", "")
if not uri:
    raise RuntimeError(
        "MONGODB_URI is not set; export the cledge-db connection string "
        "before running this cell."
    )
client = pymongo.MongoClient(uri)

# Database and collection that hold the college records.
db = client["colleges"]
collection = db["colleges-data"]


In [15]:
# Read the integrated JSON back from disk; the upload and update cells below
# work from `integrated_data`.
with open(INTEGRATED_PATH) as integrated_file:
    integrated_data = json.load(integrated_file)

In [33]:

# Build the INSTNM -> UNITID lookup with a single Spark job instead of running
# one filter/collect per college. setdefault keeps the first row seen for a
# repeated name, matching the earlier "take the first match" choice.
unitid_by_name = {}
for row in df.select("INSTNM", "UNITID").collect():
    unitid_by_name.setdefault(row["INSTNM"], row["UNITID"])

# Fail before touching the database if any college has no UNITID, so that an
# unmatched name cannot leave a partial upload behind.
unmatched_names = [name for name in integrated_data if name not in unitid_by_name]
if unmatched_names:
    raise LookupError(
        "No UNITID found for {} college(s), e.g. {}".format(
            len(unmatched_names), unmatched_names[:5]
        )
    )

for college_name, college_doc in integrated_data.items():
    # flatten college name and college id
    college_doc["college_name"] = college_name
    college_doc["college_id"] = unitid_by_name[college_name]

    # insert_one adds an "_id" field to college_doc in place.
    collection.insert_one(college_doc)

## Update the current database

In [17]:
# Refresh one section of each document already stored in the collection,
# matching documents by their college_name field.
for college_name, college_doc in integrated_data.items():
    # update the field you want (in this case, the admission section)
    admission_update = {"$set": {"admission": college_doc["admission"]}}
    collection.update_one({'college_name': college_name}, admission_update)

## Test Data inside Azure Cosmos DB

In [None]:
# Spot-check: fetch one uploaded college by id and display the first match.
# NOTE(review): the id is queried as a string -- confirm college_id is stored
# as a string rather than a number.
sample_query = {"college_id": "236948"}
result = collection.find(sample_query)
result[0]