In [51]:
import json
import os

import pandas as pd
from pyspark import SparkContext
from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col

# Get the active SparkSession (creating one if none exists) and keep a handle
# on its SparkContext; `sc` is what the final cell stops.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [163]:
# Input dataset that this notebook extends with additional columns.
V3_DATA_PATH = "./data/college-search-data-v3+.parquet"
# JSON file of column name -> datatype entries; add_datatypes() rewrites it in place.
DATA_TYPES_PATH = "./assets/datatypes.json"

In [164]:
# Load the base dataset. No format is passed, so Spark's default data source
# is used (presumably parquet, matching the file extension -- confirm the
# session config if this ever changes).
df = spark.read.load(V3_DATA_PATH)

## Add APPLCN from ADM2020

In [24]:
# add APPLCN (in IPEDS datasets, ADM2020) to the dataset from cosmos db

def add_datatypes(data_columns, path=None):
    """Merge column datatypes into the JSON datatypes file.

    The file is read, updated with every entry of ``data_columns`` (existing
    keys are overwritten, new keys are appended) and written back with a
    4-space indent.

    Args:
        data_columns: Mapping of column name -> datatype name,
            e.g. ``{"APPLCN": "integer"}``.
        path: JSON file to update. Defaults to the notebook-level
            ``DATA_TYPES_PATH`` so existing single-argument calls behave
            as before.

    Raises:
        FileNotFoundError: If the datatypes file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if path is None:
        path = DATA_TYPES_PATH

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        cur_datatypes = json.load(f)

    cur_datatypes.update(data_columns)

    with open(path, "w", encoding="utf-8") as f:
        json.dump(cur_datatypes, f, indent=4)

In [25]:
# Load the ADM2020 CSV; the first row holds column names and column types
# are inferred by Spark.
adm2020_df = spark.read.csv(
    "./data/v2_additional_data/ADM2020.csv",
    header=True,
    inferSchema=True,
)

# Record the datatype of the column that this section adds to the dataset.
add_columns = {"APPLCN": "integer"}
add_datatypes(add_columns)

In [26]:
# Keep only the join key and the new column. UNITID is cast to string before
# the join on it (presumably because the base dataset stores UNITID as a
# string -- verify against the parquet schema).
override_adm = (
    adm2020_df
    .select("UNITID", "APPLCN")
    .withColumn("UNITID", F.col("UNITID").cast("string"))
)

In [27]:
# Left join keeps every row of the base dataset; rows without a matching
# UNITID in ADM2020 get a null APPLCN.
# NOTE(review): `df` is reassigned from itself, so re-running this cell joins
# APPLCN a second time.
df = df.join(override_adm, on="UNITID", how="left")

## Add ranking data from Cosmos DB

In [165]:
import pymongo

In [166]:
# Ranking names to add to the dataset, each registered with the "integer"
# datatype. The spelling (including irregular spacing around "/") is kept
# exactly as-is because these strings are used as lookup keys further down.
rank_add_columns = dict.fromkeys(
    [
        "National Universities",
        "Study Abroad",
        "Learning Communities",
        "Undergraduate Research/Creative Projects",
        "Service Learning",
        "First-Year Experiences",
        "Senior Capstone",
        "Best Undergraduate Teaching",
        "Top Performers on Social Mobility",
        "Regional Colleges South",
        "Top Public Schools",
        "Regional Universities West",
        "Best Value Schools",
        "Regional Colleges Midwest",
        "Most Innovative Schools",
        "National Liberal Arts Colleges",
        "Writing in the Disciplines",
        "Co-ops/Internships",
        "Historically Black Colleges and Universities",
        "Regional Colleges West",
        "Regional Universities South",
        "Regional Universities North",
        "Best Colleges for Veterans",
        "Regional Universities Midwest",
        "Regional Colleges North",
        "Tribal Schools",
        "Best Undergraduate Engineering Programs",
        "Arts Schools",
        "Faith-related Schools",
        "Business Schools",
        "Miscellaneous Schools",
        "Engineering & Technology Schools",
        "Other Schools",
        "Health Professions Schools",
        "Nursing",
        "International Business",
        "Analytics",
        "Management Information Systems",
        "Production / Operation Management",
        "Supply Chain Management / Logistics",
        "Entrepreneurship",
        "Accounting",
        "Finance",
        "Civil",
        "Computer",
        "Electrical / Electronic / Communications",
        "Mechanical",
        "Aerospace /Aeronautical / Astronautical",
        "Chemical",
        "Materials",
        "Computer Science",
        "Artificial Intelligence",
        "Theory",
        "Environmental / Environmental Health",
        "Business Programs",
        "Quantitative Analysis",
        "Biocomputing/Bioinformatics/Biotechnology",
        "Computer Systems",
        "Cybersecurity",
        "Data Analytics/Science",
        "Game/Simulation Development",
        "Mobile/Web Applications",
        "Programming Languages",
        "Software Engineering",
        "Biological / Agricultural",
        "Industrial / Manufacturing",
        "Management",
        "Real Estate",
        "Biomedical",
        "Insurance",
        "Marketing",
        "Petroleum",
    ],
    "integer",
)

# Register the raw ranking names in the datatypes file.
add_datatypes(rank_add_columns)

In [167]:
# Column-safe variants of the ranking names: spaces, "/", "-" and "&" are
# removed (e.g. "Co-ops/Internships" -> "CoopsInternships"). Every variant is
# typed "integer".
rank_add_columns_valid = {
    name.replace(" ", "").replace("/", "").replace("-", "").replace("&", ""): "integer"
    for name in rank_add_columns
}

# Register the column-safe names in the datatypes file as well.
add_datatypes(rank_add_columns_valid)

In [168]:
# The Cosmos DB (MongoDB API) connection string embeds the account key, so it
# must never be stored in the notebook. Read it from the environment instead.
# The value is available in the Azure portal: cledge-db resource -> Quick Start.
# NOTE(review): the key that was previously hard-coded in this cell should be
# treated as exposed and rotated.
uri = os.environ.get("COSMOS_MONGO_URI")
if not uri:
    raise RuntimeError(
        "COSMOS_MONGO_URI is not set; export the Cosmos DB connection string "
        "before running this cell."
    )
client = pymongo.MongoClient(uri)

# Database / collection holding the per-college documents.
db = client["colleges"]
collection = db["colleges-data"]

In [169]:
# Fetch only documents that have a `us_news_rankings` field, and project just
# the college id and the rankings (the Mongo `_id` is excluded).
response = collection.find(
    filter={"us_news_rankings": {'$exists': True}},
    projection={"college_id": 1, "us_news_rankings": 1, "_id": 0},
)

In [170]:
# Flatten each document into one dict per college: the college id is stored
# under "UNITID" and the rankings of every category are merged into the same
# level. If two categories share a ranking name, the later one wins.
colleges_with_rank_data = []

for doc in response:
    flattened = {"UNITID": doc["college_id"]}
    for category_rankings in doc["us_news_rankings"].values():
        flattened.update(category_rankings)
    colleges_with_rank_data.append(flattened)

In [138]:
# Inspect the flattened records (one dict per college, keyed by ranking name).
colleges_with_rank_data

[{'UNITID': '222178',
  'National Universities': 250,
  'Study Abroad': 35,
  'Learning Communities': 9,
  'Undergraduate Research/Creative Projects': 28,
  'Service Learning': 5,
  'First-Year Experiences': 8,
  'Senior Capstone': 22,
  'Best Undergraduate Teaching': 41,
  'Top Performers on Social Mobility': 361,
  'Best Undergraduate Engineering Programs': 130,
  'Nursing': 293},
 {'UNITID': '138558',
  'Regional Colleges South': 72,
  'Top Public Schools': 21,
  'Top Performers on Social Mobility': 58},
 {'UNITID': '108232',
  'Regional Universities West': 88,
  'Top Performers on Social Mobility': 114},
 {'UNITID': '126182',
  'Regional Universities West': 103,
  'Top Public Schools': 52,
  'Top Performers on Social Mobility': 38},
 {'UNITID': '188429',
  'National Universities': 182,
  'Best Value Schools': 102,
  'First-Year Experiences': 49,
  'Top Performers on Social Mobility': 140,
  'Nursing': 247},
 {'UNITID': '168528',
  'Regional Colleges Midwest': 19,
  'Best Value Scho

In [171]:
def _to_valid_column_name(name):
    """Return `name` with spaces, "/", "-" and "&" removed."""
    for ch in (" ", "/", "-", "&"):
        name = name.replace(ch, "")
    return name


# Every output row carries the full set of ranking columns, defaulting to
# None. The template is the same for all colleges, so build it once rather
# than once per college.
_rank_row_template = {"UNITID": None}
for category in rank_add_columns:
    _rank_row_template[_to_valid_column_name(category)] = None

rank_df_list = []

for each_college in colleges_with_rank_data:
    # Start from a copy of the template, then fill in the values this college
    # has. "UNITID" contains none of the stripped characters, so it passes
    # through the name cleanup unchanged.
    row = dict(_rank_row_template)
    for field, value in each_college.items():
        row[_to_valid_column_name(field)] = value
    rank_df_list.append(row)

In [172]:
# Inspect the padded rows: each dict has UNITID plus every ranking column,
# with None where the college has no value.
rank_df_list

[{'UNITID': '222178',
  'NationalUniversities': 250,
  'StudyAbroad': 35,
  'LearningCommunities': 9,
  'UndergraduateResearchCreativeProjects': 28,
  'ServiceLearning': 5,
  'FirstYearExperiences': 8,
  'SeniorCapstone': 22,
  'BestUndergraduateTeaching': 41,
  'TopPerformersonSocialMobility': 361,
  'RegionalCollegesSouth': None,
  'TopPublicSchools': None,
  'RegionalUniversitiesWest': None,
  'BestValueSchools': None,
  'RegionalCollegesMidwest': None,
  'MostInnovativeSchools': None,
  'NationalLiberalArtsColleges': None,
  'WritingintheDisciplines': None,
  'CoopsInternships': None,
  'HistoricallyBlackCollegesandUniversities': None,
  'RegionalCollegesWest': None,
  'RegionalUniversitiesSouth': None,
  'RegionalUniversitiesNorth': None,
  'BestCollegesforVeterans': None,
  'RegionalUniversitiesMidwest': None,
  'RegionalCollegesNorth': None,
  'TribalSchools': None,
  'BestUndergraduateEngineeringPrograms': 130,
  'ArtsSchools': None,
  'FaithrelatedSchools': None,
  'BusinessSc

In [173]:
# Declare the schema explicitly (string UNITID, nullable integer for every
# ranking column) instead of letting Spark infer it -- this is what gets rid
# of the errors caused by the None-filled columns.

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema_struct = [StructField("UNITID", StringType(), True)] + [
    StructField(column_name, IntegerType(), True)
    for column_name in rank_add_columns_valid
]

In [174]:
# Build the rankings DataFrame from the padded rows using the explicit schema.
rank_df = spark.createDataFrame(
    data=rank_df_list,
    schema=StructType(schema_struct),
)

In [175]:
# Left join keeps every college; those without ranking data get nulls in the
# ranking columns.
# NOTE(review): `df` is reassigned from itself, so re-running this cell joins
# the ranking columns a second time.
df = df.join(rank_df, on="UNITID", how="left")

In [176]:
# Persist the enriched dataset as the v4 file.
# NOTE(review): no save mode is passed, so a re-run is expected to fail once
# this path exists -- confirm whether an explicit mode (e.g. overwrite) is wanted.
df.write.save('./data/college-search-data-v4.parquet')

In [177]:
# Sanity check: (row count, column count) of the final DataFrame.
print((df.count(), len(df.columns)))

(6694, 627)


In [16]:
# Stop the SparkContext to release its resources; Spark cells will not run
# after this until a new context is created.
sc.stop()