In [1]:
# notebook dependencies 
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE: matplotlib is aliased as `mlp` in this notebook (not the more common `mpl`).
import matplotlib as mlp
# Raise the default figure resolution to 300 dpi for every plot below.
mlp.rcParams['figure.dpi'] = 300

import pandas as pd
import numpy as np
import os

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot styling.
sns.set()

# time module
from time import time

# Presumably project-local helper modules (not part of this notebook).
# NOTE(review): the `get_majors_df` imported here is rebound by the
# `def get_majors_df()` in the In [2] cell as soon as that cell runs.
from initial_acquire import get_majors_df
import mm_prepare
import mm_acquire

In [2]:
def get_majors_df():

    '''Return the merged 2018-2019 College Scorecard majors table.

    If ``majors_table.csv`` exists in the working directory it is read and
    returned as-is. Otherwise the two raw files
    ``FieldOfStudyData1718_1819_PP.csv`` and ``MERGED2018_19_PP.csv`` are read
    from the working directory, the first is left-merged with the second on
    ``UNITID``, and the result is cached to ``majors_table.csv`` before being
    returned.

    The shape of the returned dataframe is printed in both cases.

    Returns
    -------
    pandas.DataFrame
        Every row of the first file, with the columns of the second file
        joined on ``UNITID``. Columns present in both files get pandas'
        default ``_x`` / ``_y`` suffixes.

    Raises
    ------
    FileNotFoundError
        Raised by ``pd.read_csv`` when no cache exists and a raw file is
        missing.
    '''

    # checking if dataset exists
    filename = "majors_table.csv"

    if os.path.isfile(filename):
        # cached copy: dtypes are whatever read_csv infers from the file
        df = pd.read_csv(filename)

        print(f'dataframe shape: {df.shape}')

        return df

    # checks local folder for the following files
    filename_01 = "FieldOfStudyData1718_1819_PP.csv"
    filename_02 = "MERGED2018_19_PP.csv"

    # create the necessary parent and child tables
    df_parent = pd.read_csv(filename_01, low_memory=False)
    df_child = pd.read_csv(filename_02, low_memory=False)

    # give both join keys the same nullable integer dtype; errors='ignore'
    # leaves a column unchanged if the cast fails
    df_parent["UNITID"] = df_parent["UNITID"].astype("Int32", errors='ignore')
    df_child["UNITID"] = df_child["UNITID"].astype("Int32", errors='ignore')

    df = df_parent.merge(
        df_child,
        how="left",
        on="UNITID",
        copy=False
    )

    # cache the newly created dataframe as a .csv file.
    # index=False keeps the cached copy column-for-column identical to the
    # fresh merge; writing the index would bring it back as an extra
    # "Unnamed: 0" column on the next (cached) call.
    df.to_csv(filename, index=False)

    # print the df shape
    print(f'dataframe shape: {df.shape}')

    # return the dataframe
    return df

In [3]:
# Columns kept for the bachelor's-degree table, in output order.
# Original note: "initial filter of columns with >= 50% missing records" --
# presumably columns at or above that missing-value threshold were dropped
# and these are the survivors; TODO confirm against the screening step.
BACH_COLUMNS = [
    "ACTCMMID",
    "ADM_RATE",
    "ADMCON7",
    "AVGFACSAL",
    "C150_4_2MOR",
    "C150_4_AIAN",
    "C150_4_ASIAN",
    "C150_4_BLACK",
    "C150_4_HISP",
    "C150_4_NRA",
    "C150_4_UNKN",
    "C150_4_WHITE",
    "C150_4",
    "CIPCODE",
    "CIPDESC",
    "CITY",
    "CREDDESC",
    "CREDLEV",
    "D_PCTPELL_PCTFLOAN",
    "DEBT_ALL_PP_EVAL_MDN",
    "DEBT_ALL_PP_EVAL_MDN10YRPAY",
    "DEBT_ALL_PP_EVAL_MEAN",
    "DEBT_ALL_STGP_EVAL_MDN",
    "DEBT_ALL_STGP_EVAL_MDN10YRPAY",
    "DEBT_ALL_STGP_EVAL_MEAN",
    "DEBT_MDN",
    "DEBT_NOPELL_STGP_EVAL_MDN",
    "DEBT_NOPELL_STGP_EVAL_MEAN",
    "DEBT_PELL_PP_EVAL_MDN",
    "DEBT_PELL_PP_EVAL_MEAN",
    "DEBT_PELL_STGP_EVAL_MDN",
    "DEBT_PELL_STGP_EVAL_MEAN",
    "DISTANCEONLY",
    "FEMALE_DEBT_MDN",
    "FIRSTGEN_DEBT_MDN",
    "FTFTPCTFLOAN",
    "FTFTPCTPELL",
    "GRAD_DEBT_MDN",
    "GRADS",
    "HI_INC_DEBT_MDN",
    "IND_DEBT_MDN",
    "INSTNM_x",
    "LO_INC_DEBT_MDN",
    "MALE_DEBT_MDN",
    "MD_INC_DEBT_MDN",
    "NOPELL_DEBT_MDN",
    "NOTFIRSTGEN_DEBT_MDN",
    "NUM4_PRIV",
    "NUMBRANCH",
    "OPEFLAG",
    "PCIP01",
    "PCIP03",
    "PCIP04",
    "PCIP05",
    "PCIP09",
    "PCIP10",
    "PCIP11",
    "PCIP12",
    "PCIP13",
    "PCIP14",
    "PCIP15",
    "PCIP16",
    "PCIP19",
    "PCIP22",
    "PCIP23",
    "PCIP24",
    "PCIP25",
    "PCIP26",
    "PCIP27",
    "PCIP29",
    "PCIP30",
    "PCIP31",
    "PCIP38",
    "PCIP39",
    "PCIP40",
    "PCIP41",
    "PCIP42",
    "PCIP43",
    "PCIP44",
    "PCIP45",
    "PCIP46",
    "PCIP47",
    "PCIP48",
    "PCIP49",
    "PCIP50",
    "PCIP51",
    "PCIP52",
    "PCIP54",
    "PELL_DEBT_MDN",
    "PFTFTUG1_EF",
    "PPTUG_EF",
    "PREDDEG",
    "REGION",
    "RET_FT4",
    "RET_PT4",
    "SAT_AVG",
    "SCH_DEG",
    "STABBR",
    "UGDS_2MOR",
    "UGDS_AIAN",
    "UGDS_ASIAN",
    "UGDS_BLACK",
    "UGDS_HISP",
    "UGDS_NHPI",
    "UGDS_NRA",
    "UGDS_UNKN",
    "UGDS_WHITE",
    "UGNONDS",
    "UNITID",
    "WDRAW_DEBT_MDN",
    "ZIP",
]


def get_bach_df():

    '''Return the bachelor's-degree subset of the merged 2018-2019
    College Scorecard tables.

    If ``bach_table.csv`` exists in the working directory it is read and
    returned as-is. Otherwise ``FieldOfStudyData1718_1819_PP.csv`` is
    left-merged with ``MERGED2018_19_PP.csv`` on ``UNITID``, rows whose
    ``CREDDESC`` equals ``"Bachelors Degree"`` are kept, the columns are
    narrowed to ``BACH_COLUMNS``, and the result is cached to
    ``bach_table.csv`` before being returned.

    The shape of the returned dataframe is printed in both cases.

    Returns
    -------
    pandas.DataFrame
        Bachelor's-degree rows restricted to ``BACH_COLUMNS``. A freshly
        built frame keeps the row labels of the merged table (so they are
        not contiguous); a frame loaded from the cache has a default
        0..n-1 index.

    Raises
    ------
    FileNotFoundError
        Raised by ``pd.read_csv`` when no cache exists and a raw file is
        missing.
    KeyError
        If the merged table lacks any column named in ``BACH_COLUMNS``.
    '''

    # checking if dataset exists
    filename = "bach_table.csv"

    if os.path.isfile(filename):
        # cached copy: dtypes are whatever read_csv infers from the file
        df = pd.read_csv(filename)

        print(f'dataframe shape: {df.shape}')

        return df

    # checks local folder for the following files
    filename_01 = "FieldOfStudyData1718_1819_PP.csv"
    filename_02 = "MERGED2018_19_PP.csv"

    # create the necessary parent and child tables
    df_parent = pd.read_csv(filename_01, low_memory=False)
    df_child = pd.read_csv(filename_02, low_memory=False)

    # give both join keys the same nullable integer dtype; errors='ignore'
    # leaves a column unchanged if the cast fails
    df_parent["UNITID"] = df_parent["UNITID"].astype("Int32", errors='ignore')
    df_child["UNITID"] = df_child["UNITID"].astype("Int32", errors='ignore')

    df = df_parent.merge(
        df_child,
        how="left",
        on="UNITID",
        copy=False
    )

    # filters for just bachelor specific records
    bach_df = df[df["CREDDESC"] == "Bachelors Degree"]

    # keep only the screened columns, in BACH_COLUMNS order
    bach_df = bach_df[BACH_COLUMNS]

    # cache the newly created dataframe as a .csv file.
    # Write to the same path the cache check above looks for: the previous
    # target ("majors_table.csv") meant this cache was never hit and that
    # get_majors_df's cache was overwritten with the bachelor subset.
    # index=False keeps the cached columns identical to the fresh result
    # (no extra "Unnamed: 0" column on reload).
    bach_df.to_csv(filename, index=False)

    # print the df shape
    print(f'dataframe shape: {bach_df.shape}')

    # return the dataframe
    return bach_df

In [4]:
# test the function
# NOTE: this calls get_bach_df from the imported mm_acquire module, not the
# copy defined in the In [3] cell above.

df = mm_acquire.get_bach_df()
df.head()

dataframe shape: (71901, 111)


Unnamed: 0,ACTCMMID,ADM_RATE,ADMCON7,AVGFACSAL,C150_4_2MOR,C150_4_AIAN,C150_4_ASIAN,C150_4_BLACK,C150_4_HISP,C150_4_NRA,...,UGDS_BLACK,UGDS_HISP,UGDS_NHPI,UGDS_NRA,UGDS_UNKN,UGDS_WHITE,UGNONDS,UNITID,WDRAW_DEBT_MDN,ZIP
0,18.0,0.8986,1.0,7101.0,0.25,,,0.2681,0.25,,...,0.912,0.0088,0.0016,0.007,0.0361,0.0186,153.0,100654,10250,35762
1,18.0,0.8986,1.0,7101.0,0.25,,,0.2681,0.25,,...,0.912,0.0088,0.0016,0.007,0.0361,0.0186,153.0,100654,10250,35762
2,18.0,0.8986,1.0,7101.0,0.25,,,0.2681,0.25,,...,0.912,0.0088,0.0016,0.007,0.0361,0.0186,153.0,100654,10250,35762
5,18.0,0.8986,1.0,7101.0,0.25,,,0.2681,0.25,,...,0.912,0.0088,0.0016,0.007,0.0361,0.0186,153.0,100654,10250,35762
6,18.0,0.8986,1.0,7101.0,0.25,,,0.2681,0.25,,...,0.912,0.0088,0.0016,0.007,0.0361,0.0186,153.0,100654,10250,35762


In [5]:
# Run the two mm_prepare helpers on the acquired frame, then preview the result.
# NOTE(review): `df` is rebound in place, so re-running this cell on its own
# feeds the already-processed frame back into both helpers.
df = mm_prepare.clean_col_names(df)
df = mm_prepare.clean_bach_df(df)
df.head()

modified df shape: (71901, 99)


Unnamed: 0,ACT_score_mid,admission_rate,required_score,avg_faculty_salary,comp_rt_ft_150over_expected_time_two_races,comp_rt_ft_150over_expected_time_native_american,comp_rt_ft_150over_expected_time_asian,comp_rt_ft_150over_expected_time_black,comp_rt_ft_150over_expected_time_hispanic,comp_rt_ft_150over_expected_time_non_resident,...,enrollment_share_black,enrollment_share_hispanic,enrollment_share_pac_islander,enrollment_share_non_resident,enrollment_share_unknown,enrollment_share_white,non_deg_seeking,unit_id_institution,not_completed_med_debt,zip_code
0,18.0,0.8986,1.0,7101.0,0.25,,,0.2681,0.25,,...,0.912,0.0088,0.0016,0.007,0.0361,0.0186,153.0,100654,10250,35762
1,18.0,0.8986,1.0,7101.0,0.25,,,0.2681,0.25,,...,0.912,0.0088,0.0016,0.007,0.0361,0.0186,153.0,100654,10250,35762
2,18.0,0.8986,1.0,7101.0,0.25,,,0.2681,0.25,,...,0.912,0.0088,0.0016,0.007,0.0361,0.0186,153.0,100654,10250,35762
5,18.0,0.8986,1.0,7101.0,0.25,,,0.2681,0.25,,...,0.912,0.0088,0.0016,0.007,0.0361,0.0186,153.0,100654,10250,35762
6,18.0,0.8986,1.0,7101.0,0.25,,,0.2681,0.25,,...,0.912,0.0088,0.0016,0.007,0.0361,0.0186,153.0,100654,10250,35762
