**Importing required libraries & data**

In [None]:
import pandas as pd
from os.path import join

In [None]:
# Mount Google Drive at /content/drive so the CSV inputs and outputs below are reachable.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Folder on the shared drive that holds the input CSV files
file_path = '/content/drive/Shareddrives/.../csv files/'
# File names of the five input tables; each one is joined onto file_path when it is read
c2021_a = 'c2021_a - completions by demo.csv'
hd_2021 = 'hd2021 - institutional characteristics.csv'
sfa2021 = 'sfa2021 - student financial aid.csv'
effy2021 = 'effy2021 - enrollment by demo.csv'
ic2021 = 'ic2021 - student disability.csv'

In [None]:
# Read each CSV into its own dataframe and replace every missing value with an empty string.
# NOTE(review): fillna('') puts '' into any numeric column that had missing values, so such a
# column holds a mix of numbers and strings - confirm the count columns used in the sums and
# percentages further down have no missing values.
df_completion = pd.read_csv(join(file_path, c2021_a)).fillna('')
df_finance = pd.read_csv(join(file_path, sfa2021)).fillna('')
# The institution file is the only one read with an explicit ISO-8859-1 encoding
df_institution = pd.read_csv(join(file_path, hd_2021), encoding="ISO-8859-1").fillna('')
df_enrollment = pd.read_csv(join(file_path, effy2021)).fillna('')
df_disability = pd.read_csv(join(file_path, ic2021)).fillna('')

In [None]:
# Show the (rows, columns) shape of each loaded table as a quick sanity check
df_completion.shape, df_finance.shape, df_institution.shape, df_enrollment.shape, df_disability.shape

((296343, 64), (5779, 631), (6289, 74), (103604, 64), (6179, 123))

**Pre-processing steps**

In [None]:
# Store UNITID (the key every filter and merge below uses) as pandas 'string' dtype in all
# five tables, and CIPCODE in the completion table, so they compare as text
for frame in (df_completion, df_finance, df_institution, df_enrollment, df_disability):
  frame['UNITID'] = frame['UNITID'].astype('string')
df_completion['CIPCODE'] = df_completion['CIPCODE'].astype('string')

**Filter dataframes**
<p>1. In c2021_a (completion), find all universities that report the requested CIP code(s), e.g. 11.0101.
<p>2. In hd_2021 (institution), keep the universities from step 1 whose HLOFFER matches the requested value(s), e.g. 9.
<p>3. In sfa2021 (finance), keep the universities found in steps 1 & 2.
<p>4. In effy2021 (enrollment), keep the universities found in steps 1 & 2.
<p>5. In ic2021 (disability), keep the universities found in steps 1 & 2.

In [None]:
# Completion dataframe with only relevant cipcodes
cipcode = input("What is the cipcode of the major you're interested in? \nIf multiple cipcodes, separate them by commas with no spaces. For example: 11.0100,11.0101,11.2020 \nPlease see the CIPCODES & HIGHEST DEGREE OFFERED DOCUMENTATION")
# Tolerate stray spaces around the commas and ignore empty entries
cipcode = [code.strip() for code in cipcode.split(',') if code.strip()]
# isin() never raises for an unknown code - it simply matches nothing - so the frame is always
# rebuilt here and the input is validated explicitly instead of through try/except
df_completion_cipcodes = df_completion[df_completion['CIPCODE'].isin(cipcode)]
# Codes the user typed that matched no completion row at all
unmatched_cipcodes = sorted(set(cipcode) - set(df_completion_cipcodes['CIPCODE']))
if unmatched_cipcodes or not cipcode:
  print("The cipcode(s) you entered is not valid.")
  if unmatched_cipcodes:
    print("No completion rows found for: " + ', '.join(unmatched_cipcodes))

In [None]:
# UNITIDs of every completion row that matched the requested cipcodes
# (one entry per matching row, so a UNITID can appear more than once)
relevant_unitids = df_completion_cipcodes['UNITID'].tolist()

In [None]:
# Institution dataframe with only relevant cipcodes & relevant highest degree granted for institutions
hloffer = input("You're likely interested in only universities whose highest degree granted is Associate's (3), Bachelor's (5), Master's (7), and/or PhD (9). \nWhat number(s) corresponds to the degree(s) you're interested in? If multiple numbers, separate them by commas with no spaces. For example: 5,7,9 \nPlease see the CIPCODES & HIGHEST DEGREE OFFERED DOCUMENTATION")
hloffer = hloffer.split(',')
try:
  # int() accepts surrounding whitespace but raises ValueError on empty or non-numeric text
  hloffer_list = [int(number) for number in hloffer]
except ValueError:
  # Report the bad input and match no institution, so the frame below is rebuilt (empty)
  # instead of being left undefined or left over from an earlier run of this cell
  hloffer_list = []
  print("The number(s) you entered is not valid.")
# Keep institutions that offer a requested cipcode AND whose HLOFFER is one of the requested levels
df_institution_unitids = df_institution[(df_institution['UNITID'].isin(relevant_unitids)) & (df_institution['HLOFFER'].isin(hloffer_list))]

In [None]:
# UNITIDs of the institutions that passed both the cipcode and the highest-degree filter
target_unitids = df_institution_unitids['UNITID'].tolist()

In [None]:
# Finance rows for the target institutions (requested cipcodes & requested highest-degree levels)
df_finance_unitids = df_finance.loc[df_finance['UNITID'].isin(target_unitids)]

In [None]:
# Enrollment rows for the target institutions (requested cipcodes & requested highest-degree levels)
df_enrollment_unitids = df_enrollment.loc[df_enrollment['UNITID'].isin(target_unitids)]

In [None]:
# Disability rows for the target institutions (requested cipcodes & requested highest-degree levels)
df_disability_unitids = df_disability.loc[df_disability['UNITID'].isin(target_unitids)]

In [None]:
# Completion rows (already limited to the requested cipcodes) for the target institutions
df_completion_unitids = df_completion_cipcodes.loc[df_completion_cipcodes['UNITID'].isin(target_unitids)]

**Join dataframes**

In [None]:
# Build the final enrollment dataframe: start from the institution rows and left-join
# enrollment, then finance, then disability, always on UNITID
merged_1 = df_institution_unitids.merge(df_enrollment_unitids, on='UNITID', how='left') # institution + enrollment
merged_2 = merged_1.merge(df_finance_unitids, on='UNITID', how='left') # ... + finance
final_enrollment = merged_2.merge(df_disability_unitids, on='UNITID', how='left') # ... + disability

In [None]:
# Build the final completion dataframe: institution rows left-joined with their completion rows on UNITID
final_completion = df_institution_unitids.merge(df_completion_unitids, on='UNITID', how='left')

**Delete irrelevant columns**

In [None]:
# List the column labels of both merged frames before trimming them to the variables of interest
final_enrollment.columns, final_completion.columns

(Index(['UNITID', 'INSTNM', 'IALIAS', 'ADDR', 'CITY', 'STABBR', 'ZIP', 'FIPS',
        'OBEREG', 'CHFNM',
        ...
        'ASSOC5', 'ASSOC6', 'SPORT1', 'CONFNO1', 'SPORT2', 'CONFNO2', 'SPORT3',
        'CONFNO3', 'SPORT4', 'CONFNO4 '],
       dtype='object', length=889),
 Index(['UNITID', 'INSTNM', 'IALIAS', 'ADDR', 'CITY', 'STABBR', 'ZIP', 'FIPS',
        'OBEREG', 'CHFNM',
        ...
        'XCUNKNM', 'CUNKNM', 'XCUNKNW', 'CUNKNW', 'XCNRALT', 'CNRALT',
        'XCNRALM', 'CNRALM', 'XCNRALW', 'CNRALW  '],
       dtype='object', length=137))

In [None]:
# Only keep relevant variables (variables of interest) in enrollment.
# Selecting with a list of labels raises KeyError if any listed column is absent from the merged frame.
final_enrollment = final_enrollment[['UNITID', 'INSTNM', 'ADDR', 'STABBR', 'OBEREG', 'CONTROL', 'HBCU', 'MEDICAL', 'TRIBAL', 'CSA', 'UPGRNTP', 'DISABPCT', 'EFFYLEV', 'EFYTOTLT', 'EFYTOTLM',
                                 'EFYTOTLW', 'EFYAIANT', 'EFYAIANM', 'EFYAIANW', 'EFYASIAT', 'EFYASIAM', 'EFYASIAW', 'EFYBKAAT', 'EFYBKAAM', 'EFYBKAAW', 'EFYHISPT', 'EFYHISPM',
                                 'EFYHISPW', 'EFYNHPIT', 'EFYNHPIM', 'EFYNHPIW', 'EFYWHITT', 'EFYWHITM', 'EFYWHITW', 'EFY2MORT', 'EFY2MORM', 'EFY2MORW', 'EFYUNKNT', 'EFYUNKNM', 'EFYUNKNW',
                                 'EFYNRALT', 'EFYNRALM', 'EFYNRALW']]

In [None]:
# Only keep relevant variables (variables of interest) in completion.
# The last label, 'CNRALW  ', deliberately ends in two spaces: that is how the column is
# named in the merged frame, so the spaces must stay for the lookup to succeed.
final_completion = final_completion[['UNITID', 'INSTNM', 'CIPCODE', 'MAJORNUM', 'AWLEVEL', 'CTOTALT', 'CTOTALM', 'CTOTALW', 'CAIANT', 'CAIANM', 'CAIANW', 'CASIAT', 'CASIAM', 'CASIAW', 'CBKAAT',
                                     'CBKAAM', 'CBKAAW', 'CHISPT', 'CHISPM', 'CHISPW', 'CNHPIT', 'CNHPIM', 'CNHPIW', 'CWHITT', 'CWHITM', 'CWHITW', 'C2MORT', 'C2MORM', 'C2MORW', 'CUNKNT','CUNKNM',
                                     'CUNKNW', 'CNRALT', 'CNRALM', 'CNRALW  ']]

In [None]:
# Change names of columns/variables to intuitive names in enrollment dataframe.
# Prefixes used by the new names: i = institution, f = finance, e = enrollment.
# The name `replace` is reassigned by the completion rename cell that follows.
replace = {'UNITID':'unitid', 'INSTNM':'iuniversity', 'ADDR':'iaddress', 'STABBR':'istate', 'OBEREG':'iregion', 'CONTROL':'iprivate_public', 'HBCU':'ihbcu', 'MEDICAL':'imedical_degree', 'TRIBAL':'itribal', 'CSA':'icsa', 'UPGRNTP':'fpell_grant', 'DISABPCT':'disability', 'EFFYLEV':'eundergrad_grad', 'EFYTOTLT':'etotal', 'EFYTOTLM':'emen',
           'EFYTOTLW':'ewomen', 'EFYAIANT':'eaian', 'EFYAIANM':'eaian_men', 'EFYAIANW':'eaian_women', 'EFYASIAT':'easian', 'EFYASIAM':'easian_men', 'EFYASIAW':'easian_women', 'EFYBKAAT':'eblack', 'EFYBKAAM':'eblack_men', 'EFYBKAAW':'eblack_women', 'EFYHISPT':'ehispanic',
           'EFYHISPM':'ehispanic_men', 'EFYHISPW':'ehispanic_women', 'EFYNHPIT':'enhpi', 'EFYNHPIM':'enhpi_men', 'EFYNHPIW':'enhpi_women', 'EFYWHITT':'ewhite', 'EFYWHITM':'ewhite_men', 'EFYWHITW':'ewhite_women', 'EFY2MORT':'emultiracial', 'EFY2MORM':'emultiracial_men',
           'EFY2MORW':'emultiracial_women', 'EFYUNKNT':'eunknown', 'EFYUNKNM':'eunknown_men', 'EFYUNKNW':'eunknown_women', 'EFYNRALT':'enonresident', 'EFYNRALM':'enonresident_men', 'EFYNRALW':'enonresident_women'}

# rename() returns a new frame; labels missing from the mapping keep their current name
final_enrollment = final_enrollment.rename(columns = replace)

In [None]:
# Change names of columns/variables to intuitive names in completion dataframe.
# Prefixes used by the new names: i = institution, c = completion.
# The key 'CNRALW  ' keeps its two trailing spaces so it matches the column label as selected above.
replace = {'UNITID':'unitid', 'INSTNM':'iuniversity', 'CIPCODE':'cipcode', 'MAJORNUM':'cmajor_level', 'AWLEVEL':'cdegree_level', 'CTOTALT':'ctotal', 'CTOTALM':'cmen', 'CTOTALW':'cwomen', 'CAIANT':'caian', 'CAIANM':'caian_men', 'CAIANW':'caian_women', 'CASIAT':'casian', 'CASIAM':'casian_men', 'CASIAW':'casian_women', 'CBKAAT':'cblack',
           'CBKAAM':'cblack_men', 'CBKAAW':'cblack_women', 'CHISPT':'chispanic', 'CHISPM':'chispanic_men', 'CHISPW':'chispanic_women', 'CNHPIT':'cnhpi', 'CNHPIM':'cnhpi_men', 'CNHPIW':'cnhpi_women', 'CWHITT':'cwhite', 'CWHITM':'cwhite_men', 'CWHITW':'cwhite_women', 'C2MORT':'cmultiracial', 'C2MORM':'cmultiracial_men', 'C2MORW':'cmultiracial_women', 'CUNKNT':'cunknown',
           'CUNKNM':'cunknown_men', 'CUNKNW':'cunknown_women', 'CNRALT':'cnonresident', 'CNRALM':'cnonresident_men', 'CNRALW  ':'cnonresident_women'}

final_completion = final_completion.rename(columns = replace)

In [None]:
# Snapshot both renamed frames before any filtering or recoding is applied.
# f_completion is not just a debugging aid: the groupby cell further down aggregates from it.
# f_enrollment is not read again in the cells shown here.
f_enrollment = final_enrollment.copy()
f_completion = final_completion.copy()

In [None]:
# Limit eundergrad_grad to 1, 2, & 4 which are total, undergrad, & grad respectively in final_enrollment dataframe.
# The explicit .copy() makes the result an independent frame: later cells add and overwrite
# columns on it, which would otherwise be flagged as writing to a slice of the unfiltered frame.
final_enrollment = final_enrollment[final_enrollment['eundergrad_grad'].isin([1, 2, 4])].copy()

In [None]:
# Change the numbers in final_enrollment to the corresponding words

# State/territory abbreviation -> four-way region label (plus 'Outlying areas')
replace_iregion_4 = {'AL':'South', 'AK':'West', 'AZ':'West', 'AR':'South', 'CA':'West', 'CO':'West', 'CT':'Northeast', 'DE':'South', 'DC':'South',
                  'FL':'South', 'GA':'South', 'HI':'West', 'ID':'West', 'IL':'Midwest', 'IN':'Midwest', 'IA':'Midwest', 'KS':'Midwest', 'KY':'South', 'LA':'South',
                  'ME':'Northeast', 'MD':'South', 'MA':'Northeast', 'MI':'Midwest', 'MN':'Midwest', 'MS':'South', 'MO':'Midwest', 'MT':'West', 'NE':'Midwest',
                  'NV':'West', 'NH':'Northeast', 'NJ':'Northeast', 'NM':'West', 'NY':'Northeast', 'NC':'South', 'ND':'Midwest', 'OH':'Midwest', 'OK':'South',
                  'OR':'West', 'PA':'Northeast', 'RI':'Northeast', 'SC':'South', 'SD':'Midwest', 'TN':'South', 'TX':'South', 'UT':'West', 'VT':'Northeast',
                  'VA':'South', 'WA':'West', 'WV':'South', 'WI':'Midwest', 'WY':'West', 'AS':'Outlying areas', 'FM':'Outlying areas', 'GU':'Outlying areas',
                  'MH':'Outlying areas', 'MP':'Outlying areas', 'PW':'Outlying areas', 'PR':'Outlying areas', 'VI':'Outlying areas'}

# Yes/No flags
replace_ihbcu = {1:'Yes', 2:'No'}
replace_itribal = {1:'Yes', 2:'No'}
replace_imedical_degree = {1:'Yes', 2:'No'}

# Only the -2 code is relabelled; every other icsa value is kept as it is
replace_icsa = {-2:'Not Applicable'}

# State/territory abbreviation -> full name
replace_istate = {'AL':'Alabama', 'AK':'Alaska', 'AZ':'Arizona', 'AR':'Arkansas', 'CA':'California', 'CO':'Colorado', 'CT':'Connecticut', 'DE':'Delaware', 'DC':'District of Columbia',
                  'FL':'Florida', 'GA':'Georgia', 'HI':'Hawaii', 'ID':'Idaho', 'IL':'Illinois', 'IN':'Indiana', 'IA':'Iowa', 'KS':'Kansas', 'KY':'Kentucky', 'LA':'Louisiana',
                  'ME':'Maine', 'MD':'Maryland', 'MA':'Massachusetts', 'MI':'Michigan', 'MN':'Minnesota', 'MS':'Mississippi', 'MO':'Missouri', 'MT':'Montana', 'NE':'Nebraska',
                  'NV':'Nevada', 'NH':'New Hampshire', 'NJ':'New Jersey', 'NM':'New Mexico', 'NY':'New York', 'NC':'North Carolina', 'ND':'North Dakota', 'OH':'Ohio', 'OK':'Oklahoma',
                  'OR':'Oregon', 'PA':'Pennsylvania', 'RI':'Rhode Island', 'SC':'South Carolina', 'SD':'South Dakota', 'TN':'Tennessee', 'TX':'Texas', 'UT':'Utah', 'VT':'Vermont',
                  'VA':'Virginia', 'WA':'Washington', 'WV':'West Virginia', 'WI':'Wisconsin', 'WY':'Wyoming', 'AS':'American Samoa', 'FM':'Federated States of Micronesia', 'GU':'Guam',
                  'MH':'Marshall Islands', 'MP':'Northern Marianas', 'PW':'Palau', 'PR':'Puerto Rico', 'VI':'Virgin Islands'}

replace_iregion = {0:'US Service schools', 1:'New England', 2:'Mid East', 3:'Great Lakes', 4:'Plains', 5:'Southeast',
                   6:'Southwest', 7:'Rocky Mountains', 8:'Far West', 9:'Outlying areas', -3:'Not available'}

replace_iprivate_public = {1:'Public', 2:'Private not-for-profit', 3:'Private for-profit'}

replace_eundergrad_grad = {1:'Total students', 2:'Undergraduate students', 4:'Graduate students'}

# Calling .replace(..., inplace=True) on final_enrollment[col] modifies a temporary Series:
# pandas warns about this chained in-place pattern and, with Copy-on-Write enabled, it no
# longer changes the frame at all. Each recoded column is therefore assigned back explicitly.
# assign() evaluates every argument against the current frame first, so iregion_4 is derived
# from the state abbreviations before istate itself is spelled out; iregion_4 is appended as
# a new last column and the other columns are overwritten in place.
final_enrollment = final_enrollment.assign(
  iregion_4=final_enrollment['istate'].replace(replace_iregion_4),
  ihbcu=final_enrollment['ihbcu'].replace(replace_ihbcu),
  itribal=final_enrollment['itribal'].replace(replace_itribal),
  imedical_degree=final_enrollment['imedical_degree'].replace(replace_imedical_degree),
  icsa=final_enrollment['icsa'].replace(replace_icsa),
  istate=final_enrollment['istate'].replace(replace_istate),
  iregion=final_enrollment['iregion'].replace(replace_iregion),
  iprivate_public=final_enrollment['iprivate_public'].replace(replace_iprivate_public),
  eundergrad_grad=final_enrollment['eundergrad_grad'].replace(replace_eundergrad_grad),
)

In [None]:
# Sum the completion counts per university, cipcode and degree level, which folds the
# first-major and second-major rows (cmajor_level) of the snapshot f_completion together
final_completion = (
  f_completion
  .groupby(["unitid", "iuniversity", "cipcode", "cdegree_level"], as_index=False)
  .sum()
  .drop(columns='cmajor_level')  # the summed cmajor_level is now irrelevant
)

In [None]:
# Limit cdegree_level to 3, 5, 7, 17, 18, & 19 which are associate's degrees to doctorate degrees, respectively in final_completion.
# The explicit .copy() makes the result an independent frame: later cells overwrite and add
# columns on it, which would otherwise be flagged as writing to a slice of the grouped frame.
final_completion = final_completion[final_completion['cdegree_level'].isin([3, 5, 7, 17, 18, 19])].copy()

In [None]:
# Change the numbers in final_completion to the corresponding words.
# 17, 18 and 19 all receive the same "Doctor's degree" label. The sums were grouped on the
# numeric codes before this relabelling, so one university/cipcode can end up with several
# "Doctor's degree" rows.
replace_cdegree_level = {3:"Associate's degree", 5:"Bachelor's degree", 7:"Master's degree", 17:"Doctor's degree", 18:"Doctor's degree", 19:"Doctor's degree"}
# Assign the recoded column back instead of calling .replace(..., inplace=True) on
# final_completion['cdegree_level']: that chained in-place form acts on a temporary Series,
# triggers a pandas warning and has no effect on the frame when Copy-on-Write is enabled.
final_completion = final_completion.assign(cdegree_level=final_completion['cdegree_level'].replace(replace_cdegree_level))

In [None]:
# Column order of the final enrollment csv file; every count column from 'emen' onward is
# immediately followed by its '% ...' share column
enroll = ['unitid', 'iuniversity', 'iaddress', 'istate', 'iregion', 'iregion_4', 'iprivate_public', 'ihbcu', 'imedical_degree', 'itribal', 'icsa', 'fpell_grant', 'disability', 'eundergrad_grad', 'etotal', 'emen', '% emen',
     'ewomen', '% ewomen', 'eaian', '% eaian', 'eaian_men', '% eaian_men', 'eaian_women', '% eaian_women', 'easian', '% easian', 'easian_men', '% easian_men',
     'easian_women', '% easian_women', 'eblack', '% eblack', 'eblack_men', '% eblack_men', 'eblack_women', '% eblack_women', 'ehispanic', '% ehispanic',
     'ehispanic_men', '% ehispanic_men', 'ehispanic_women', '% ehispanic_women', 'enhpi', '% enhpi', 'enhpi_men', '% enhpi_men', 'enhpi_women', '% enhpi_women',
     'ewhite', '% ewhite', 'ewhite_men', '% ewhite_men', 'ewhite_women', '% ewhite_women', 'emultiracial', '% emultiracial', 'emultiracial_men', '% emultiracial_men',
     'emultiracial_women', '% emultiracial_women', 'eunknown', '% eunknown', 'eunknown_men', '% eunknown_men', 'eunknown_women', '% eunknown_women', 'enonresident',
     '% enonresident', 'enonresident_men', '% enonresident_men', 'enonresident_women', '% enonresident_women']

# The count columns that need a percentage: drop the '% ...' names, then keep 'emen' and
# everything after it (this leaves out the descriptive columns and 'etotal' itself)
enroll_counts = [name for name in enroll if '%' not in name]
pos_1 = enroll_counts.index('emen')
enroll_groups = enroll_counts[pos_1:]

In [None]:
# Share of total enrollment for one count column
def epercent_ethnic(ethnic_group):
  """Add a '% <ethnic_group>' column to the global final_enrollment frame.

  The new column is final_enrollment[ethnic_group] divided row by row by
  final_enrollment['etotal']. The frame is modified in place; nothing is returned.
  """
  group_counts = final_enrollment[ethnic_group]
  total_enrolled = final_enrollment['etotal']
  final_enrollment['% ' + ethnic_group] = group_counts / total_enrolled

In [None]:
# Add one '% <column>' share-of-etotal column to the enrollment dataframe for every
# count column listed in enroll_groups
for ethn_1 in enroll_groups:
 epercent_ethnic(ethn_1)

In [None]:
# Column order of the final completion csv file; every count column from 'cmen' onward is
# immediately followed by its '% ...' share column
complete = ['unitid', 'iuniversity', 'cipcode', 'cdegree_level', 'ctotal', 'cmen', '% cmen', 'cwomen', '% cwomen', 'caian', '% caian', 'caian_men', '% caian_men',
            'caian_women', '% caian_women', 'casian', '% casian', 'casian_men', '% casian_men', 'casian_women', '% casian_women', 'cblack', '% cblack', 'cblack_men',
            '% cblack_men', 'cblack_women', '% cblack_women', 'chispanic', '% chispanic', 'chispanic_men', '% chispanic_men', 'chispanic_women', '% chispanic_women',
            'cnhpi', '% cnhpi', 'cnhpi_men', '% cnhpi_men', 'cnhpi_women', '% cnhpi_women', 'cwhite', '% cwhite', 'cwhite_men', '% cwhite_men', 'cwhite_women',
            '% cwhite_women', 'cmultiracial', '% cmultiracial', 'cmultiracial_men', '% cmultiracial_men', 'cmultiracial_women', '% cmultiracial_women', 'cunknown',
            '% cunknown', 'cunknown_men', '% cunknown_men', 'cunknown_women', '% cunknown_women', 'cnonresident', '% cnonresident', 'cnonresident_men', '% cnonresident_men',
            'cnonresident_women', '% cnonresident_women']

# The count columns that need a percentage: drop the '% ...' names, then keep 'cmen' and
# everything after it (this leaves out the identifying columns and 'ctotal' itself)
complete_counts = [name for name in complete if '%' not in name]
pos_2 = complete_counts.index('cmen')
complete_groups = complete_counts[pos_2:]

In [None]:
# Share of total completions for one count column
def cpercent_ethnic(ethnic_group):
  """Add a '% <ethnic_group>' column to the global final_completion frame.

  The new column is final_completion[ethnic_group] divided row by row by
  final_completion['ctotal']. The frame is modified in place; nothing is returned.
  """
  group_counts = final_completion[ethnic_group]
  total_completed = final_completion['ctotal']
  final_completion['% ' + ethnic_group] = group_counts / total_completed

In [None]:
# Add one '% <column>' share-of-ctotal column to the completion dataframe for every
# count column listed in complete_groups
for ethn_2 in complete_groups:
  cpercent_ethnic(ethn_2)

In [None]:
# Arrange the enrollment columns in the order given by the `enroll` list
ecolumn_reorder = enroll
final_enrollment = final_enrollment.reindex(columns=ecolumn_reorder)

# Arrange the completion columns in the order given by the `complete` list
ccolumn_reorder = complete
final_completion = final_completion.reindex(columns=ccolumn_reorder)

In [None]:
# Display the finished enrollment table (pandas truncates the rendering of large frames)
final_enrollment

Unnamed: 0,unitid,iuniversity,iaddress,istate,iregion,iregion_4,iprivate_public,ihbcu,imedical_degree,itribal,...,eunknown_men,% eunknown_men,eunknown_women,% eunknown_women,enonresident,% enonresident,enonresident_men,% enonresident_men,enonresident_women,% enonresident_women
0,100654,Alabama A & M University,4900 Meridian Street,Alabama,Southeast,South,Public,Yes,No,No,...,198,0.029873,381,0.057483,90,0.013579,43,0.006488,47,0.007091
1,100654,Alabama A & M University,4900 Meridian Street,Alabama,Southeast,South,Public,Yes,No,No,...,103,0.018629,155,0.028034,40,0.007235,19,0.003436,21,0.003798
6,100654,Alabama A & M University,4900 Meridian Street,Alabama,Southeast,South,Public,Yes,No,No,...,95,0.086442,226,0.205641,50,0.045496,24,0.021838,26,0.023658
22,100663,University of Alabama at Birmingham,Administration Bldg Suite 1070,Alabama,Southeast,South,Public,No,Yes,No,...,173,0.006571,263,0.009990,1289,0.048961,745,0.028298,544,0.020663
23,100663,University of Alabama at Birmingham,Administration Bldg Suite 1070,Alabama,Southeast,South,Public,No,Yes,No,...,45,0.002871,50,0.003190,463,0.029537,268,0.017097,195,0.012440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14516,490805,Purdue University Northwest,2200 169th Street,Indiana,Great Lakes,Midwest,Public,No,No,No,...,60,0.006016,53,0.005314,155,0.015542,117,0.011732,38,0.003810
14521,490805,Purdue University Northwest,2200 169th Street,Indiana,Great Lakes,Midwest,Public,No,No,No,...,10,0.009372,19,0.017807,196,0.183693,114,0.106842,82,0.076851
14538,495767,The Pennsylvania State University,201 Old Main,Pennsylvania,Mid East,Northeast,Public,No,Yes,No,...,1407,0.013940,1298,0.012860,10632,0.105336,6663,0.066013,3969,0.039323
14539,495767,The Pennsylvania State University,201 Old Main,Pennsylvania,Mid East,Northeast,Public,No,Yes,No,...,1131,0.013790,1021,0.012448,7046,0.085908,4543,0.055390,2503,0.030518


In [None]:
# Display the finished completion table; '% ...' cells are NaN on rows where ctotal is 0
final_completion

Unnamed: 0,unitid,iuniversity,cipcode,cdegree_level,ctotal,cmen,% cmen,cwomen,% cwomen,caian,...,cunknown_men,% cunknown_men,cunknown_women,% cunknown_women,cnonresident,% cnonresident,cnonresident_men,% cnonresident_men,cnonresident_women,% cnonresident_women
0,100654,Alabama A & M University,14.0801,Bachelor's degree,7,4,0.571429,3,0.428571,0,...,0,0.000000,0,0.000000,1,0.142857,1,0.142857,0,0.000000
1,100654,Alabama A & M University,14.1001,Bachelor's degree,20,15,0.750000,5,0.250000,0,...,1,0.050000,1,0.050000,1,0.050000,1,0.050000,0,0.000000
2,100654,Alabama A & M University,14.1901,Bachelor's degree,41,29,0.707317,12,0.292683,0,...,2,0.048780,0,0.000000,0,0.000000,0,0.000000,0,0.000000
3,100654,Alabama A & M University,14.9999,Master's degree,34,14,0.411765,20,0.588235,0,...,3,0.088235,3,0.088235,2,0.058824,1,0.029412,1,0.029412
4,100663,University of Alabama at Birmingham,14.0101,Master's degree,143,115,0.804196,28,0.195804,1,...,14,0.097902,1,0.006993,4,0.027972,3,0.020979,1,0.006993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7177,495767,The Pennsylvania State University,14.3601,Associate's degree,0,0,,0,,0,...,0,,0,,0,,0,,0,
7178,495767,The Pennsylvania State University,14.3601,Master's degree,22,19,0.863636,3,0.136364,0,...,0,0.000000,0,0.000000,1,0.045455,1,0.045455,0,0.000000
7180,495767,The Pennsylvania State University,14.3801,Bachelor's degree,4,4,1.000000,0,0.000000,0,...,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000
7181,495767,The Pennsylvania State University,14.9999,Bachelor's degree,68,61,0.897059,7,0.102941,0,...,0,0.000000,0,0.000000,21,0.308824,17,0.250000,4,0.058824


In [None]:
# Save the final_enrollment & final_completion dataframes as csvs in the mounted Drive.
# index=False leaves the dataframe index out of the files; an existing file of the same
# name is overwritten.
final_enrollment.to_csv('/content/drive/MyDrive/enroll_test.csv', encoding = "utf-8", index=False)
final_completion.to_csv('/content/drive/MyDrive/complete_test.csv', encoding = "utf-8", index=False)