In [1]:
%matplotlib inline
import matplotlib as mlp
mlp.rcParams['figure.dpi'] = 150

# disabling warning messages
import warnings
warnings.filterwarnings("ignore")

# notebook dependencies
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

#visualizations
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = "darkgrid")

# math modules
from math import sqrt
import scipy.stats as stats

import acquire
import prepare

# sklearn data science library
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.metrics import mean_squared_error
from scipy.stats.mstats import winsorize

from sklearn.impute import IterativeImputer
from sklearn.cluster import KMeans

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.decomposition import PCA 
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

In [2]:
# Acquire the initial dataset and run it through the preparation steps in order:
#   1. clean_college_df            -- initial clean/prep
#   2. clean_high_percentage_nulls -- second cleaning pass
#   3. obtain_target_variables     -- pulls in the target variables; ensure
#      `2017_2018_2019_earning_by_major.csv` is in the working folder
df = (
    acquire.get_bach_df()
    .pipe(prepare.clean_college_df)
    .pipe(prepare.clean_high_percentage_nulls)
    .pipe(prepare.obtain_target_variables)
)

# preview the prepared frame
df.head()

dataframe shape: (71901, 139)
dataframe shape: (69012, 117)
dataframe shape: (68546, 127)


Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,degree_name,degree_code,major_code,major_name,branch_number,avg_net_price_public,avg_net_price_private,pub_fam_income_0_30000,private_fam_income_0_30000,program_fam_income_0_30000,other_fam_income_0_30000,pub_fam_income_30001_48000,private_fam_income_30001_48000,program_fam_income_30001_48000,other_fam_income_30001_48000,pub_fam_income_48001_75000,private_fam_income_48001_75000,program_fam_income_48001_75000,other_fam_income_48001_75000,pub_fam_income_75001_110000,private_fam_income_75001_110000,program_fam_income_75001_110000,other_fam_income_75001_110000,pub_fam_income_over_110000,private_fam_income_over_110000,program_fam_income_over_110000,other_fam_income_over_110000,full_time_net_tuition_revenue,off_campus_cost_of_attendace,on_campus_cost_of_attendace,admission_rate,graduate_number,ACT_score_mid,avg_sat_admitted,required_score,avg_faculty_salary,online_only,comp_rt_ft_150over_expected_time,comp_rt_ft_150over_expected_time_two_races,comp_rt_ft_150over_expected_time_native_american,comp_rt_ft_150over_expected_time_asian,comp_rt_ft_150over_expected_time_black,comp_rt_ft_150over_expected_time_hispanic,comp_rt_ft_150over_expected_time_non_resident,comp_rt_ft_150over_expected_time_unknown_race,comp_rt_ft_150over_expected_time_white,share_entering_students_first_ft,share_of_part_time,first_time_ft_student_retention,first_time_pt_student_retention,enrollment_share_two_races,enrollment_share_native_american,enrollment_share_asian,enrollment_share_black,enrollment_share_hispanic,enrollment_share_pac_islander,enrollment_share_non_resident,enrollment_share_unknown,enrollment_share_white,undergraduate_number_pell_grant_fedral_loan,med_debt_pell_students,median_debt_completed,median_debt_non_first_generation,median_debt_non_pell,fedral_loan_full_time_first_time_undergraduate,pell_grant_full_time_first_time_undergraduat
e,deg_percent_awarded_agriculture_operations,deg_percent_awarded_natural_resources,deg_percent_awarded_architecture,deg_percent_awarded_area_ethnic_cultural_gender,deg_percent_awarded_communication_journalism,deg_percent_awarded_communication_tech,deg_percent_awarded_computer_science,deg_percent_awarded_personal_culinary_services,deg_percent_awarded_education,deg_percent_awarded_engineering,deg_percent_awarded_engineering_tech,deg_percent_awarded_foreign_language_literatures,deg_percent_awarded_human_science,deg_percent_awarded_legal_profession,deg_percent_awarded_english_lang,deg_percent_awarded_general_studies,deg_percent_awarded_library_sciences,deg_percent_awarded_bio_sciences,deg_percent_awarded_mathematics,deg_percent_awarded_military_tech,deg_percent_awarded_intedisciplinary_studies,deg_percent_awarded_leisure_fitness,deg_percent_awarded_philosophy,deg_percent_awarded_theology,deg_percent_awarded_physical_sciences,deg_percent_awarded_science_tech,deg_percent_awarded_psychology,deg_percent_awarded_homeland_security,deg_percent_awarded_public_admin,deg_percent_awarded_social_sciences,deg_percent_awarded_construction_trades,deg_percent_awarded_mechanic_repair,deg_percent_awarded_precision_production,deg_percent_awarded_transportation_materials,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking,avg_net_price,major_category,2017,2018,2019,Grand Total,roi_5yr,pct_roi_5yr,roi_10yr,pct_roi_10yr,roi_20yr,pct_roi_20yr
0,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,Bachelors Degree,3,100,"Agriculture, General.",1.0,14444.0,0.0,375.0,,,,113.0,,,,61.0,,,,27.0,,,,16.0,,,,7413.0,9128.0,9128.0,0.8986,963.0,18.0,957.0,1.0,7101.0,0.0,0.2685,0.25,,,0.2681,0.25,,0.375,0.25,0.8987,0.0587,0.6087,1.0,0.0118,0.0022,0.0018,0.912,0.0088,0.0016,0.007,0.0361,0.0186,5039.0,17500,33375,16421,10250,0.7143,0.7057,0.0394,0.0237,0.0039,0.0,0.0,0.0394,0.0592,0.0,0.071,0.1183,0.0197,0.0,0.0394,0.0,0.0158,0.0473,0.0,0.0927,0.0059,0.0,0.0,0.002,0.0,0.0,0.0355,0.0,0.0631,0.0572,0.0493,0.0355,0.0,0.0,0.0,0.0,0.0237,0.0,0.1578,0.0,153.0,14444.0,Agriculture,57605.6889,55517.8694,61388.9338,174512.4921,0.4062,40.6166,1.99,198.9979,5.6693,566.9295
1,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,Bachelors Degree,3,199,"Agriculture, Agriculture Operations, and Relat...",1.0,14444.0,0.0,375.0,,,,113.0,,,,61.0,,,,27.0,,,,16.0,,,,7413.0,9128.0,9128.0,0.8986,963.0,18.0,957.0,1.0,7101.0,0.0,0.2685,0.25,,,0.2681,0.25,,0.375,0.25,0.8987,0.0587,0.6087,1.0,0.0118,0.0022,0.0018,0.912,0.0088,0.0016,0.007,0.0361,0.0186,5039.0,17500,33375,16421,10250,0.7143,0.7057,0.0394,0.0237,0.0039,0.0,0.0,0.0394,0.0592,0.0,0.071,0.1183,0.0197,0.0,0.0394,0.0,0.0158,0.0473,0.0,0.0927,0.0059,0.0,0.0,0.002,0.0,0.0,0.0355,0.0,0.0631,0.0572,0.0493,0.0355,0.0,0.0,0.0,0.0,0.0237,0.0,0.1578,0.0,153.0,14444.0,Agriculture,57605.6889,55517.8694,61388.9338,174512.4921,0.4062,40.6166,1.99,198.9979,5.6693,566.9295
2,100858.0,Auburn University,Public,AL,36849,Auburn,5.0,1.0,3.0,3.0,Bachelors Degree,3,100,"Agriculture, General.",1.0,23696.0,0.0,254.0,,,,155.0,,,,166.0,,,,201.0,,,,364.0,,,,15908.0,13332.0,13332.0,0.7543,5812.0,28.0,1289.0,1.0,10428.0,0.0,0.7812,,0.5,0.7283,0.5837,0.75,0.68,0.6952,0.8047,0.7973,0.0791,0.8999,0.7458,0.0225,0.0038,0.0236,0.055,0.0342,0.0005,0.049,0.0036,0.8078,23964.0,19000,21281,17500,15000,0.3039,0.1409,0.0414,0.0202,0.0181,0.0,0.0566,0.0,0.0098,0.0,0.0602,0.1911,0.0,0.0062,0.0348,0.0,0.0102,0.0,0.0,0.1108,0.0058,0.0,0.02,0.0,0.0021,0.0,0.0092,0.0,0.0279,0.0,0.016,0.0337,0.0,0.0,0.0,0.0002,0.0296,0.0623,0.2251,0.0087,481.0,23696.0,Agriculture,57605.6889,55517.8694,61388.9338,174512.4921,0.1989,19.8891,1.5492,154.9242,4.6862,468.621
3,100858.0,Auburn University,Public,AL,36849,Auburn,5.0,1.0,3.0,3.0,Bachelors Degree,3,103,Agricultural Production Operations.,1.0,23696.0,0.0,254.0,,,,155.0,,,,166.0,,,,201.0,,,,364.0,,,,15908.0,13332.0,13332.0,0.7543,5812.0,28.0,1289.0,1.0,10428.0,0.0,0.7812,,0.5,0.7283,0.5837,0.75,0.68,0.6952,0.8047,0.7973,0.0791,0.8999,0.7458,0.0225,0.0038,0.0236,0.055,0.0342,0.0005,0.049,0.0036,0.8078,23964.0,19000,21281,17500,15000,0.3039,0.1409,0.0414,0.0202,0.0181,0.0,0.0566,0.0,0.0098,0.0,0.0602,0.1911,0.0,0.0062,0.0348,0.0,0.0102,0.0,0.0,0.1108,0.0058,0.0,0.02,0.0,0.0021,0.0,0.0092,0.0,0.0279,0.0,0.016,0.0337,0.0,0.0,0.0,0.0002,0.0296,0.0623,0.2251,0.0087,481.0,23696.0,Agriculture,57605.6889,55517.8694,61388.9338,174512.4921,0.1989,19.8891,1.5492,154.9242,4.6862,468.621
4,101541.0,Judson College,"Private, nonprofit",AL,36756,Marion,5.0,1.0,3.0,3.0,Bachelors Degree,3,105,Agricultural and Domestic Animal Services.,1.0,0.0,16619.0,,20.0,,,,10.0,,,,10.0,,,,10.0,,,,8.0,,,8288.0,7700.0,10590.0,0.482,,20.0,1054.0,1.0,6076.0,0.0,0.3467,0.3333,,,0.2353,1.0,,,0.3704,0.7209,0.1622,0.6154,0.0,0.0039,0.0232,0.0039,0.2008,0.0232,0.0039,0.0,0.0425,0.6988,315.0,16738,26000,25000,8584,0.7385,0.6,0.0571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0429,0.0,0.0,0.0,0.0,0.0,0.0429,0.0,0.0,0.0429,0.0143,0.0,0.0143,0.0,0.0429,0.0,0.0143,0.0,0.1571,0.0429,0.0714,0.0,0.0,0.0,0.0,0.0,0.0714,0.2429,0.1,0.0429,9.0,16619.0,Agriculture,57605.6889,55517.8694,61388.9338,174512.4921,0.3512,35.1246,1.8732,187.3202,5.4088,540.8818


In [3]:
# Collapse/create single family-income columns.
# The recorded outputs show the frame going from 127 to 112 columns at this step,
# and later previews end with income_0_30000 ... income_over_110000.
# NOTE(review): `df` is overwritten in place, so this cell is presumably not safe
# to run twice on the same kernel -- confirm in prepare.create_fam_income_columns.

df = prepare.create_fam_income_columns(df)

dataframe shape: (68546, 112)


In [4]:
# Share of missing values remaining in each feature, as a fraction rounded to
# two decimals (0.00 means no nulls at that precision)
null_percentages = df.isna().mean().round(2)

# lift pandas' row limit for this print only, so every column is listed
with pd.option_context('display.max_rows', None):
    print(null_percentages)

unit_id_institution                                 0.00
college_name                                        0.00
institution_control                                 0.00
state_post_code                                     0.00
zip_code                                            0.00
city                                                0.00
region_ipeds                                        0.00
title_IV_eligibility                                0.00
pred_degree                                         0.00
pred_degree_0and4                                   0.00
degree_name                                         0.00
degree_code                                         0.00
major_code                                          0.00
major_name                                          0.00
branch_number                                       0.00
avg_net_price_public                                0.00
avg_net_price_private                               0.00
full_time_net_tuition_revenue  

In [5]:
# Split the prepared frame into train / validate / test sets.
# The recorded shapes are 38385 / 16451 / 13710 rows (68546 in total), i.e.
# roughly 56% / 24% / 20%.
# NOTE(review): no seed is passed here; whether the split is reproducible
# depends on prepare.split_data -- confirm.

train, validate, test = prepare.split_data(df)

train shape: (38385, 112)
validate shape: (16451, 112)
test shape: (13710, 112)


In [6]:
# Cap extreme values in the training set; the recorded shape is unchanged
# (38385, 112), so no rows or columns are dropped by this step.
# NOTE(review): the two 0.1 arguments are presumably the lower/upper percentile
# cut-offs; whether 0.1 means 0.1% or 10% is decided inside
# prepare.percentile_capping -- confirm.

train = prepare.percentile_capping(train, 0.1, 0.1)
train.shape

(38385, 112)

In [8]:
# Cap validate and test with the same arguments that were used for train.
# NOTE(review): each split is passed in on its own, so the caps are presumably
# derived from validate/test themselves rather than from train; if so, the
# thresholds differ between splits -- confirm in prepare.percentile_capping.

validate = prepare.percentile_capping(validate, 0.1, 0.1)
test = prepare.percentile_capping(test, 0.1, 0.1)

In [9]:
# dimensions of the validate split (rows, columns)
validate.shape

(16451, 112)

In [10]:
# Show the test split's dimensions and a preview of its first rows.
# Only the last expression of a cell is displayed, so the shape is printed
# explicitly; as a bare `test.shape` line it was evaluated and silently discarded.
print(f'test shape: {test.shape}')
test.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,degree_name,degree_code,major_code,major_name,branch_number,avg_net_price_public,avg_net_price_private,full_time_net_tuition_revenue,off_campus_cost_of_attendace,on_campus_cost_of_attendace,admission_rate,graduate_number,ACT_score_mid,avg_sat_admitted,required_score,avg_faculty_salary,online_only,comp_rt_ft_150over_expected_time,comp_rt_ft_150over_expected_time_two_races,comp_rt_ft_150over_expected_time_native_american,comp_rt_ft_150over_expected_time_asian,comp_rt_ft_150over_expected_time_black,comp_rt_ft_150over_expected_time_hispanic,comp_rt_ft_150over_expected_time_non_resident,comp_rt_ft_150over_expected_time_unknown_race,comp_rt_ft_150over_expected_time_white,share_entering_students_first_ft,share_of_part_time,first_time_ft_student_retention,first_time_pt_student_retention,enrollment_share_two_races,enrollment_share_native_american,enrollment_share_asian,enrollment_share_black,enrollment_share_hispanic,enrollment_share_pac_islander,enrollment_share_non_resident,enrollment_share_unknown,enrollment_share_white,undergraduate_number_pell_grant_fedral_loan,med_debt_pell_students,median_debt_completed,median_debt_non_first_generation,median_debt_non_pell,fedral_loan_full_time_first_time_undergraduate,pell_grant_full_time_first_time_undergraduate,deg_percent_awarded_agriculture_operations,deg_percent_awarded_natural_resources,deg_percent_awarded_architecture,deg_percent_awarded_area_ethnic_cultural_gender,deg_percent_awarded_communication_journalism,deg_percent_awarded_communication_tech,deg_percent_awarded_computer_science,deg_percent_awarded_personal_culinary_services,deg_percent_awarded_education,deg_percent_awarded_engineering,deg_percent_awarded_engineering_tech,deg_percent_awarded_foreign_language_literatures,deg_percent_awarded_human_science,deg_percent_awarded_legal_profession,deg_percent_awarded_english
_lang,deg_percent_awarded_general_studies,deg_percent_awarded_library_sciences,deg_percent_awarded_bio_sciences,deg_percent_awarded_mathematics,deg_percent_awarded_military_tech,deg_percent_awarded_intedisciplinary_studies,deg_percent_awarded_leisure_fitness,deg_percent_awarded_philosophy,deg_percent_awarded_theology,deg_percent_awarded_physical_sciences,deg_percent_awarded_science_tech,deg_percent_awarded_psychology,deg_percent_awarded_homeland_security,deg_percent_awarded_public_admin,deg_percent_awarded_social_sciences,deg_percent_awarded_construction_trades,deg_percent_awarded_mechanic_repair,deg_percent_awarded_precision_production,deg_percent_awarded_transportation_materials,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking,avg_net_price,major_category,2017,2018,2019,Grand Total,roi_5yr,pct_roi_5yr,roi_10yr,pct_roi_10yr,roi_20yr,pct_roi_20yr,income_0_30000,income_30001_48000,income_48001_75000,income_75001_110000,income_over_110000
32633,180258.0,University of Providence,"Private, nonprofit",MT,59405-4996,Great Falls,7.0,1.0,3.0,3.0,Bachelors Degree,3,4511,Sociology.,1.0,0.0,20355.0,10031.0,7900.0,9600.0,0.5636,162.0,24.2762,1166.2046,2.0,5795.0,0.0,0.458,,0.2,,0.3333,0.5,0.6667,0.5,0.4409,0.4651,0.3123,0.614,0.4444,0.0097,0.0106,0.0342,0.0445,0.0422,0.005,0.0205,0.0799,0.7614,930.0,17685,20710,9000,7675,0.587,0.3913,0.0,0.0,0.0,0.0,0.0,0.0,0.0038,0.0,0.0577,0.0,0.0,0.0,0.0,0.0079,0.0,0.0269,0.0,0.0577,0.0231,0.0,0.0,0.0308,0.0038,0.0,0.0,0.0,0.0308,0.0692,0.0,0.0038,0.0,0.0,0.0,0.0,0.0,0.3278,0.1231,0.0038,2615.0,20355.0,Social Sciences,69607.0267,70888.9152,74486.2376,214982.1795,0.5501,55.0078,2.2806,228.0644,6.3009,630.0877,24.0,13.0,13.0,13.0,11.0
47762,215655.0,Robert Morris University,"Private, nonprofit",PA,15108-1189,Moon Township,2.0,1.0,3.0,3.0,Bachelors Degree,3,5203,Accounting and Related Services.,1.0,0.0,26027.0,19074.0,6652.0,13120.0,0.8413,890.0,25.0,1131.0,1.0,11873.0,0.0,0.6367,0.4667,0.6667,0.8333,0.375,0.8333,0.5111,0.4286,0.6801,0.7718,0.0759,0.8099,0.3333,0.0317,0.001,0.0111,0.0586,0.0278,0.0005,0.1046,0.0214,0.7419,4243.0,23250,27000,21500,22193,0.773,0.3145,0.0,0.0071,0.0,0.0,0.0357,0.0,0.0377,0.0,0.0102,0.1398,0.0,0.0,0.0,0.0,0.0061,0.0,0.0,0.0245,0.0265,0.0,0.0,0.0357,0.0,0.0,0.0,0.0,0.0326,0.0601,0.0,0.0173,0.0,0.0,0.0,0.0,0.0347,0.1549,0.3056,0.002,26.0,26027.0,Business,74813.9367,76724.0692,79608.3922,231146.3982,0.5176,51.7626,2.206,220.604,6.1284,612.8354,109.0,86.0,148.0,152.0,226.0
67126,236939.0,Washington State University,Public,WA,99164-5910,Pullman,8.0,1.0,3.0,3.0,Bachelors Degree,3,1101,Architectural Sciences and Technology.,2.0,15768.0,0.0,9648.0,11398.0,11398.0,0.7719,5380.0,23.0,1123.0,1.0,9888.0,0.0,0.5889,0.5773,0.3929,0.5397,0.4719,0.5102,0.6729,0.5938,0.6157,0.6418,0.1114,0.7935,0.4773,0.0619,0.0063,0.0606,0.0322,0.1528,0.0049,0.042,0.0258,0.6081,23167.0,15500,20000,14548,13657,0.4216,0.2969,0.029,0.0005,0.0058,0.0044,0.0714,0.0,0.0247,0.0,0.0264,0.1076,0.0,0.0042,0.0358,0.0,0.0177,0.0194,0.0,0.0621,0.008,0.0,0.0138,0.0308,0.0019,0.0,0.0182,0.0,0.0674,0.0299,0.0035,0.0987,0.0,0.0,0.0,0.0,0.0112,0.073,0.2003,0.0114,423.0,15768.0,Architecture,68643.5917,71344.2649,75609.81,215597.6665,0.6931,69.3097,2.5967,259.6705,7.0189,701.8932,554.0,308.0,310.0,291.0,500.0
66343,154855.0,Central Christian College of Kansas,"Private, nonprofit",KS,67460-5740,McPherson,4.0,1.0,3.0,3.0,Bachelors Degree,3,4001,Physical Sciences.,1.0,0.0,12131.0,11991.0,6652.0,8034.0,0.6506,,24.8084,1185.0,,5629.0,0.0,0.3694,0.3333,,1.0,0.2,0.4286,0.2941,0.2,0.5472,0.4811,0.0448,0.614,0.75,0.0284,0.0106,0.006,0.1868,0.1181,0.003,0.0553,0.0254,0.5546,849.0,14524,27000,14000,10645,0.7089,0.5823,0.0,0.0,0.0,0.0,0.0157,0.0,0.0,0.0,0.0262,0.0,0.0,0.0,0.0,0.0,0.0,0.0838,0.0,0.0157,0.0052,0.0,0.0,0.081,0.0,0.0126,0.0,0.0,0.0785,0.0831,0.0,0.0,0.0,0.0,0.0,0.0,0.0052,0.0733,0.2251,0.0,57.0,12131.0,Physical Sciences,60639.4142,60521.1408,62937.4603,184098.0152,0.5321,53.2071,2.2292,222.9189,6.1717,617.1721,51.0,22.0,26.0,11.0,19.0
65697,211352.0,Cabrini University,"Private, nonprofit",PA,19087-3698,Radnor,2.0,1.0,3.0,3.0,Bachelors Degree,3,1101,Area Studies.,1.0,0.0,26400.0,15245.0,10000.0,12500.0,0.7189,576.0,24.6577,1179.4013,5.0,6897.0,0.0,0.5703,0.6667,,0.3333,0.3519,0.4545,,0.5714,0.636,0.8511,0.0457,0.7114,1.0,0.0277,0.0013,0.0238,0.2161,0.1357,0.0006,0.0025,0.0534,0.5415,1686.0,18899,27000,15311,17500,0.813,0.4939,0.0,0.0,0.0,0.0101,0.0825,0.0,0.0147,0.0,0.1062,0.0,0.0,0.0,0.0,0.0,0.0446,0.0147,0.0,0.0513,0.011,0.0,0.0073,0.0476,0.011,0.0,0.011,0.0,0.0806,0.0,0.0256,0.1062,0.0,0.0,0.0,0.0,0.0256,0.0,0.2601,0.0273,152.0,26400.0,Interdisciplinary and Multi-Disciplinary Studi...,49517.4908,49623.8346,53218.4822,152359.8077,0.0005,0.0501,1.1228,112.2781,3.7299,372.9922,143.0,51.0,59.0,65.0,92.0


In [46]:
# Fit/transform the training set with the iterative imputer and keep the result
# in a new frame (`train` itself is not reassigned here).
# NOTE(review): which columns are imputed, and with what estimator/seed, is
# decided inside prepare.train_iterative_imputer -- confirm there.

train_imputed = prepare.train_iterative_imputer(train)
print(f'df shape: {train_imputed.shape}')

df shape: (38385, 112)


In [47]:
# Preview the imputed training frame.
# NOTE(review): the recorded preview contains graduate_number = -1019.181354
# (row 36032), a negative value in what is presumably a count column -- imputed
# values may need clipping to a valid range.

train_imputed.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,degree_name,degree_code,major_code,major_name,branch_number,avg_net_price_public,avg_net_price_private,full_time_net_tuition_revenue,off_campus_cost_of_attendace,on_campus_cost_of_attendace,admission_rate,graduate_number,ACT_score_mid,avg_sat_admitted,required_score,avg_faculty_salary,online_only,comp_rt_ft_150over_expected_time,comp_rt_ft_150over_expected_time_two_races,comp_rt_ft_150over_expected_time_native_american,comp_rt_ft_150over_expected_time_asian,comp_rt_ft_150over_expected_time_black,comp_rt_ft_150over_expected_time_hispanic,comp_rt_ft_150over_expected_time_non_resident,comp_rt_ft_150over_expected_time_unknown_race,comp_rt_ft_150over_expected_time_white,share_entering_students_first_ft,share_of_part_time,first_time_ft_student_retention,first_time_pt_student_retention,enrollment_share_two_races,enrollment_share_native_american,enrollment_share_asian,enrollment_share_black,enrollment_share_hispanic,enrollment_share_pac_islander,enrollment_share_non_resident,enrollment_share_unknown,enrollment_share_white,undergraduate_number_pell_grant_fedral_loan,med_debt_pell_students,median_debt_completed,median_debt_non_first_generation,median_debt_non_pell,fedral_loan_full_time_first_time_undergraduate,pell_grant_full_time_first_time_undergraduate,deg_percent_awarded_agriculture_operations,deg_percent_awarded_natural_resources,deg_percent_awarded_architecture,deg_percent_awarded_area_ethnic_cultural_gender,deg_percent_awarded_communication_journalism,deg_percent_awarded_communication_tech,deg_percent_awarded_computer_science,deg_percent_awarded_personal_culinary_services,deg_percent_awarded_education,deg_percent_awarded_engineering,deg_percent_awarded_engineering_tech,deg_percent_awarded_foreign_language_literatures,deg_percent_awarded_human_science,deg_percent_awarded_legal_profession,deg_percent_awarded_english
_lang,deg_percent_awarded_general_studies,deg_percent_awarded_library_sciences,deg_percent_awarded_bio_sciences,deg_percent_awarded_mathematics,deg_percent_awarded_military_tech,deg_percent_awarded_intedisciplinary_studies,deg_percent_awarded_leisure_fitness,deg_percent_awarded_philosophy,deg_percent_awarded_theology,deg_percent_awarded_physical_sciences,deg_percent_awarded_science_tech,deg_percent_awarded_psychology,deg_percent_awarded_homeland_security,deg_percent_awarded_public_admin,deg_percent_awarded_social_sciences,deg_percent_awarded_construction_trades,deg_percent_awarded_mechanic_repair,deg_percent_awarded_precision_production,deg_percent_awarded_transportation_materials,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking,avg_net_price,major_category,2017,2018,2019,Grand Total,roi_5yr,pct_roi_5yr,roi_10yr,pct_roi_10yr,roi_20yr,pct_roi_20yr,income_0_30000,income_30001_48000,income_48001_75000,income_75001_110000,income_over_110000
25656,168847.0,Baker College,"Private, nonprofit",MI,48867,Owosso,3.0,1.0,3.0,3.0,Bachelors Degree,3.0,2401.0,"Liberal Arts and Sciences, General Studies and...",2.0,0.0,12329.0,9932.0,7200.0,8029.0,0.8002,358.0,23.5303,1142.0209,2.0,5679.0,0.0,0.3217,0.2188,0.1,0.2857,0.1694,0.25,0.440496,0.2222,0.359,0.4595,0.3185,0.6154,0.438,0.0427,0.004,0.0151,0.1035,0.0386,0.0014,0.0029,0.0083,0.7864,11603.0,18723,24500,15500,13080,0.5894,0.6022,0.0028,0.0,0.0,0.0,0.0133,0.0004,0.0713,0.0,0.0111,0.0046,0.0154,0.0062,0.0354,0.0079,0.0,0.0102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0127,0.0287,0.0,0.0,0.0,0.0,0.0,0.0,0.0043,0.3227,0.265,0.0,655.0,12329.0,Liberal Arts and Humanities,53878.3275,56419.9476,55781.4792,166079.7543,0.3668,36.6816,1.8652,186.5177,5.346,534.5983,203.0,83.0,72.0,52.0,29.0
28625,216852.0,Widener University,"Private, nonprofit",PA,19013-5792,Chester,2.0,1.0,3.0,3.0,Bachelors Degree,3.0,4008.0,Physics.,1.0,0.0,29693.0,19740.0,8190.0,14446.0,0.692,3254.0,23.0,1115.0,1.0,8652.0,0.0,0.6335,0.375,1.0,0.5294,0.4272,0.4688,1.0,1.0,0.6897,0.844,0.0953,0.7753,0.1591,0.0381,0.0007,0.0331,0.1281,0.05,0.0,0.0203,0.0187,0.711,3436.0,25250,27000,23250,20322,0.8047,0.2458,0.0,0.0026,0.0,0.004,0.0356,0.0,0.0501,0.0,0.0264,0.1339,0.0,0.004,0.0,0.0079,0.0132,0.0184,0.0,0.0408,0.0053,0.0,0.0,0.0,0.0,0.0,0.0119,0.0,0.0896,0.0316,0.0211,0.0303,0.0,0.0,0.0,0.0,0.0026,0.2859,0.1621,0.0026,166.0,31305.0,Mathematics and Statistics,72585.5295,72475.0773,79448.6061,224509.2128,0.3791,37.9059,1.9377,193.7711,5.5586,555.8575,79.0,64.0,100.0,137.0,269.0
36032,199272.0,William Peace University,"Private, nonprofit",NC,27604-1194,Raleigh,5.0,1.0,3.0,3.0,Bachelors Degree,3.0,4301.0,Criminal Justice and Corrections.,1.0,0.0,24016.0,14497.0,7350.0,11100.0,0.5713,-1019.181354,21.0,1042.0,1.0,5849.0,0.0,0.3733,0.2188,0.4286,0.4286,0.3208,0.375,0.486883,0.2667,0.4568,0.6795,0.0715,0.6453,0.365036,0.0352,0.0079,0.0204,0.2581,0.1158,0.0011,0.0045,0.0704,0.4677,930.0,17500,25000,12500,12300,0.8079,0.532,0.0,0.0039,0.0,0.0,0.0829,0.0,0.0,0.0,0.0433,0.0,0.0,0.0,0.0,0.0039,0.0315,0.0551,0.0,0.0748,0.0,0.0,0.0157,0.0,0.0,0.0,0.0,0.0,0.1053,0.0,0.0,0.1024,0.0,0.0,0.0,0.0,0.063,0.0,0.3051,0.0,29.0,24016.0,Criminal Justice and Fire Protection,55074.8207,55066.1232,58901.8235,169042.7673,0.1508,15.0826,1.4399,143.9897,4.4345,443.4507,57.0,28.0,27.0,33.0,36.0
30254,215655.0,Robert Morris University,"Private, nonprofit",PA,15108-1189,Moon Township,2.0,1.0,3.0,3.0,Bachelors Degree,3.0,3105.0,Health and Physical Education/Fitness.,1.0,0.0,26027.0,19074.0,6650.0,13120.0,0.8413,890.0,25.0,1131.0,1.0,11873.0,0.0,0.6367,0.4667,0.6667,0.8333,0.375,0.8333,0.5111,0.4286,0.6801,0.7718,0.0759,0.8099,0.3333,0.0317,0.001,0.0111,0.0586,0.0287,0.0005,0.1043,0.0214,0.7419,4243.0,23250,27000,21500,22193,0.773,0.3145,0.0,0.0071,0.0,0.0,0.0357,0.0,0.0377,0.0,0.0102,0.1339,0.0,0.0,0.0,0.0,0.0061,0.0,0.0,0.0245,0.0265,0.0,0.0,0.0357,0.0,0.0,0.0,0.0,0.0326,0.0601,0.0,0.0173,0.0,0.0,0.0,0.0,0.0347,0.1549,0.3051,0.002,26.0,26027.0,"Physical Fitness, Parks, Recreation, and Leisure",46524.6177,47221.758,49787.2068,143533.5825,-0.0548,-5.4814,1.0011,100.1123,3.4541,345.6328,109.0,86.0,148.0,152.0,226.0
68185,147767.0,Northwestern University,"Private, nonprofit",IL,60208,Evanston,3.0,1.0,3.0,3.0,Bachelors Degree,3.0,4512.0,Urban Studies/Affairs.,1.0,0.0,24840.0,24489.0,14824.0,16626.0,0.3966,13485.0,29.0,1309.0,1.0,12035.0,0.0,0.8489,0.9333,0.5,0.9497,0.8594,0.8813,0.9023,0.8667,0.8786,0.9235,0.038,0.9416,0.731625,0.0568,0.0009,0.1279,0.0594,0.1252,0.0001,0.0921,0.0375,0.4546,8700.0,14500,15500,14500,14768,0.2923,0.1886,0.0,0.0048,0.0,0.0061,0.0829,0.0,0.0527,0.0,0.0039,0.1339,0.0,0.0136,0.0,0.0026,0.0127,0.0044,0.0,0.0882,0.0271,0.0,0.0057,0.0,0.0079,0.0,0.0136,0.0,0.0566,0.0,0.0373,0.1408,0.0,0.0,0.0,0.0,0.079,0.0132,0.0593,0.0154,191.0,24840.0,"Area, Ethnic, and Civilization Studies",53999.9345,56155.889,60997.5383,171153.3618,0.1611,16.1136,1.4789,147.886,4.54,454.0034,50.0,90.0,146.0,138.0,351.0


In [19]:
# Count the nulls still present in each column of the imputed training frame
remaining_nulls = train_imputed.isna().sum()

# Print every column (row limit lifted for this block only).
# NOTE(review): an earlier remark here said zip codes were still missing, but the
# recorded output lists zip_code with 0 nulls -- re-check on a fresh run.
with pd.option_context('display.max_rows', None):
    print(remaining_nulls)

unit_id_institution                                   0
college_name                                          0
institution_control                                   0
state_post_code                                       0
zip_code                                              0
city                                                  0
region_ipeds                                          0
title_IV_eligibility                                  0
pred_degree                                           0
pred_degree_0and4                                     0
degree_name                                           0
degree_code                                           0
major_code                                            0
major_name                                            0
branch_number                                         0
avg_net_price_public                                  0
avg_net_price_private                                 0
full_time_net_tuition_revenue                   

In [14]:
# Impute missing values in validate and test.
# `train` (the capped, not-yet-imputed frame) is passed in alongside them --
# presumably so the imputer is fit on training data only; confirm in
# prepare.impute_val_and_test.

validate_imputed, test_imputed = prepare.impute_val_and_test(train, validate, test)

print(f'validate shape: {validate_imputed.shape}')
print(f'test shape: {test_imputed.shape}')

validate shape: (16451, 112)
test shape: (13710, 112)


In [38]:
# NOTE(review): this import sits mid-notebook; it belongs with the other imports
# in the first cell so a fresh "Run All" surfaces a missing module immediately.
import cc_prepare
# Per-column null summary (num_rows_missing / percent_rows_missing) for the
# imputed training frame; only the first five rows of the summary are shown.
cc_prepare.nulls_by_col(train_imputed).head()

Unnamed: 0,num_rows_missing,percent_rows_missing
median_debt_non_pell,755,1.966914
med_debt_pell_students,755,1.966914
median_debt_non_first_generation,610,1.589162
median_debt_completed,547,1.425036
unit_id_institution,0,0.0


In [48]:
# Temporary measure: fill every remaining null with 0.
# Per the recorded null summary, the gaps are in four median-debt columns
# (median_debt_non_pell, med_debt_pell_students,
# median_debt_non_first_generation, median_debt_completed), under 2% of rows each.
# NOTE(review): 0 reads as "no debt" for those rows, and no matching fill is
# applied to validate_imputed / test_imputed in the cells shown -- revisit.
train_imputed = train_imputed.fillna(0)

In [40]:
# Re-run the null summary after the zero fill; the recorded output shows 0
# missing rows for the five columns displayed by .head()
cc_prepare.nulls_by_col(train_imputed).head()

Unnamed: 0,num_rows_missing,percent_rows_missing
unit_id_institution,0,0.0
college_name,0,0.0
deg_percent_awarded_psychology,0,0.0
deg_percent_awarded_science_tech,0,0.0
deg_percent_awarded_physical_sciences,0,0.0


In [49]:
# Row counts per numeric region_ipeds code (stored as floats such as 2.0, 5.0)
train_imputed.region_ipeds.value_counts()

2.0    10071
5.0     8647
3.0     6573
8.0     4389
4.0     4243
6.0     3099
7.0     1363
Name: region_ipeds, dtype: int64

In [50]:
# Translate the numeric region_ipeds codes into broad region names.
# One dictionary lookup replaces the earlier chain of ten np.where passes, which
# only worked because the first pass silently turned the integer codes into
# strings, and which crashed on astype(int) if the cell was run a second time.
REGION_NAMES = {
    0: 'northeast',
    1: 'northeast',
    2: 'northeast',
    3: 'midwest',
    4: 'midwest',
    5: 'southeast',
    6: 'southwest',
    7: 'west',
    8: 'west',
    9: 'territory',
}

# dict.get treats 2.0 and 2 as the same key, so the float codes need no cast.
# Anything not in the table (including labels produced by an earlier run of
# this cell) is returned unchanged, which makes the cell safe to re-run.
train_imputed['region_ipeds'] = train_imputed['region_ipeds'].map(
    lambda code: REGION_NAMES.get(code, code)
)

In [51]:
# Row counts per region name after relabelling (the recorded output shows five groups)
train_imputed.region_ipeds.value_counts()

midwest      10816
northeast    10071
southeast     8647
west          5752
southwest     3099
Name: region_ipeds, dtype: int64

In [52]:
# Full list of column names in the imputed training frame
train_imputed.columns.tolist()

['unit_id_institution',
 'college_name',
 'institution_control',
 'state_post_code',
 'zip_code',
 'city',
 'region_ipeds',
 'title_IV_eligibility',
 'pred_degree',
 'pred_degree_0and4',
 'degree_name',
 'degree_code',
 'major_code',
 'major_name',
 'branch_number',
 'avg_net_price_public',
 'avg_net_price_private',
 'full_time_net_tuition_revenue',
 'off_campus_cost_of_attendace',
 'on_campus_cost_of_attendace',
 'admission_rate',
 'graduate_number',
 'ACT_score_mid',
 'avg_sat_admitted',
 'required_score',
 'avg_faculty_salary',
 'online_only',
 'comp_rt_ft_150over_expected_time',
 'comp_rt_ft_150over_expected_time_two_races',
 'comp_rt_ft_150over_expected_time_native_american',
 'comp_rt_ft_150over_expected_time_asian',
 'comp_rt_ft_150over_expected_time_black',
 'comp_rt_ft_150over_expected_time_hispanic',
 'comp_rt_ft_150over_expected_time_non_resident',
 'comp_rt_ft_150over_expected_time_unknown_race',
 'comp_rt_ft_150over_expected_time_white',
 'share_entering_students_first_ft'

## Cluster 1: basic school information

In [None]:
# Candidate features for cluster 1 (basic school information).
# These used to be bare string literals that the cell evaluated and threw away,
# followed by a stray empty string; naming the list lets later cells reuse it.
cluster1_features = [
    'institution_control',
    'state_post_code',
    'title_IV_eligibility',
    'region_ipeds',
    'pred_degree',
    'admission_rate',
]
cluster1_features

In [None]:
# One-hot encode the categorical "basic info" features for the training set.
# The dummies are built from train_imputed -- the frame that was imputed and had
# region_ipeds relabelled to region names -- instead of the raw `train` split,
# whose region column still holds the float codes (2.0, 3.0, ...).
state_dummies_train = pd.get_dummies(train_imputed['state_post_code'])
region_dummies_train = pd.get_dummies(train_imputed['region_ipeds'])
institution_control_dummies_train = pd.get_dummies(train_imputed['institution_control'])