In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import math

# Default pandas float display format: field width 20, thousands separator, 2 decimals.
pd.options.display.float_format = '{:20,.2f}'.format

import warnings
# Silences every warning for the rest of the session (including pandas and
# scikit-learn deprecation notices); narrow or remove this filter when debugging.
warnings.filterwarnings("ignore")

# Wrangling
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.cluster import KMeans
from scipy import stats
import sklearn.preprocessing
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr, spearmanr, kruskal
from sklearn.preprocessing import MinMaxScaler

import csv
import cc_acquire
import cc_prepare
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import cc_explore

In [4]:
# Acquire the raw data and run it through the project's cleaning helpers.
# NOTE(review): what each cc_acquire / cc_prepare helper does internally is not
# visible in this notebook — descriptions below follow the helper names only.
df = cc_acquire.get_bach_df()
df = cc_prepare.clean_college_df(df)

# Second cleaning pass, then the net-price step (presumably adds an average
# net price column — confirm in cc_prepare).
cleaned_df = cc_prepare.clean_step1(df)
new_df = cc_prepare.avg_net_price(cleaned_df)
# Derive a major_category value for every row from its major_name.
new_df['major_category'] = new_df.major_name.apply(cc_prepare.categorize_major)

# Ensure you have `2017_2018_2019_earning_by_major.csv` within working folder

# Merge in the earnings data, then add the ROI columns (roi_* / pct_roi_*
# appear in the frame previews later in this notebook).
new_df = cc_prepare.earnings_merge(new_df)
new_df = cc_prepare.create_roi_cols(new_df)

dataframe shape: (71901, 139)


In [5]:
# Continue under the shorter name. This is an alias, not a copy: df and new_df
# refer to the same DataFrame object.
df = new_df

In [6]:
# Build the five merged family-income bracket objects, lowest bracket first.
(
    income_0_30000,
    income_30001_48000,
    income_48001_75000,
    income_75001_110000,
    income_over_110000,
) = cc_explore.create_merged_income_brackets()

In [7]:
# This loop can be run as a single block.
# Each pass hands one income bracket and its target column name to the helper,
# in the same order as before (lowest bracket first).
for income_bracket, fam_income_col in (
    (income_0_30000, "fam_income_0_30000"),
    (income_30001_48000, "fam_income_30001_48000"),
    (income_48001_75000, "fam_income_48001_75000"),
    (income_75001_110000, "fam_income_75001_110000"),
    (income_over_110000, "fam_income_over_110000"),
):
    df = cc_explore.get_fam_income_col(df, income_bracket, fam_income_col)

In [8]:
# Split df into the train / validate / test frames, then report each shape.
train, validate, test = cc_prepare.split_data(df)
for split_frame in (train, validate, test):
    print(split_frame.shape)

(38385, 134)
(16451, 134)
(13710, 134)


In [9]:
# Fill missing admissions metrics with the mean of the row's major_category.
# GroupBy.transform returns values aligned to train's own index, so the
# assignment lines up on every pandas version. The previous
# groupby(...).apply(lambda ...) form can return a result keyed by group
# (pandas >= 2.0), which no longer aligns with train on assignment.
# Rows whose group has no observed value keep their NaN.
for admissions_col in ('admission_rate', 'ACT_score_mid', 'avg_sat_admitted'):
    category_mean = train.groupby('major_category')[admissions_col].transform('mean')
    train[admissions_col] = train[admissions_col].fillna(category_mean)

In [10]:
# Show the first five rows of the per-column null summary for train.
cc_prepare.nulls_by_col(train).head()

Unnamed: 0,num_rows_missing,percent_rows_missing
med_parent_and_pell,37211,96.94
avg_parent_and_pell,36331,94.65
med_monthly_payment_parent_and_debt,34309,89.38
med_parent_and_loan,34309,89.38
avg_parent_and_loan,33218,86.54


In [11]:
# Shape of train before the sparse columns are dropped.
train.shape

(38385, 134)

In [12]:
# Columns to drop from train. The first five are 86-97% missing in the null
# summary above. NOTE(review): the remaining ones are presumably dropped for
# sparsity as well — confirm.
# 'comp_rt_ft_150over_expected_time_non_resident' was listed twice; the
# duplicate is removed. NOTE(review): check whether a different column was
# meant in its place.
# Only train is changed here; validate and test keep these columns.
cols = [
    'med_parent_and_pell',
    'avg_parent_and_pell',
    'med_monthly_payment_parent_and_debt',
    'med_parent_and_loan',
    'avg_parent_and_loan',
    'avg_stafford_and_no_pell_recipients',
    'avg_stafford_and_pell',
    'med_stafford_and_pell',
    'med_stafford_and_no_pell_recipients',
    'avg_stafford_and_debt',
    'med_stafford_and_debt',
    'med_stafford_and_grad_debt',
    'first_time_pt_student_retention',
    'comp_rt_ft_150over_expected_time_native_american',
    'comp_rt_ft_150over_expected_time_unknown_race',
    'comp_rt_ft_150over_expected_time_non_resident',
    'comp_rt_ft_150over_expected_time_two_races',
    'comp_rt_ft_150over_expected_time_asian',
]
train = train.drop(columns=cols)

In [13]:
# Shape of train after the drop, to confirm the column count fell.
train.shape

(38385, 116)

In [14]:
# Re-check the null summary after the drop (first ten rows).
cc_prepare.nulls_by_col(train).head(10)

Unnamed: 0,num_rows_missing,percent_rows_missing
title_IV_student_number,16998,44.28
graduate_number,5234,13.64
on_campus_cost_of_attendace,4000,10.42
required_score,3990,10.39
median_debt_independent,3248,8.46
non_deg_seeking,2870,7.48
comp_rt_ft_150over_expected_time_black,1907,4.97
comp_rt_ft_150over_expected_time_hispanic,1840,4.79
off_campus_cost_of_attendace,1667,4.34
comp_rt_ft_150over_expected_time_white,1540,4.01


In [17]:
# Fill missing student counts with the mean observed for the same college.
# GroupBy.transform keeps the result aligned to train's own index, whereas
# groupby(...).apply(lambda ...) can return a result keyed by group
# (pandas >= 2.0) that no longer aligns on assignment.
# Colleges with no observed value at all keep their NaN.
for count_col in ('title_IV_student_number', 'graduate_number'):
    college_mean = train.groupby('college_name')[count_col].transform('mean')
    train[count_col] = train[count_col].fillna(college_mean)

In [18]:
# Re-check the null summary after the per-college mean fill (first ten rows).
cc_prepare.nulls_by_col(train).head(10)

Unnamed: 0,num_rows_missing,percent_rows_missing
title_IV_student_number,16958,44.18
graduate_number,5113,13.32
on_campus_cost_of_attendace,4000,10.42
required_score,3990,10.39
median_debt_independent,3248,8.46
non_deg_seeking,2870,7.48
comp_rt_ft_150over_expected_time_black,1907,4.97
comp_rt_ft_150over_expected_time_hispanic,1840,4.79
off_campus_cost_of_attendace,1667,4.34
comp_rt_ft_150over_expected_time_white,1540,4.01


In [36]:
# Run the project's iterative-imputer helper on train and print the resulting
# shape. NOTE(review): which columns the helper imputes is not visible here —
# the null check that follows still reports missing values in some columns.
train_imputed = cc_explore.train_iterative_imputer(train)
print(train_imputed.shape)

(38385, 116)


In [20]:
# Check which columns still contain nulls after imputation (first five rows
# of the null summary).
cc_prepare.nulls_by_col(train_imputed).head(5)

Unnamed: 0,num_rows_missing,percent_rows_missing
median_debt_independent,3248,8.46
median_debt_female,1057,2.75
median_debt_male,1057,2.75
median_debt_30001_75000,935,2.44
median_debt_75001+,927,2.42


In [21]:
# Replace every remaining NaN with 0.
# NOTE(review): this zero-fills the median_debt_* columns that were still null
# after imputation, so "unknown debt" becomes "0 debt" — confirm this is intended.
train_imputed = train_imputed.fillna(0)

In [22]:
# Confirm that no nulls remain after the zero fill.
cc_prepare.nulls_by_col(train_imputed).head()

Unnamed: 0,num_rows_missing,percent_rows_missing
unit_id_institution,0,0.0
deg_percent_awarded_legal_profession,0,0.0
deg_percent_awarded_science_tech,0,0.0
deg_percent_awarded_physical_sciences,0,0.0
deg_percent_awarded_theology,0,0.0


### Start building clusters

In [28]:
# Preview the first rows of the imputed training frame.
train_imputed.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,pct_roi_5yr,roi_10yr,pct_roi_10yr,roi_20yr,pct_roi_20yr,fam_income_0_30000,fam_income_30001_48000,fam_income_48001_75000,fam_income_75001_110000,fam_income_over_110000
25656,168847.0,Baker College,"Private, nonprofit",MI,48867,Owosso,3.0,1.0,2.0,2.0,...,36.68,1.87,186.52,5.35,534.6,0.0,655.0,203.0,83.0,72.0
28625,216852.0,Widener University,"Private, nonprofit",PA,19013-5792,Chester,2.0,1.0,3.0,3.0,...,37.91,1.94,193.77,5.56,555.86,0.0,166.0,79.0,64.0,100.0
36032,199272.0,William Peace University,"Private, nonprofit",NC,27604-1194,Raleigh,5.0,1.0,3.0,3.0,...,15.08,1.44,143.99,4.43,443.45,0.0,29.0,57.0,28.0,27.0
30254,215655.0,Robert Morris University,"Private, nonprofit",PA,15108-1189,Moon Township,2.0,1.0,3.0,3.0,...,-5.48,1.0,100.11,3.45,345.41,0.0,26.0,109.0,86.0,148.0
68185,147767.0,Northwestern University,"Private, nonprofit",IL,60208,Evanston,3.0,1.0,3.0,3.0,...,16.11,1.48,147.89,4.54,454.0,0.02,191.0,50.0,90.0,146.0


In [56]:
# Row count for each distinct region_ipeds value.
train_imputed.region_ipeds.value_counts()

midwest      10816
northeast    10071
southeast     8647
west          5182
southwest     3099
territory      570
Name: region_ipeds, dtype: int64

In [37]:
# Cast the region codes to int (3 rather than 3.0) ahead of the translation
# to region names. The dtype guard keeps this cell re-runnable: once the
# column holds names such as 'midwest', astype(int) would raise ValueError.
if pd.api.types.is_numeric_dtype(train_imputed['region_ipeds']):
    train_imputed['region_ipeds'] = train_imputed['region_ipeds'].astype(int)

In [59]:
# Inspect the first ten rows whose state_post_code is 'GU' (University of Guam).
train_imputed[train_imputed.state_post_code == 'GU'].head(10)

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,pct_roi_5yr,roi_10yr,pct_roi_10yr,roi_20yr,pct_roi_20yr,fam_income_0_30000,fam_income_30001_48000,fam_income_48001_75000,fam_income_75001_110000,fam_income_over_110000
49328,240754.0,University of Guam,Public,GU,96923,Mangilao,territory,1.0,3.0,3.0,...,105.81,3.35,334.79,8.67,866.72,0.28,0.02,0.0,46.0,109.0
25079,240754.0,University of Guam,Public,GU,96923,Mangilao,territory,1.0,3.0,3.0,...,51.07,2.19,218.89,6.09,608.77,0.28,0.02,0.0,46.0,109.0
66162,240754.0,University of Guam,Public,GU,96923,Mangilao,territory,1.0,3.0,3.0,...,36.46,1.9,189.53,5.45,545.13,0.28,0.02,0.0,46.0,109.0
19539,240754.0,University of Guam,Public,GU,96923,Mangilao,territory,1.0,3.0,3.0,...,12.14,1.36,136.15,4.24,424.25,0.28,0.02,0.0,46.0,109.0
55550,240754.0,University of Guam,Public,GU,96923,Mangilao,territory,1.0,3.0,3.0,...,56.9,2.36,235.97,6.52,651.96,0.28,0.02,0.0,46.0,109.0
15569,240754.0,University of Guam,Public,GU,96923,Mangilao,territory,1.0,3.0,3.0,...,134.64,3.97,397.31,10.08,1007.5,0.28,0.02,0.0,46.0,109.0
30569,240754.0,University of Guam,Public,GU,96923,Mangilao,territory,1.0,3.0,3.0,...,28.18,1.71,171.38,5.04,504.05,0.28,0.02,0.0,46.0,109.0
34771,240754.0,University of Guam,Public,GU,96923,Mangilao,territory,1.0,3.0,3.0,...,91.9,3.06,306.14,8.04,803.84,0.28,0.02,0.0,46.0,109.0
49329,240754.0,University of Guam,Public,GU,96923,Mangilao,territory,1.0,3.0,3.0,...,105.81,3.35,334.79,8.67,866.72,0.28,0.02,0.0,46.0,109.0
22884,240754.0,University of Guam,Public,GU,96923,Mangilao,territory,1.0,3.0,3.0,...,13.86,1.4,139.9,4.33,432.72,0.28,0.02,0.0,46.0,109.0


In [55]:
# Translate region codes into readable region names with one lookup table.
# This replaces ten chained np.where calls that mixed an int comparison
# (== 3) with string comparisons (== '2', ...) and relied on numpy silently
# converting the remaining integer codes to strings after the first call.
# The code-to-name grouping is unchanged.
region_name_by_code = {
    '0': 'northeast',
    '1': 'northeast',
    '2': 'northeast',
    '3': 'midwest',
    '4': 'midwest',
    '5': 'southeast',
    '6': 'southwest',
    '7': 'west',
    '8': 'west',
    '9': 'territory',
}
# Codes are matched as text, so integer codes and already-translated names are
# both handled and the cell is safe to re-run. Values missing from the table
# are left as their string form. Float codes (e.g. 3.0) must be cast to int
# first, since '3.0' is not a key.
region_as_text = train_imputed['region_ipeds'].astype(str)
train_imputed['region_ipeds'] = region_as_text.map(
    lambda code: region_name_by_code.get(code, code)
)

In [25]:
# List every column name in train_imputed.
train_imputed.columns.tolist()

['unit_id_institution',
 'college_name',
 'institution_control',
 'state_post_code',
 'zip_code',
 'city',
 'region_ipeds',
 'title_IV_eligibility',
 'pred_degree',
 'pred_degree_0and4',
 'degree_name',
 'degree_code',
 'major_code',
 'major_name',
 'branch_number',
 'avg_net_price_public',
 'avg_net_price_private',
 'title_IV_student_number',
 'full_time_net_tuition_revenue',
 'off_campus_cost_of_attendace',
 'on_campus_cost_of_attendace',
 'admission_rate',
 'graduate_number',
 'ACT_score_mid',
 'avg_sat_admitted',
 'required_score',
 'avg_faculty_salary',
 'online_only',
 'comp_rt_ft_150over_expected_time',
 'comp_rt_ft_150over_expected_time_black',
 'comp_rt_ft_150over_expected_time_hispanic',
 'comp_rt_ft_150over_expected_time_white',
 'share_entering_students_first_ft',
 'share_of_part_time',
 'first_time_ft_student_retention',
 'enrollment_share_two_races',
 'enrollment_share_native_american',
 'enrollment_share_asian',
 'enrollment_share_black',
 'enrollment_share_hispanic',
 '

In [57]:
# One-hot encode the categorical school attributes.
# The dummies are built from train_imputed rather than train: train_imputed is
# the frame whose region_ipeds was translated to region names, and no visible
# cell recodes train, so train would yield dummy columns named after raw codes.
state_dummies_train = pd.get_dummies(train_imputed['state_post_code'])
region_dummies_train = pd.get_dummies(train_imputed['region_ipeds'])
institution_control_dummies_train = pd.get_dummies(train_imputed['institution_control'])

### Cluster 1: basic school information

In [None]:
'institution_control', 'state_post_code','title_IV_eligibility','region_ipeds','pred_degree','admission_rate',
''