In [1]:
# regular imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import math

# Render floats with thousands separators and two decimals, right-aligned
# in a 20-character field.
pd.set_option("display.float_format", "{:20,.2f}".format)

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter and will also hide deprecation or
# convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")

# Wrangling
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.cluster import KMeans
from scipy import stats
import sklearn.preprocessing
from sklearn.metrics import mean_squared_error
from scipy import stats
from scipy.stats import pearsonr, spearmanr, kruskal
from scipy.stats.mstats import winsorize
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


import csv
import acquire
import prepare
import explore

In [2]:
# Fetch the frame from acquire.get_bach_df and run it straight through
# prepare.clean_college_df; the result is kept as `df`.
df = prepare.clean_college_df(acquire.get_bach_df())

dataframe shape: (71901, 139)


In [3]:
# First cleaning pass, then prepare.avg_net_price; the intermediate frame
# stays available as `cleaned_df`.
cleaned_df = prepare.clean_step1(df)
new_df = prepare.avg_net_price(cleaned_df)

# Label each row by applying prepare.categorize_major to its major_name.
major_categories = new_df["major_name"].apply(prepare.categorize_major)
new_df["major_category"] = major_categories

In [4]:
# Prerequisite: `2017_2018_2019_earning_by_major.csv` must be present in the
# working folder before this cell is run.
# NOTE(review): presumably read by prepare.earnings_merge — confirm.

# Merge the earnings data, then add the ROI columns on top of the result.
new_df = prepare.create_roi_cols(prepare.earnings_merge(new_df))

In [5]:
# Preview the first rows of new_df after the earnings merge and ROI step.
new_df.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,2017,2018,2019,Grand Total,roi_5yr,pct_roi_5yr,roi_10yr,pct_roi_10yr,roi_20yr,pct_roi_20yr
0,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,57605.69,55517.87,61388.93,174512.49,0.41,40.62,1.99,199.0,5.67,566.93
1,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,57605.69,55517.87,61388.93,174512.49,0.41,40.62,1.99,199.0,5.67,566.93
2,100858.0,Auburn University,Public,AL,36849,Auburn,5.0,1.0,3.0,3.0,...,57605.69,55517.87,61388.93,174512.49,0.2,19.89,1.55,154.92,4.69,468.62
3,100858.0,Auburn University,Public,AL,36849,Auburn,5.0,1.0,3.0,3.0,...,57605.69,55517.87,61388.93,174512.49,0.2,19.89,1.55,154.92,4.69,468.62
4,101541.0,Judson College,"Private, nonprofit",AL,36756,Marion,5.0,1.0,3.0,3.0,...,57605.69,55517.87,61388.93,174512.49,0.35,35.12,1.87,187.32,5.41,540.88


In [6]:
# Summary of new_df: index range, column count, dtype breakdown, memory use.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68546 entries, 0 to 68545
Columns: 149 entries, unit_id_institution to pct_roi_20yr
dtypes: float64(114), int64(2), object(33)
memory usage: 78.4+ MB


In [7]:
# Work on an independent copy for exploration. A plain `explore_df = new_df`
# only binds a second name to the same DataFrame, so any in-place change made
# through explore_df would also alter new_df.
explore_df = new_df.copy()

In [8]:
# (rows, columns) of explore_df before the fam_income_* columns are attached.
explore_df.shape

(68546, 149)

In [11]:
# explore.create_merged_income_brackets returns five objects, one per
# family-income bracket, unpacked here from lowest to highest bracket.
(
    income_0_30000,
    income_30001_48000,
    income_48001_75000,
    income_75001_110000,
    income_over_110000,
) = explore.create_merged_income_brackets()

In [14]:
# Target column name -> bracket data, in the order the columns are attached.
fam_income_sources = {
    "fam_income_0_30000": income_0_30000,
    "fam_income_30001_48000": income_30001_48000,
    "fam_income_48001_75000": income_48001_75000,
    "fam_income_75001_110000": income_75001_110000,
    "fam_income_over_110000": income_over_110000,
}

# Feed explore_df through explore.get_fam_income_col once per bracket; each
# call receives the frame returned by the previous one.
for fam_income_col, bracket_data in fam_income_sources.items():
    explore_df = explore.get_fam_income_col(explore_df, bracket_data, fam_income_col)

In [15]:
# (rows, columns) after the fam_income_* step.
# NOTE(review): the recorded outputs show the column count going from 149 to
# 134 while the row count stays the same — presumably get_fam_income_col
# drops other columns as it goes; confirm that this is intended.
explore_df.shape

(68546, 134)

In [16]:
# Split explore_df into train / validate / test via prepare.split_data,
# then report each split's shape in that order.
train, validate, test = prepare.split_data(explore_df)
for split_frame in (train, validate, test):
    print(split_frame.shape)

(38385, 134)
(16451, 134)
(13710, 134)


In [17]:
# Cap outliers on the training split only; validate and test are not passed
# through this step.
# NOTE(review): both positional arguments are 0.1; their meaning and units
# are defined by explore.percentile_capping — confirm.
capping_args = (0.1, 0.1)
train = explore.percentile_capping(train, *capping_args)

In [19]:
# Run the outlier-capped training split through explore.train_iterative_imputer
# and keep the result separately as train_imputed.
# NOTE(review): presumably fills missing values with sklearn's IterativeImputer
# (imported at the top of the notebook) — confirm in explore.py.
train_imputed = explore.train_iterative_imputer(train)

In [20]:
# Preview the first rows of the imputed training split.
train_imputed.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,pct_roi_5yr,roi_10yr,pct_roi_10yr,roi_20yr,pct_roi_20yr,fam_income_0_30000,fam_income_30001_48000,fam_income_48001_75000,fam_income_75001_110000,fam_income_over_110000
25656,168847.0,Baker College,"Private, nonprofit",MI,48867,Owosso,3.0,1.0,3.0,3.0,...,36.68,1.87,186.52,5.35,534.6,203.0,83.0,72.0,52.0,29.0
28625,216852.0,Widener University,"Private, nonprofit",PA,19013-5792,Chester,2.0,1.0,3.0,3.0,...,37.91,1.94,193.77,5.56,555.86,79.0,64.0,100.0,137.0,269.0
36032,199272.0,William Peace University,"Private, nonprofit",NC,27604-1194,Raleigh,5.0,1.0,3.0,3.0,...,15.08,1.44,143.99,4.43,443.45,57.0,28.0,27.0,33.0,36.0
30254,215655.0,Robert Morris University,"Private, nonprofit",PA,15108-1189,Moon Township,2.0,1.0,3.0,3.0,...,-5.4,1.0,100.32,3.46,345.63,109.0,86.0,148.0,152.0,226.0
68185,147767.0,Northwestern University,"Private, nonprofit",IL,60208,Evanston,3.0,1.0,3.0,3.0,...,16.11,1.48,147.89,4.54,454.0,50.0,90.0,146.0,138.0,351.0


In [21]:
# Produce the imputed validate and test splits.
# NOTE(review): the capped but un-imputed `train` is passed here, not
# train_imputed — presumably the helper fits on train itself; confirm.
imputed_holdouts = explore.impute_val_and_test(train, validate, test)
validate_imputed, test_imputed = imputed_holdouts

In [22]:
# Print the shape of each imputed split: train, validate, test.
for imputed_frame in (train_imputed, validate_imputed, test_imputed):
    print(imputed_frame.shape)

(38385, 134)
(16451, 134)
(13710, 134)
