# Capstone Project Working Notebook: David

In [1]:
# regular imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import math

# default pandas decimal number display format
pd.options.display.float_format = '{:20,.2f}'.format

import warnings
warnings.filterwarnings("ignore")

# Wrangling
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.cluster import KMeans
from scipy import stats
import sklearn.preprocessing
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr, spearmanr, kruskal

import csv
import ds_acquire
import ds_prepare


In [2]:
df = ds_acquire.get_bach_df()
df = ds_prepare.clean_col_names(df)
df = ds_prepare.clean_bach_df(df)

dataframe shape: (71901, 115)
modified df shape: (71901, 100)


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71901 entries, 0 to 224838
Data columns (total 100 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   unit_id_institution                               69674 non-null  Int32  
 1   college_name                                      71901 non-null  object 
 2   state_post_code                                   69012 non-null  object 
 3   zip_code                                          69012 non-null  object 
 4   city                                              69012 non-null  object 
 5   region_ipeds                                      69012 non-null  float64
 6   title_IV_eligibility                              69012 non-null  float64
 7   pred_degree                                       69012 non-null  float64
 8   pred_degree_0and4                                 68685 non-null  float64
 9   degree_name    

In [15]:
df.major_name.value_counts()

Business Administration, Management and Operations.           1870
Psychology, General.                                          1565
Biology, General.                                             1468
Liberal Arts and Sciences, General Studies and Humanities.    1434
English Language and Literature, General.                     1409
                                                              ... 
Construction Trades, Other.                                      1
Podiatric Medicine/Podiatry.                                     1
Physics and Astronomy.                                           1
Advanced/Graduate Dentistry and Oral Sciences.                   1
Social Psychology.                                               1
Name: major_name, Length: 375, dtype: int64

In [8]:
df.state_post_code.value_counts()

NY    4829
PA    4709
CA    4195
OH    3616
TX    3456
IL    2524
MA    2346
FL    2253
MI    2233
IN    2068
NC    1986
MO    1907
GA    1671
TN    1644
WI    1630
VA    1622
MN    1616
IA    1387
NJ    1202
SC    1177
WA    1164
CT    1152
KS    1099
KY    1072
OK    1071
AL    1057
PR     963
MD     961
OR     875
LA     859
CO     858
AR     814
NE     775
AZ     733
UT     702
WV     644
MS     584
ME     556
VT     537
NH     509
SD     486
ID     466
RI     435
DC     417
ND     398
NM     374
MT     335
HI     262
NV     249
DE     236
AK     153
WY      71
GU      37
VI      28
MP       6
AS       1
FM       1
MH       1
Name: state_post_code, dtype: int64

In [5]:
df.shape

(71901, 100)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,ACT_score_mid,admission_rate,required_score,avg_faculty_salary,comp_rt_ft_150over_expected_time_two_races,comp_rt_ft_150over_expected_time_native_american,comp_rt_ft_150over_expected_time_asian,comp_rt_ft_150over_expected_time_black,comp_rt_ft_150over_expected_time_hispanic,...,enrollment_share_black,enrollment_share_hispanic,enrollment_share_pac_islander,enrollment_share_non_resident,enrollment_share_unknown,enrollment_share_white,non_deg_seeking,unit_id_institution,not_completed_med_debt,zip_code
0,0,18.0,0.9,1.0,7101.0,0.25,,,0.27,0.25,...,0.91,0.01,0.0,0.01,0.04,0.02,153.0,100654.0,10250,35762
1,1,18.0,0.9,1.0,7101.0,0.25,,,0.27,0.25,...,0.91,0.01,0.0,0.01,0.04,0.02,153.0,100654.0,10250,35762
2,2,18.0,0.9,1.0,7101.0,0.25,,,0.27,0.25,...,0.91,0.01,0.0,0.01,0.04,0.02,153.0,100654.0,10250,35762
3,5,18.0,0.9,1.0,7101.0,0.25,,,0.27,0.25,...,0.91,0.01,0.0,0.01,0.04,0.02,153.0,100654.0,10250,35762
4,6,18.0,0.9,1.0,7101.0,0.25,,,0.27,0.25,...,0.91,0.01,0.0,0.01,0.04,0.02,153.0,100654.0,10250,35762


In [6]:
df.EARN_NE_MDN_3YR.value_counts()

AttributeError: 'DataFrame' object has no attribute 'EARN_NE_MDN_3YR'

In [None]:
df.NPT4_PUB.value_counts()

30,996.00    521
22,682.00    453
17,479.00    442
28,758.00    428
11,644.00    415
            ... 
29,652.00      1
16,054.00      1
10,604.00      1
 5,533.00      1
15,476.00      1
Name: NPT4_PUB, Length: 1784, dtype: int64

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,UNITID,OPEID6_x,INSTNM_x,CONTROL_x,MAIN_x,CIPCODE,CIPDESC,CREDLEV,CREDDESC,...,COUNT_WNE_MALE1_P8,MD_EARN_WNE_MALE1_P8,GT_THRESHOLD_P10,MD_EARN_WNE_INC1_P10,MD_EARN_WNE_INC2_P10,MD_EARN_WNE_INC3_P10,MD_EARN_WNE_INDEP1_P10,MD_EARN_WNE_INDEP0_P10,MD_EARN_WNE_MALE0_P10,MD_EARN_WNE_MALE1_P10
0,0,100654.0,1002,Alabama A & M University,Public,1,100,"Agriculture, General.",3,Bachelors Degree,...,834.0,36639.0,0.6,34076.0,35597.0,43145.0,40299.0,35424.0,36050.0,36377.0
1,1,100654.0,1002,Alabama A & M University,Public,1,109,Animal Sciences.,3,Bachelors Degree,...,834.0,36639.0,0.6,34076.0,35597.0,43145.0,40299.0,35424.0,36050.0,36377.0
2,2,100654.0,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,3,Bachelors Degree,...,834.0,36639.0,0.6,34076.0,35597.0,43145.0,40299.0,35424.0,36050.0,36377.0
3,3,100654.0,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,5,Master's Degree,...,834.0,36639.0,0.6,34076.0,35597.0,43145.0,40299.0,35424.0,36050.0,36377.0
4,4,100654.0,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,6,Doctoral Degree,...,834.0,36639.0,0.6,34076.0,35597.0,43145.0,40299.0,35424.0,36050.0,36377.0


In [None]:
# Breaking main df into simplified dfs by functionality
roi_df = initial_prepare.roi_df(df)
general_df = initial_prepare.general_df(df)
loan_df = initial_prepare.loan_df(df)


KeyError: ('MD_EARN_1YR', 'MD_EARN_2YR', 'MD_EARN_3YR', 'NPT4_PUB', 'NPT4_PRIV', 'NPT4_PROG', 'NPT4_OTHER', 'COSTT4_A', 'COSTT4_P', 'TUITIONFEE_IN', 'TUITIONFEE_OUT', 'PCTPELL', 'PCTFLOAN', 'BOOKSUPPLY', 'ROOMBOARD_ON', 'OTHEREXPENSE_ON', 'ROOMBOARD_OFF', 'OTHEREXPENSE_OFF', 'OTHEREXPENSE_FAM')

In [None]:
roi_df.shape

(224849, 21)

In [None]:
roi_df.head()

Unnamed: 0,UNITID,INSTNM_x,CITY,EARN_MDN_HI_1YR,EARN_MDN_HI_2YR,NPT4_PUB,NPT4_PRIV,NPT4_PROG,NPT4_OTHER,COSTT4_A,...,TUITIONFEE_IN,TUITIONFEE_OUT,PCTPELL,PCTFLOAN,BOOKSUPPLY,ROOMBOARD_ON,OTHEREXPENSE_ON,ROOMBOARD_OFF,OTHEREXPENSE_OFF,OTHEREXPENSE_FAM
0,100654,Alabama A & M University,Normal,PrivacySuppressed,,14444.0,,,,22489.0,...,9744.0,18354.0,0.7067,0.7503,1600.0,9128.0,2990.0,9128.0,2990.0,2990.0
1,100654,Alabama A & M University,Normal,PrivacySuppressed,,14444.0,,,,22489.0,...,9744.0,18354.0,0.7067,0.7503,1600.0,9128.0,2990.0,9128.0,2990.0,2990.0
2,100654,Alabama A & M University,Normal,PrivacySuppressed,,14444.0,,,,22489.0,...,9744.0,18354.0,0.7067,0.7503,1600.0,9128.0,2990.0,9128.0,2990.0,2990.0
3,100654,Alabama A & M University,Normal,PrivacySuppressed,,14444.0,,,,22489.0,...,9744.0,18354.0,0.7067,0.7503,1600.0,9128.0,2990.0,9128.0,2990.0,2990.0
4,100654,Alabama A & M University,Normal,PrivacySuppressed,,14444.0,,,,22489.0,...,9744.0,18354.0,0.7067,0.7503,1600.0,9128.0,2990.0,9128.0,2990.0,2990.0


In [None]:
roi_df.EARN_MDN_HI_1YR.value_counts()

PrivacySuppressed    170325
36958                    37
25606                    35
28574                    35
42593                    35
                      ...  
61481                     1
117773                    1
7704                      1
54765                     1
16994                     1
Name: EARN_MDN_HI_1YR, Length: 32073, dtype: int64

In [None]:
roi_df.EARN_MDN_HI_2YR.isnull().sum()

224849

In [None]:
roi_df

Unnamed: 0,UNITID,INSTNM_x,CITY,EARN_MDN_HI_1YR,EARN_MDN_HI_2YR,NPT4_PUB,NPT4_PRIV,NPT4_PROG,NPT4_OTHER,COSTT4_A,...,TUITIONFEE_IN,TUITIONFEE_OUT,PCTPELL,PCTFLOAN,BOOKSUPPLY,ROOMBOARD_ON,OTHEREXPENSE_ON,ROOMBOARD_OFF,OTHEREXPENSE_OFF,OTHEREXPENSE_FAM
0,100654,Alabama A & M University,Normal,PrivacySuppressed,,14444.0,,,,22489.0,...,9744.0,18354.0,0.7067,0.7503,1600.0,9128.0,2990.0,9128.0,2990.0,2990.0
1,100654,Alabama A & M University,Normal,PrivacySuppressed,,14444.0,,,,22489.0,...,9744.0,18354.0,0.7067,0.7503,1600.0,9128.0,2990.0,9128.0,2990.0,2990.0
2,100654,Alabama A & M University,Normal,PrivacySuppressed,,14444.0,,,,22489.0,...,9744.0,18354.0,0.7067,0.7503,1600.0,9128.0,2990.0,9128.0,2990.0,2990.0
3,100654,Alabama A & M University,Normal,PrivacySuppressed,,14444.0,,,,22489.0,...,9744.0,18354.0,0.7067,0.7503,1600.0,9128.0,2990.0,9128.0,2990.0,2990.0
4,100654,Alabama A & M University,Normal,PrivacySuppressed,,14444.0,,,,22489.0,...,9744.0,18354.0,0.7067,0.7503,1600.0,9128.0,2990.0,9128.0,2990.0,2990.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224844,,California Southern University,,PrivacySuppressed,,,,,,,...,,,,,,,,,,
224845,,California Southern University,,PrivacySuppressed,,,,,,,...,,,,,,,,,,
224846,,Zion Massage College,,PrivacySuppressed,,,,,,,...,,,,,,,,,,
224847,,Oregon Coast Community College,,PrivacySuppressed,,,,,,,...,,,,,,,,,,
