In [1]:
# regular imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import math

# default pandas decimal number display format:
# field width 20, comma thousands separator, two decimal places
pd.options.display.float_format = '{:20,.2f}'.format

import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including any pandas emits while reading the CSV files below — consider
# narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Modeling, feature selection, and statistics
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.cluster import KMeans
from scipy import stats
import sklearn.preprocessing
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr, spearmanr, kruskal

import csv
import cc_acquire
import cc_prepare

In [2]:
# Acquire the raw frame, then run it through the two project cleaning steps
# in order: column-name cleanup first, then clean_bach_df.
df = cc_prepare.clean_bach_df(
    cc_prepare.clean_col_names(
        cc_acquire.get_bach_df()
    )
)

dataframe shape: (71901, 115)
modified df shape: (71901, 100)


In [3]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,unit_id_institution,college_name,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,degree_name,...,deg_percent_awarded_social_sciences,deg_percent_awarded_construction_trades,deg_percent_awarded_mechanic_repair,deg_percent_awarded_precision_production,deg_percent_awarded_transportation_materials,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking
0,100654,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,Bachelors Degree,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
1,100654,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,Bachelors Degree,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
2,100654,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,Bachelors Degree,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
5,100654,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,Bachelors Degree,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
6,100654,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,Bachelors Degree,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0


In [8]:
# Cache the cleaned frame on disk. The row index is written as the first
# CSV column (to_csv default), which the reload cell reads back via index_col=0.
# NOTE(review): the execution counts show this cell (In [8]) was run after the
# reload cell (In [6]); on a fresh kernel run the cells top to bottom.
df.to_csv('ba_cleaned.csv')

In [6]:
# Reload the cached frame, restoring the saved index from the first column.
# NOTE(review): the null summary printed further down lists a column named
# `deg_percent_awarded_history.1`, while the preview above shows
# `deg_percent_awarded_history` in that position. That suggests the frame
# carries a duplicated column name which read_csv renamed on load — confirm
# in cc_prepare.
df = pd.read_csv('ba_cleaned.csv' ,index_col=0)
df.shape

(71901, 100)

In [9]:
# Number of null values in each column of df.
df.isnull().sum()

unit_id_institution                               2227
college_name                                         0
state_post_code                                   2889
zip_code                                          2889
city                                              2889
                                                  ... 
deg_percent_awarded_visual_and_performing_arts    2898
deg_percent_awarded_health                        2898
deg_percent_awarded_business_management           2898
deg_percent_awarded_history.1                     2898
non_deg_seeking                                   8065
Length: 100, dtype: int64

In [10]:
def nulls_by_col(df):
    """Summarise missing values per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (indexed by column name) with:
        ``num_rows_missing`` -- count of null entries in that column;
        ``percent_rows_missing`` -- that count as a percentage of the
        number of rows in ``df``.
        Rows are sorted by ``num_rows_missing``, largest first.
    """
    total_rows = len(df)
    missing_counts = df.isna().sum()
    summary = pd.DataFrame(
        {
            'num_rows_missing': missing_counts,
            'percent_rows_missing': missing_counts.div(total_rows).mul(100),
        }
    )
    return summary.sort_values('num_rows_missing', ascending=False)

In [16]:
# Keep only the columns that actually have missing values.
# The per-column summary is built once and filtered through a callable mask,
# rather than calling nulls_by_col(df) twice (two full null scans of df).
missing_value = nulls_by_col(df).loc[lambda summary: summary.percent_rows_missing != 0]

In [21]:
# Columns with the most missing rows (the summary is sorted descending).
missing_value.head()

Unnamed: 0,num_rows_missing,percent_rows_missing
avg_net_price_private,33685,46.85
title_IV_student_number,33685,46.85
first_time_pt_student_retention,24298,33.79
comp_rt_ft_150over_expected_time_native_american,20295,28.23
ACT_score_mid,19527,27.16


In [22]:
# Persist the missing-value summary as CSV text.
# NOTE(review): the target file name has no `.csv` extension, unlike
# 'ba_cleaned.csv' above — confirm whether anything reads this exact path
# before renaming it.
missing_value.to_csv('ba_missing_value')

In [30]:
# see if we need to bring more features back
# Loads MERGED2018_19_PP.csv from the working directory; the cells below pull
# groups of its columns and count their null values.
x = pd.read_csv('MERGED2018_19_PP.csv')

In [38]:
# Column subsets of the raw file used for the null checks below:
#   x1 -> TUITIONFEE_IN / TUITIONFEE_OUT
#   x2 -> TUITIONFEE_PROG / TUITFTE
x1 = x.loc[:, ['TUITIONFEE_IN', 'TUITIONFEE_OUT']]
x2 = x.loc[:, ['TUITIONFEE_PROG', 'TUITFTE']]

In [39]:
# Preview the two TUITIONFEE_IN / TUITIONFEE_OUT columns.
x1.head()

Unnamed: 0,TUITIONFEE_IN,TUITIONFEE_OUT
0,9744.0,18354.0
1,8568.0,19704.0
2,6900.0,6900.0
3,10714.0,22362.0
4,11068.0,19396.0


In [40]:
# Null count per column of x1.
x1.isnull().sum()

TUITIONFEE_IN     2941
TUITIONFEE_OUT    3185
dtype: int64

In [34]:
# NOTE(review): the recorded output below reports 4 columns, but x1 is defined
# above with two columns. This cell's execution count (In [34]) predates the
# cell that defines x1 (In [38]), so the output is stale — re-run to refresh.
x1.shape

(6807, 4)

In [47]:
# Rows of x1 whose TUITIONFEE_IN value is null, then a structural summary
# showing how many of those rows still carry a TUITIONFEE_OUT value.
in_null = x1.loc[x1['TUITIONFEE_IN'].isna()]
in_null.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2941 entries, 7 to 6806
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   TUITIONFEE_IN   0 non-null      float64
 1   TUITIONFEE_OUT  0 non-null      float64
dtypes: float64(2)
memory usage: 68.9 KB


In [48]:
# Rows of x1 whose TUITIONFEE_OUT value is null, then a structural summary
# showing how many of those rows still carry a TUITIONFEE_IN value.
out_null = x1.loc[x1['TUITIONFEE_OUT'].isna()]
out_null.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3185 entries, 7 to 6806
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   TUITIONFEE_IN   244 non-null    float64
 1   TUITIONFEE_OUT  0 non-null      float64
dtypes: float64(2)
memory usage: 74.6 KB


In [49]:
# Null count per column of x2 (TUITIONFEE_PROG / TUITFTE).
x2.isnull().sum()

TUITIONFEE_PROG    4446
TUITFTE             503
dtype: int64

In [50]:
# Null counts for the COSTT4_A / COSTT4_P columns of the raw file.
x3 = x.loc[:, ['COSTT4_A', 'COSTT4_P']]
x3.isna().sum()

COSTT4_A    3376
COSTT4_P    4631
dtype: int64

In [51]:
# Null counts for the six MD*_PD / MD*_ALL columns of the raw file.
x4 = x.loc[
    :,
    ['MDCOMP_PD', 'MDCOST_PD', 'MDEARN_PD', 'MDCOMP_ALL', 'MDCOST_ALL', 'MDEARN_ALL'],
]
x4.isna().sum()

MDCOMP_PD     6807
MDCOST_PD     6807
MDEARN_PD     6807
MDCOMP_ALL    6807
MDCOST_ALL    6807
MDEARN_ALL    6807
dtype: int64

In [52]:
# Null counts for the BOOKSUPPLY, ROOMBOARD_* and OTHEREXPENSE_* columns.
x5 = x.loc[
    :,
    [
        'BOOKSUPPLY',
        'ROOMBOARD_ON',
        'OTHEREXPENSE_ON',
        'ROOMBOARD_OFF',
        'OTHEREXPENSE_OFF',
        'OTHEREXPENSE_FAM',
    ],
]
x5.isna().sum()

BOOKSUPPLY          3328
ROOMBOARD_ON        4806
OTHEREXPENSE_ON     4804
ROOMBOARD_OFF       3331
OTHEREXPENSE_OFF    3331
OTHEREXPENSE_FAM    3326
dtype: int64

In [62]:
# Null counts for the two ROOMBOARD_* columns on their own.
x6 = x.loc[:, ['ROOMBOARD_ON', 'ROOMBOARD_OFF']]
x6.isna().sum()

ROOMBOARD_ON     4806
ROOMBOARD_OFF    3331
dtype: int64

In [63]:
# Rows where both ROOMBOARD_ON and ROOMBOARD_OFF are null.
a = x6.loc[x6[['ROOMBOARD_ON', 'ROOMBOARD_OFF']].isna().all(axis=1)]
a.shape

(3254, 2)

In [64]:
# Null counts for OTHEREXPENSE_ON / OTHEREXPENSE_OFF on their own.
x7 = x.loc[:, ['OTHEREXPENSE_ON', 'OTHEREXPENSE_OFF']]
x7.isna().sum()

OTHEREXPENSE_ON     4804
OTHEREXPENSE_OFF    3331
dtype: int64

In [66]:
# Rows where both OTHEREXPENSE_ON and OTHEREXPENSE_OFF are null.
b = x7.loc[x7[['OTHEREXPENSE_ON', 'OTHEREXPENSE_OFF']].isna().all(axis=1)]
b.shape

(3253, 2)

In [69]:
# Null count for OTHEREXPENSE_FAM, kept as a one-column frame.
x8 = x.loc[:, ['OTHEREXPENSE_FAM']]
x8.isna().sum()

OTHEREXPENSE_FAM    3326
dtype: int64

In [70]:
# Null counts for the four NUM41_* columns.
x9 = x.loc[:, ['NUM41_PUB', 'NUM41_PRIV', 'NUM41_PROG', 'NUM41_OTHER']]
x9.isna().sum()

NUM41_PUB      4929
NUM41_PRIV     3079
NUM41_PROG     6807
NUM41_OTHER    6807
dtype: int64

In [71]:
# Rows where all four NUM41_* columns are null.
c = x9.loc[
    x9[['NUM41_PUB', 'NUM41_PRIV', 'NUM41_PROG', 'NUM41_OTHER']].isna().all(axis=1)
]
c.shape

(1201, 4)

In [75]:
# Null counts for the four NUM42_* columns.
x10 = x.loc[:, ['NUM42_PUB', 'NUM42_PRIV', 'NUM42_PROG', 'NUM42_OTHER']]
x10.isna().sum()

NUM42_PUB      4929
NUM42_PRIV     3079
NUM42_PROG     6807
NUM42_OTHER    6807
dtype: int64

In [76]:
# Rows where all four NUM42_* columns are null.
d = x10.loc[
    x10[['NUM42_PUB', 'NUM42_PRIV', 'NUM42_PROG', 'NUM42_OTHER']].isna().all(axis=1)
]
d.shape

(1201, 4)

In [78]:
# Null counts for the four NUM43_* columns.
x11 = x.loc[:, ['NUM43_PUB', 'NUM43_PRIV', 'NUM43_PROG', 'NUM43_OTHER']]
x11.isna().sum()

NUM43_PUB      4929
NUM43_PRIV     3079
NUM43_PROG     6807
NUM43_OTHER    6807
dtype: int64

In [79]:
# Rows where all four NUM43_* columns are null.
e = x11.loc[
    x11[['NUM43_PUB', 'NUM43_PRIV', 'NUM43_PROG', 'NUM43_OTHER']].isna().all(axis=1)
]
e.shape

(1201, 4)

In [80]:
# Null counts for the four NUM44_* columns.
x12 = x.loc[:, ['NUM44_PUB', 'NUM44_PRIV', 'NUM44_PROG', 'NUM44_OTHER']]
x12.isna().sum()

NUM44_PUB      4929
NUM44_PRIV     3079
NUM44_PROG     6807
NUM44_OTHER    6807
dtype: int64

In [81]:
# Rows where all four NUM44_* columns are null.
f = x12.loc[
    x12[['NUM44_PUB', 'NUM44_PRIV', 'NUM44_PROG', 'NUM44_OTHER']].isna().all(axis=1)
]
f.shape

(1201, 4)

In [82]:
# Null counts for the four NUM45_* columns.
x13 = x.loc[:, ['NUM45_PUB', 'NUM45_PRIV', 'NUM45_PROG', 'NUM45_OTHER']]
x13.isna().sum()

NUM45_PUB      4929
NUM45_PRIV     3079
NUM45_PROG     6807
NUM45_OTHER    6807
dtype: int64

In [83]:
# Rows where all four NUM45_* columns are null.
g = x13.loc[
    x13[['NUM45_PUB', 'NUM45_PRIV', 'NUM45_PROG', 'NUM45_OTHER']].isna().all(axis=1)
]
g.shape

(1201, 4)