In [1]:
# regular imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import math

# default pandas decimal number display format
pd.options.display.float_format = '{:20,.2f}'.format

import warnings
warnings.filterwarnings("ignore")

# Wrangling
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.cluster import KMeans
from scipy import stats
import sklearn.preprocessing
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr, spearmanr, kruskal

import csv
import acquire
import prepare
import ds_acquire
import ds_prepare


In [2]:
df = acquire.get_bach_df()
df = prepare.clean_college_df(df)

dataframe shape: (71901, 119)


In [3]:
df['major_category'] = df.major_name.apply(prepare.categorize_major)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71901 entries, 0 to 224838
Columns: 120 entries, unit_id_institution to major_category
dtypes: Int32(1), float64(84), int64(2), object(33)
memory usage: 66.2+ MB


In [5]:
df.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,deg_percent_awarded_construction_trades,deg_percent_awarded_mechanic_repair,deg_percent_awarded_precision_production,deg_percent_awarded_transportation_materials,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking,major_category
0,100654,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Agriculture
1,100654,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Biology and Life Sciences
2,100654,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Biology and Life Sciences
5,100654,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Biology and Life Sciences
6,100654,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Agriculture


# _______________________________________________________________________________________________ #
### Initial `earnings_df`

In [6]:
earnings_df = pd.read_csv('earnings_df.csv')

In [7]:
earnings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707462 entries, 0 to 707461
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   Unnamed: 0                707462 non-null  int64 
 1   earning_year              707462 non-null  int64 
 2   earnings_degree           707462 non-null  int64 
 3   earnings_school_type      707462 non-null  int64 
 4   earnings_major            707462 non-null  int64 
 5   earnings_wage/salary      707462 non-null  int64 
 6   EMPSTAT                   707462 non-null  int64 
 7   METRO                     707462 non-null  int64 
 8   SEX                       707462 non-null  int64 
 9   AGE                       707462 non-null  int64 
 10  earnings_race             707462 non-null  int64 
 11  earnings_speaks_english   707462 non-null  int64 
 12  LANGUAGE                  707462 non-null  int64 
 13  earnings_specific_degree  707462 non-null  int64 
 14  stat

In [8]:
earnings_df.head()

Unnamed: 0.1,Unnamed: 0,earning_year,earnings_degree,earnings_school_type,earnings_major,earnings_wage/salary,EMPSTAT,METRO,SEX,AGE,earnings_race,earnings_speaks_english,LANGUAGE,earnings_specific_degree,state_post_code,major_category
0,2762990,2017,101,1,61,38500,1,0,2,31,1,3,1,6100,AL,Medical and Health Sciences and Services
1,2763006,2017,101,1,62,120000,1,4,1,30,2,3,1,6203,AL,Business
2,2763007,2017,101,1,40,50000,1,4,1,26,2,3,1,4002,AL,Interdisciplinary and Multi-Disciplinary Studi...
3,2763029,2017,101,1,61,65000,1,4,2,49,1,3,1,6107,AL,Medical and Health Sciences and Services
4,2763031,2017,101,1,33,42000,1,4,2,34,1,3,1,3301,AL,"English Language, Literature, and Composition"


# _______________________________________________________________________________________________ #
### `earnings_df` pivot table to merge with main df
Newly created pivot table with median earnings by major_category, split by year

In [9]:
earnings_pivot_merge = pd.read_csv('2017_2018_2019_earning_by_major.csv')

In [10]:
earnings_pivot_merge

Unnamed: 0,major_category,2017,2018,2019,Grand Total
0,Agriculture,57605.69,55517.87,61388.93,174512.49
1,Architecture,68643.59,71344.26,75609.81,215597.67
2,"Area, Ethnic, and Civilization Studies",53999.93,56155.89,60997.54,171153.36
3,Biology and Life Sciences,48851.91,50004.54,53463.29,152319.74
4,Business,74813.94,76724.07,79608.39,231146.4
5,Communication Technologies,50630.58,53303.0,56882.68,160816.25
6,Communications,61311.56,63458.16,66997.75,191767.47
7,Computer and Information Sciences,83482.41,87552.61,91321.98,262357.0
8,Construction Services,85101.29,85776.3,91583.25,262460.84
9,Cosmetology Services and Culinary Arts,42217.78,45696.22,48408.92,136322.92


In [11]:
new_df = df.merge(earnings_pivot_merge, how='inner', on='major_category')

In [12]:
new_df.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking,major_category,2017,2018,2019,Grand Total
0,100654,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.02,0.0,0.16,0.0,153.0,Agriculture,57605.69,55517.87,61388.93,174512.49
1,100654,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.02,0.0,0.16,0.0,153.0,Agriculture,57605.69,55517.87,61388.93,174512.49
2,100858,Auburn University,Public,AL,36849,Auburn,5.0,1.0,3.0,3.0,...,0.03,0.06,0.23,0.01,481.0,Agriculture,57605.69,55517.87,61388.93,174512.49
3,100858,Auburn University,Public,AL,36849,Auburn,5.0,1.0,3.0,3.0,...,0.03,0.06,0.23,0.01,481.0,Agriculture,57605.69,55517.87,61388.93,174512.49
4,101541,Judson College,"Private, nonprofit",AL,36756,Marion,5.0,1.0,3.0,3.0,...,0.07,0.24,0.1,0.04,9.0,Agriculture,57605.69,55517.87,61388.93,174512.49


In [13]:
new_df.to_csv('merged_df.csv')

In [17]:
new_df.avg_net_price_public.isnull().sum()

42789

In [16]:
new_df.avg_net_price_public.value_counts()

15,020.00    153
15,970.00    140
30,996.00    138
12,566.00    136
17,704.00    125
            ... 
 6,526.00      1
 4,646.00      1
 8,324.00      1
 3,411.00      1
 4,433.00      1
Name: avg_net_price_public, Length: 759, dtype: int64

In [None]:
new_df.groupby(new_df[''])