# Capstone Project Working Notebook: David

## Acquire and Prepare Simplified Path
Expedition process to fully functional exploration-ready df

In [1]:
# regular imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import math


# default pandas decimal number display format
pd.options.display.float_format = '{:20,.2f}'.format

import warnings
warnings.filterwarnings("ignore")

# Wrangling
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.cluster import KMeans
from scipy import stats
import sklearn.preprocessing
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr, spearmanr, kruskal

import csv
import acquire
import prepare
import ds_acquire
import ds_prepare


In [2]:
df = acquire.get_bach_df()
df = prepare.clean_college_df(df)

dataframe shape: (71901, 139)


In [3]:
df.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,deg_percent_awarded_social_sciences,deg_percent_awarded_construction_trades,deg_percent_awarded_mechanic_repair,deg_percent_awarded_precision_production,deg_percent_awarded_transportation_materials,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking
0,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
1,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
2,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
5,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
6,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71901 entries, 0 to 224838
Columns: 139 entries, unit_id_institution to non_deg_seeking
dtypes: float64(105), int64(2), object(32)
memory usage: 76.8+ MB


In [5]:
cleaned_df = prepare.clean_step1(df)

In [6]:
new_df = prepare.avg_net_price(cleaned_df)

In [7]:
new_df.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,deg_percent_awarded_construction_trades,deg_percent_awarded_mechanic_repair,deg_percent_awarded_precision_production,deg_percent_awarded_transportation_materials,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking,avg_net_price
0,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,14444.0
1,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,14444.0
2,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,14444.0
5,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,14444.0
6,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,14444.0


In [8]:
new_df['major_category'] = new_df.major_name.apply(prepare.categorize_major)

In [9]:
# earnings_pivot_merge = pd.read_csv('2017_2018_2019_earning_by_major.csv', index_col=0)

In [10]:
# earnings_pivot_merge

In [11]:
# new_df = new_df.merge(earnings_pivot_merge, how='inner', on='major_category')

In [12]:
new_df = prepare.earnings_merge(new_df)

In [13]:
new_df = prepare.create_roi_cols(new_df)

In [14]:
new_df.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,2017,2018,2019,Grand Total,roi_5yr,pct_roi_5yr,roi_10yr,pct_roi_10yr,roi_20yr,pct_roi_20yr
0,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,57605.69,55517.87,61388.93,174512.49,0.41,40.62,1.99,199.0,5.67,566.93
1,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,57605.69,55517.87,61388.93,174512.49,0.41,40.62,1.99,199.0,5.67,566.93
2,100858.0,Auburn University,Public,AL,36849,Auburn,5.0,1.0,3.0,3.0,...,57605.69,55517.87,61388.93,174512.49,0.2,19.89,1.55,154.92,4.69,468.62
3,100858.0,Auburn University,Public,AL,36849,Auburn,5.0,1.0,3.0,3.0,...,57605.69,55517.87,61388.93,174512.49,0.2,19.89,1.55,154.92,4.69,468.62
4,101541.0,Judson College,"Private, nonprofit",AL,36756,Marion,5.0,1.0,3.0,3.0,...,57605.69,55517.87,61388.93,174512.49,0.35,35.12,1.87,187.32,5.41,540.88


In [15]:
new_df.shape

(68546, 145)

In [15]:
new_df.to_csv('df_exploration_ready_with_ROI.csv', index = True)

In [16]:
test_df = pd.read_csv('df_exploration_ready_with_ROI.csv')

In [17]:
test_df

Unnamed: 0.1,Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,...,2017,2018,2019,Grand Total,roi_5yr,pct_roi_5yr,roi_10yr,pct_roi_10yr,roi_20yr,pct_roi_20yr
0,0,100654.00,Alabama A & M University,Public,AL,35762,Normal,5.00,1.00,3.00,...,57605.69,55517.87,61388.93,174512.49,0.41,40.62,1.99,199.00,5.67,566.93
1,1,100654.00,Alabama A & M University,Public,AL,35762,Normal,5.00,1.00,3.00,...,57605.69,55517.87,61388.93,174512.49,0.41,40.62,1.99,199.00,5.67,566.93
2,2,100858.00,Auburn University,Public,AL,36849,Auburn,5.00,1.00,3.00,...,57605.69,55517.87,61388.93,174512.49,0.20,19.89,1.55,154.92,4.69,468.62
3,3,100858.00,Auburn University,Public,AL,36849,Auburn,5.00,1.00,3.00,...,57605.69,55517.87,61388.93,174512.49,0.20,19.89,1.55,154.92,4.69,468.62
4,4,101541.00,Judson College,"Private, nonprofit",AL,36756,Marion,5.00,1.00,3.00,...,57605.69,55517.87,61388.93,174512.49,0.35,35.12,1.87,187.32,5.41,540.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68541,68541,211644.00,Clarion University of Pennsylvania,Public,PA,16214,Clarion,2.00,1.00,3.00,...,37639.57,39932.26,31651.85,109223.68,-0.26,-25.97,0.48,48.22,2.21,220.56
68542,68542,213349.00,Kutztown University of Pennsylvania,Public,PA,19530-0730,Kutztown,2.00,1.00,3.00,...,37639.57,39932.26,31651.85,109223.68,-0.29,-28.92,0.42,42.31,2.08,207.78
68543,68543,230816.00,Bennington College,"Private, nonprofit",VT,05201-6003,Bennington,1.00,1.00,3.00,...,37639.57,39932.26,31651.85,109223.68,-0.36,-36.30,0.28,27.54,1.76,175.84
68544,68544,232025.00,Emory & Henry College,"Private, nonprofit",VA,24327-0947,Emory,5.00,1.00,3.00,...,37639.57,39932.26,31651.85,109223.68,-0.24,-23.83,0.53,52.50,2.30,229.82


# _______________________________________________________________________ #

In [4]:
df.major_name.value_counts()

Business Administration, Management and Operations.           1870
Psychology, General.                                          1565
Biology, General.                                             1468
Liberal Arts and Sciences, General Studies and Humanities.    1434
English Language and Literature, General.                     1409
                                                              ... 
Construction Trades, Other.                                      1
Podiatric Medicine/Podiatry.                                     1
Physics and Astronomy.                                           1
Advanced/Graduate Dentistry and Oral Sciences.                   1
Social Psychology.                                               1
Name: major_name, Length: 375, dtype: int64

In [5]:
df.state_post_code.value_counts()

NY    4829
PA    4709
CA    4195
OH    3616
TX    3456
IL    2524
MA    2346
FL    2253
MI    2233
IN    2068
NC    1986
MO    1907
GA    1671
TN    1644
WI    1630
VA    1622
MN    1616
IA    1387
NJ    1202
SC    1177
WA    1164
CT    1152
KS    1099
KY    1072
OK    1071
AL    1057
PR     963
MD     961
OR     875
LA     859
CO     858
AR     814
NE     775
AZ     733
UT     702
WV     644
MS     584
ME     556
VT     537
NH     509
SD     486
ID     466
RI     435
DC     417
ND     398
NM     374
MT     335
HI     262
NV     249
DE     236
AK     153
WY      71
GU      37
VI      28
MP       6
AS       1
FM       1
MH       1
Name: state_post_code, dtype: int64

In [6]:
df.shape

(71901, 101)

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,unit_id_institution,college_name,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,deg_percent_awarded_social_sciences,deg_percent_awarded_construction_trades,deg_percent_awarded_mechanic_repair,deg_percent_awarded_precision_production,deg_percent_awarded_transportation_materials,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking
0,0,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
1,1,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
2,2,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
3,5,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
4,6,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0


In [8]:
# Forming main df into simplified df with new col names
new_df = ds_prepare.clean_col_names(df)

In [9]:
new_df.shape

(71901, 101)

In [10]:
new_df.head()

Unnamed: 0.1,Unnamed: 0,unit_id_institution,college_name,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,deg_percent_awarded_social_sciences,deg_percent_awarded_construction_trades,deg_percent_awarded_mechanic_repair,deg_percent_awarded_precision_production,deg_percent_awarded_transportation_materials,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking
0,0,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
1,1,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
2,2,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
3,5,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
4,6,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0


In [11]:
print(new_df.columns)

Index(['Unnamed: 0', 'unit_id_institution', 'college_name', 'state_post_code',
       'zip_code', 'city', 'region_ipeds', 'title_IV_eligibility',
       'pred_degree', 'pred_degree_0and4',
       ...
       'deg_percent_awarded_social_sciences',
       'deg_percent_awarded_construction_trades',
       'deg_percent_awarded_mechanic_repair',
       'deg_percent_awarded_precision_production',
       'deg_percent_awarded_transportation_materials',
       'deg_percent_awarded_visual_and_performing_arts',
       'deg_percent_awarded_health', 'deg_percent_awarded_business_management',
       'deg_percent_awarded_history', 'non_deg_seeking'],
      dtype='object', length=101)


In [12]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71901 entries, 0 to 71900
Columns: 101 entries, Unnamed: 0 to non_deg_seeking
dtypes: float64(79), int64(3), object(19)
memory usage: 55.4+ MB


In [19]:
new_df.title_IV_eligibility.value_counts()

1.00    68438
2.00      316
3.00      190
5.00       68
Name: title_IV_eligibility, dtype: int64

In [13]:
new_df.major_name.value_counts()

Business Administration, Management and Operations.           1870
Psychology, General.                                          1565
Biology, General.                                             1468
Liberal Arts and Sciences, General Studies and Humanities.    1434
English Language and Literature, General.                     1409
                                                              ... 
Construction Trades, Other.                                      1
Podiatric Medicine/Podiatry.                                     1
Physics and Astronomy.                                           1
Advanced/Graduate Dentistry and Oral Sciences.                   1
Social Psychology.                                               1
Name: major_name, Length: 375, dtype: int64

In [14]:
major_list = new_df.major_name.unique().tolist()

In [15]:
major_list

['Agriculture, General.',
 'Animal Sciences.',
 'Food Science and Technology.',
 'Plant Sciences.',
 'Agriculture, Agriculture Operations, and Related Sciences, Other.',
 'Forestry.',
 'City/Urban, Community and Regional Planning.',
 'Audiovisual Communications Technologies/Technicians.',
 'Computer and Information Sciences, General.',
 'Special Education and Teaching.',
 'Teacher Education and Professional Development, Specific Levels and Methods.',
 'Teacher Education and Professional Development, Specific Subject Areas.',
 'Civil Engineering.',
 'Electrical, Electronics and Communications Engineering.',
 'Mechanical Engineering.',
 'Electrical Engineering Technologies/Technicians.',
 'Mechanical Engineering Related Technologies/Technicians.',
 'Construction Engineering Technologies.',
 'Family and Consumer Sciences/Human Sciences, General.',
 'English Language and Literature, General.',
 'Liberal Arts and Sciences, General Studies and Humanities.',
 'Biology, General.',
 'Mathematic

In [16]:
new_df.major_name.value_counts()

Business Administration, Management and Operations.           1870
Psychology, General.                                          1565
Biology, General.                                             1468
Liberal Arts and Sciences, General Studies and Humanities.    1434
English Language and Literature, General.                     1409
                                                              ... 
Construction Trades, Other.                                      1
Podiatric Medicine/Podiatry.                                     1
Physics and Astronomy.                                           1
Advanced/Graduate Dentistry and Oral Sciences.                   1
Social Psychology.                                               1
Name: major_name, Length: 375, dtype: int64

data = [major_list]

file = open('major_list.csv', 'w+', newline='')

with file:
    write = csv.writer(file)
    write.writerows(data)


In [17]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71901 entries, 0 to 71900
Columns: 101 entries, Unnamed: 0 to non_deg_seeking
dtypes: float64(79), int64(3), object(19)
memory usage: 55.4+ MB


In [18]:
new_df.head()

Unnamed: 0.1,Unnamed: 0,unit_id_institution,college_name,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,deg_percent_awarded_social_sciences,deg_percent_awarded_construction_trades,deg_percent_awarded_mechanic_repair,deg_percent_awarded_precision_production,deg_percent_awarded_transportation_materials,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking
0,0,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
1,1,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
2,2,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
3,5,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0
4,6,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.04,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0


In [16]:
new_df['major_category'] = new_df['major_name']({
    'Agriculture': ['Botany/Plant Biology.','Agricultural Engineering.','Applied Horticulture and Horticultural Business Services.','Agriculture/Veterinary Preparatory Programs.','Soil Sciences.','Agriculture, General.', 'Agriculture, Agriculture Operations, and Related Sciences, Other.', 'Agricultural Production Operations.', 'Agricultural and Domestic Animal Services.','Agricultural Public Services.','Agricultural Mechanization.','International Agriculture.','Agricultural and Food Products Processing.'],
    'Environment and Natural Resources': ['Forest Engineering.','Environmental Control Technologies/Technicians.','Archeology.','Geological/Geophysical Engineering.','Mining and Mineral Engineering.','Natural Resources and Conservation, Other.','Fishing and Fisheries Sciences and Management.','Sustainability Studies.','Historic Preservation and Conservation.','Surveying Engineering.','Ocean Engineering.','Geography and Cartography.','Wildlife and Wildlands Science and Management.','Natural Resources Management and Policy.','Geological and Earth Sciences/Geosciences.','Environmental/Environmental Health Engineering.','Natural Resources Conservation and Research.','Forestry.'],
    'Architecture': ['Drafting/Design Engineering Technologies/Technicians.','Architecture and Related Services, Other.','Architectural Sciences and Technology.','Interior Architecture.','Architectural Engineering.','Architecture.','Environmental Design.'],
    'Area, Ethnic, and Civilization Studies': ['Demography and Population Studies.','Medieval and Renaissance Studies.','Classical and Ancient Studies.','Bilingual, Multilingual, and Multicultural Education.','Museology/Museum Studies.','Science, Technology and Society.','Urban Studies/Affairs.','Cultural Studies/Critical Theory and Analysis.','African Languages, Literatures, and Linguistics.', 'Turkic, Uralic-Altaic, Caucasian, and Central Asian Languages, Literatures, and Linguistics.'],
    'Communications': ['Telecommunications Management.','Communication, Journalism, and Related Programs, Other.','Communication and Media Studies.','Public Relations, Advertising, and Applied Communication.'],
    'Communication Technologies': ['Communications Technologies/Technicians and Support Services, Other.','Educational/Instructional Media Design.','Graphic Communications.','Communications Technology/Technician.','Audiovisual Communications Technologies/Technicians.','Radio, Television, and Digital Communication.'],
    'Computer and Information Sciences': ['Accounting and Computer Science.','Human Computer Interaction.','Data Processing.','Computational Science.','Computer Software and Media Applications.','Computer and Information Sciences and Support Services, Other.','Computer Engineering Technologies/Technicians.','Computer Systems Analysis.','Computer Systems Networking and Telecommunications.','Computer Programming.','Computer/Information Technology Administration and Management.','Computer Science.','Information Science/Studies.','Computer Engineering.','Computer and Information Sciences, General.','Management Information Systems and Services.'],
    'Cosmetology Services and Culinary Arts': ['Personal and Culinary Services, Other.','Cosmetology and Related Personal Grooming Services.','Nutrition Sciences.','Culinary Arts and Related Services.'],
    'Education Administration and Teaching': ['Basic Skills and Developmental/Remedial Education.','Curriculum and Instruction.','High School/Secondary Diploma Programs.','High School/Secondary Certificate Programs.','Social and Philosophical Foundations of Education.','Teaching Assistants/Aides.','Student Counseling and Personnel Services.','Educational Administration and Supervision.','Teaching English or French as a Second or Foreign Language.','Education, Other.','Educational Assessment, Evaluation, and Research.','Education, General.','Special Education and Teaching.','Teacher Education and Professional Development, Specific Levels and Methods.','Teacher Education and Professional Development, Specific Subject Areas.'],
    'Engineering': ['Ceramic Sciences and Engineering.','Electromechanical Engineering.','Biochemical Engineering.','Engineering Chemistry.','Mechatronics, Robotics, and Automation Engineering.','Engineering Mechanics.','Engineering Physics.','Engineering-Related Fields.','Engineering Science.','Petroleum Engineering.','Metallurgical Engineering.','Engineering, Other.','Industrial Engineering.','Chemical Engineering.','Aerospace, Aeronautical and Astronautical Engineering.','Engineering, General.','Electrical, Electronics and Communications Engineering.','Civil Engineering.','Mechanical Engineering.','Materials Engineering'],
    'Engineering Technologies': ['Civil Engineering Technologies/Technicians.','Engineering-Related Technologies.','Engineering Technologies/Technicians, Other.','Engineering Technology, General.','Electrical Engineering Technologies/Technicians.','Mechanical Engineering Related Technologies/Technicians.','Construction Engineering Technologies.'],
    'Linguistics and Foreign Languages': ['Iranian/Persian Languages, Literatures, and Linguistics.','Turkic, Uralic-Altaic, Caucasian, and Central Asian Languages, Literatures, and Linguistics.','African Languages, Literatures, and Linguistics.','Celtic Languages, Literatures, and Linguistics.','South Asian Languages, Literatures, and Linguistics.','Middle/Near Eastern and Semitic Languages, Literatures, and Linguistics.','American Sign Language.','Slavic, Baltic and Albanian Languages, Literatures, and Linguistics.','Foreign Languages, Literatures, and Linguistics, Other.','American Indian/Native American Languages, Literatures, and Linguistics.','East Asian Languages, Literatures, and Linguistics.','Germanic Languages, Literatures, and Linguistics.','Modern Greek Language and Literature.','Southeast Asian and Australasian/Pacific Languages, Literatures, and Linguistics.','Linguistic, Comparative, and Related Language Studies and Services.','Romance Languages, Literatures, and Linguistics.'],
    'Family and Consumer Sciences': ['Work and Family Studies.','Family and Consumer Sciences/Human Sciences Business Services.','Family and Consumer Sciences/Human Sciences, General.','Family and Consumer Sciences/Human Sciences, Other.','Hospitality Administration/Management.','Family and Consumer Economics and Related Studies.'],
    'Law': ['Law.','Legal Professions and Studies, Other.','Legal Research and Advanced Professional Studies.','Legal Support Services.','Non-Professional General Legal Studies (Undergraduate).'],
    'English Language, Literature, and Composition': ['Creative Writing.','Publishing.','English Language and Literature/Letters, Other.','Literature.','Classics and Classical Languages, Literatures, and Linguistics.','English Language and Literature, General.','Journalism.'],
    'Liberal Arts and Humanities': ['Liberal Arts and Sciences, General Studies and Humanities.'],
    'Library Science': ['Library Science and Administration.','Library Science, Other.'],
    'Biology and Life Sciences': ['Neuroscience.','Nanotechnology.','Biology Technician/Biotechnology Laboratory Technician.','Veterinary Medicine.','Maritime Studies.','Marine Sciences.','Pharmacology and Toxicology.','Human Biology.','Veterinary Biomedical and Clinical Sciences.','Atmospheric Sciences and Meteorology.','Biomathematics, Bioinformatics, and Computational Biology.','Cell/Cellular Biology and Anatomical Sciences.','Biological and Physical Sciences.','Biochemistry, Biophysics and Molecular Biology.','Zoology/Animal Biology.','Veterinary/Animal Health Technologies/Technicians.','Microbiological Sciences and Immunology.','Foods, Nutrition, and Related Services.','Ecology, Evolution, Systematics, and Population Biology.','Neurobiology and Neurosciences.','Genetics.','Animal Sciences.','Plant Sciences.','Food Science and Technology.','Chemistry.','Biology, General.','Biomedical/Medical Engineering.'],
    'Mathematics and Statistics': ['Mathematics and Statistics, Other.','Mathematics and Computer Science.','Physics and Astronomy.','Statistics.','Mathematics.','Physics.','Astronomy and Astrophysics.','Applied Mathematics.'],
    'Military Technologies': ['Military Science and Operational Studies.','Military Technologies and Applied Sciences, Other.','Air Force ROTC, Air Science and Operations.','Army ROTC, Military Science and Operations.','Intelligence, Command Control and Information Operations.','Naval Architecture and Marine Engineering.','Military Systems and Maintenance Technology.','Military Applied Sciences.','Security Science and Technology.'],
    'Interdisciplinary and Multi-Disciplinary Studies (General)': ['International and Comparative Education.','Systems Science and Theory.','Intercultural/Multicultural and Diversity Studies.','International/Global Studies.','Multi-/Interdisciplinary Studies, General.','Multi/Interdisciplinary Studies, Other.','Area Studies.'],
    'Physical Fitness, Parks, Recreation, and Leisure': ['Parks, Recreation and Leisure Facilities Management.','Movement and Mind-Body Therapies and Education.','Leisure and Recreational Activities.','Housing and Human Environments.','Landscape Architecture.','Outdoor Education.','Parks, Recreation, Leisure, and Fitness Studies, Other.','Health and Physical Education/Fitness.','Parks, Recreation and Leisure Studies.'],
    'Philosophy and Religious Studies': ['Philosophy and Religious Studies, Other.','Philosophy and Religious Studies, General.','Religious Education.','Philosophy.','Bioethics/Medical Ethics.','Religious/Sacred Music.'],
    'Theology and Religious Vocations': ['Theology and Religious Vocations, Other.','Theological and Ministerial Studies.','Missions/Missionary Studies and Missiology.','Religion/Religious Studies.','Bible/Biblical Studies.','Pastoral Counseling and Specialized Ministries.'],
    'Physical Sciences': ['Somatic Bodywork and Related Therapeutic Services.','Energy and Biologically Based Therapies.','Physical Science Technologies/Technicians.','Physiology, Pathology and Related Sciences.','Natural Sciences.','Physical Sciences.','Physical Sciences, Other.'],
    'Nuclear, Industrial Radiology, and Biological Technologies': ['Nuclear and Industrial Radiologic Technologies/Technicians.','Nuclear Engineering.','Nuclear Engineering Technologies/Technicians.','Science Technologies/Technicians, Other.','Electromechanical Instrumentation and Maintenance Technologies/Technicians.'],
    'Psychology': ['Social Psychology.','Interpersonal and Social Skills.','Cognitive Science.','Biopsychology.','Research and Experimental Psychology.','Psychology, Other.','Clinical, Counseling and Applied Psychology.','Behavioral Sciences.','Clinical Psychology.','Human Development, Family Studies, and Related Services.'],
    'Criminal Justice and Fire Protection': ['Homeland Security.','Homeland Security, Law Enforcement, Firefighting and Related Protective Services, Other.','International Relations and National Security Studies.','Fire Protection.','Criminal Justice and Corrections.','Criminology.'],
    'Public Affairs, Policy, and Social Work': ['Security Policy and Strategy.','Taxation.','Citizenship Activities.','Peace Studies and Conflict Resolution.','Human Services, General.','Community Organization and Advocacy.','Mental and Social Health Services and Allied Professions.','Public Policy Analysis.','Public Administration and Social Service Professions, Other.','Public Administration.','Economics.','Rehabilitation and Therapeutic Professions.','City/Urban, Community and Regional Planning.','Social Work.','Political Science and Government.'],
    'Social Sciences': ['Dispute Resolution.','Sociology and Anthropology.','Rural Sociology.','Social Sciences, General.','Communication Disorders Sciences and Services.','Human Development, Family Studies, and Related Services.','Sociology.','Psychology, General.','Ethnic, Cultural Minority, Gender, and Group Studies.','Anthropology.','Social Sciences, Other.'],
    'Construction Services': ['Carpenters.','Mason/Masonry.','Construction Trades, Other.','Construction Trades, General.','Woodworking.','Electrical and Power Transmission Installers.','Construction Management.','Building/Construction Finishing, Management, and Inspection.','Architectural Engineering Technologies/Technicians.','Heating, Air Conditioning, Ventilation and Refrigeration Maintenance Technology/Technician (HAC, HACR, HVAC, HVACR).','Construction Engineering.'],
    'Electrical and Mechanic Repairs and Technologies': ['Heavy/Industrial Equipment Maintenance Technologies.','Vehicle Maintenance and Repair Technologies.','Electrical/Electronics Maintenance and Repair Technology.','Science Technologies/Technicians, General.','Energy Systems Technologies/Technicians.'],
    'Precision Production and Industrial Arts': ['Paper Science and Engineering.','Precision Metal Working.','Materials Sciences.','Systems Engineering.','Manufacturing Engineering.','Quality Control and Safety Technologies/Technicians.','Industrial Production Technologies/Technicians.','Polymer/Plastics Engineering.','Apparel and Textiles.','Textile Sciences and Engineering.'],
    'Transportation Sciences and Technologies': ['Mining and Petroleum Technologies/Technicians.','Marine Transportation.','Air Transportation.','Transportation and Materials Moving, Other.'],
    'Fine Arts': ['Crafts/Craft Design, Folk Art and Artisanry.','Visual and Performing Arts, Other.','Film/Video and Photographic Arts.','Visual and Performing Arts, General.','Design and Applied Arts.','Dance.','Rhetoric and Composition/Writing Studies.','Fine and Studio Arts.','Music.','Drama/Theatre Arts and Stagecraft.'],
    'Medical and Health Sciences and Services': ['Medical Clinical Sciences/Graduate Medical Studies.','Dentistry.','Alternative and Complementary Medical Support Services.','Optometry.','Health-Related Knowledge and Skills.','Funeral Service and Mortuary Science.','Gerontology.','Ophthalmic and Optometric Support Services and Allied Professions.','Alternative and Complementary Medicine and Medical Systems.','Chiropractic.','Podiatric Medicine/Podiatry.','Advanced/Graduate Dentistry and Oral Sciences.','Alternative and Complementary Medicine and Medical Systems.''Biological and Biomedical Sciences, Other.','Practical Nursing, Vocational Nursing and Nursing Assistants.','Pharmacy, Pharmaceutical Sciences, and Administration.','Medicine.','Medical Illustration and Informatics.','Allied Health and Medical Assisting Services.','Dental Support Services and Allied Professions.','Health/Medical Preparatory Programs.','Biological/Biosystems Engineering.','Biotechnology.','Nursing.','Health Professions and Related Clinical Sciences, Other.','Dietetics and Clinical Nutrition Services.','Registered Nursing, Nursing Administration, Nursing Research and Clinical Nursing.','Clinical/Medical Laboratory Science/Research and Allied Professions.','Public Health.','Health Services/Allied Health/Health Sciences, General.','Health and Medical Administrative Services.','Allied Health Diagnostic, Intervention, and Treatment Professions.'],
    'Business': ['Real Estate Development.','Operations Research.','Real Estate.','Insurance.','Specialized Sales, Merchandising and  Marketing Operations.','Arts, Entertainment,and Media Management.','Business Operations Support and Assistant Services.','Management Sciences and Quantitative Methods.','Business, Management, Marketing, and Related Support Services, Other.','Business/Commerce, General.','International Business.','Agricultural Business and Management.','Human Resources Management and Services.','General Sales, Merchandising and Related Marketing Operations.','Business/Managerial Economics.','Business/Corporate Communications.','Business Administration, Management and Operations.','Accounting and Related Services.','Entrepreneurial and Small Business Operations.','Finance and Financial Management Services.','Marketing.'],
    'History': ['History.','Holocaust and Related Studies.','Architectural History and Criticism.']
})

TypeError: 'Series' object is not callable

In [10]:
def categorize_major(column):
    if column in ['Botany/Plant Biology.','Agricultural Engineering.','Applied Horticulture and Horticultural Business Services.','Agriculture/Veterinary Preparatory Programs.','Soil Sciences.','Agriculture, General.', 'Agriculture, Agriculture Operations, and Related Sciences, Other.', 'Agricultural Production Operations.', 'Agricultural and Domestic Animal Services.','Agricultural Public Services.','Agricultural Mechanization.','International Agriculture.','Agricultural and Food Products Processing.']:
        return "Agriculture"
    elif column in ['Forest Engineering.','Environmental Control Technologies/Technicians.','Archeology.','Geological/Geophysical Engineering.','Mining and Mineral Engineering.','Natural Resources and Conservation, Other.','Fishing and Fisheries Sciences and Management.','Sustainability Studies.','Historic Preservation and Conservation.','Surveying Engineering.','Ocean Engineering.','Geography and Cartography.','Wildlife and Wildlands Science and Management.','Natural Resources Management and Policy.','Geological and Earth Sciences/Geosciences.','Environmental/Environmental Health Engineering.','Natural Resources Conservation and Research.','Forestry.']:
        return "Environment and Natural Resources"
    elif column in ['Drafting/Design Engineering Technologies/Technicians.','Architecture and Related Services, Other.','Architectural Sciences and Technology.','Interior Architecture.','Architectural Engineering.','Architecture.','Environmental Design.']:
        return "Architecture"
    elif column in ['Demography and Population Studies.','Medieval and Renaissance Studies.','Classical and Ancient Studies.','Bilingual, Multilingual, and Multicultural Education.','Museology/Museum Studies.','Science, Technology and Society.','Urban Studies/Affairs.','Cultural Studies/Critical Theory and Analysis.','African Languages, Literatures, and Linguistics.', 'Turkic, Uralic-Altaic, Caucasian, and Central Asian Languages, Literatures, and Linguistics.']:
        return "Area, Ethnic, and Civilization Studies"
    elif column in ['Telecommunications Management.','Communication, Journalism, and Related Programs, Other.','Communication and Media Studies.','Public Relations, Advertising, and Applied Communication.']:
        return "Communications"
    elif column in ['Communications Technologies/Technicians and Support Services, Other.','Educational/Instructional Media Design.','Graphic Communications.','Communications Technology/Technician.','Audiovisual Communications Technologies/Technicians.','Radio, Television, and Digital Communication.']:
        return "Communication Technologies"
    elif column in ['Accounting and Computer Science.','Human Computer Interaction.','Data Processing.','Computational Science.','Computer Software and Media Applications.','Computer and Information Sciences and Support Services, Other.','Computer Engineering Technologies/Technicians.','Computer Systems Analysis.','Computer Systems Networking and Telecommunications.','Computer Programming.','Computer/Information Technology Administration and Management.','Computer Science.','Information Science/Studies.','Computer Engineering.','Computer and Information Sciences, General.','Management Information Systems and Services.']:
        return "Computer and Information Sciences"
    elif column in ['Personal and Culinary Services, Other.','Cosmetology and Related Personal Grooming Services.','Nutrition Sciences.','Culinary Arts and Related Services.']:
        return "Cosmetology Services and Culinary Arts"
    elif column in ['Basic Skills and Developmental/Remedial Education.','Curriculum and Instruction.','High School/Secondary Diploma Programs.','High School/Secondary Certificate Programs.','Social and Philosophical Foundations of Education.','Teaching Assistants/Aides.','Student Counseling and Personnel Services.','Educational Administration and Supervision.','Teaching English or French as a Second or Foreign Language.','Education, Other.','Educational Assessment, Evaluation, and Research.','Education, General.','Special Education and Teaching.','Teacher Education and Professional Development, Specific Levels and Methods.','Teacher Education and Professional Development, Specific Subject Areas.']:
        return "Education Administration and Teaching"
    elif column in ['Ceramic Sciences and Engineering.','Electromechanical Engineering.','Biochemical Engineering.','Engineering Chemistry.','Mechatronics, Robotics, and Automation Engineering.','Engineering Mechanics.','Engineering Physics.','Engineering-Related Fields.','Engineering Science.','Petroleum Engineering.','Metallurgical Engineering.','Engineering, Other.','Industrial Engineering.','Chemical Engineering.','Aerospace, Aeronautical and Astronautical Engineering.','Engineering, General.','Electrical, Electronics and Communications Engineering.','Civil Engineering.','Mechanical Engineering.','Materials Engineering']:
        return "Engineering"
    elif column in ['Civil Engineering Technologies/Technicians.','Engineering-Related Technologies.','Engineering Technologies/Technicians, Other.','Engineering Technology, General.','Electrical Engineering Technologies/Technicians.','Mechanical Engineering Related Technologies/Technicians.','Construction Engineering Technologies.']:
        return "Engineering Technologies"
    elif column in ['Iranian/Persian Languages, Literatures, and Linguistics.','Turkic, Uralic-Altaic, Caucasian, and Central Asian Languages, Literatures, and Linguistics.','African Languages, Literatures, and Linguistics.','Celtic Languages, Literatures, and Linguistics.','South Asian Languages, Literatures, and Linguistics.','Middle/Near Eastern and Semitic Languages, Literatures, and Linguistics.','American Sign Language.','Slavic, Baltic and Albanian Languages, Literatures, and Linguistics.','Foreign Languages, Literatures, and Linguistics, Other.','American Indian/Native American Languages, Literatures, and Linguistics.','East Asian Languages, Literatures, and Linguistics.','Germanic Languages, Literatures, and Linguistics.','Modern Greek Language and Literature.','Southeast Asian and Australasian/Pacific Languages, Literatures, and Linguistics.','Linguistic, Comparative, and Related Language Studies and Services.','Romance Languages, Literatures, and Linguistics.']:
        return "Linguistics and Foreign Languages"
    elif column in ['Work and Family Studies.','Family and Consumer Sciences/Human Sciences Business Services.','Family and Consumer Sciences/Human Sciences, General.','Family and Consumer Sciences/Human Sciences, Other.','Hospitality Administration/Management.','Family and Consumer Economics and Related Studies.']:
        return "Family and Consumer Sciences"
    elif column in ['Law.','Legal Professions and Studies, Other.','Legal Research and Advanced Professional Studies.','Legal Support Services.','Non-Professional General Legal Studies (Undergraduate).']:
        return "Law"
    elif column in ['Creative Writing.','Publishing.','English Language and Literature/Letters, Other.','Literature.','Classics and Classical Languages, Literatures, and Linguistics.','English Language and Literature, General.','Journalism.']:
        return "English Language, Literature, and Composition"
    elif column in ['Liberal Arts and Sciences, General Studies and Humanities.']:
        return "Liberal Arts and Humanities"
    elif column in ['Library Science and Administration.','Library Science, Other.']:
        return "Library Science"
    elif column in ['Neuroscience.','Nanotechnology.','Biology Technician/Biotechnology Laboratory Technician.','Veterinary Medicine.','Maritime Studies.','Marine Sciences.','Pharmacology and Toxicology.','Human Biology.','Veterinary Biomedical and Clinical Sciences.','Atmospheric Sciences and Meteorology.','Biomathematics, Bioinformatics, and Computational Biology.','Cell/Cellular Biology and Anatomical Sciences.','Biological and Physical Sciences.','Biochemistry, Biophysics and Molecular Biology.','Zoology/Animal Biology.','Veterinary/Animal Health Technologies/Technicians.','Microbiological Sciences and Immunology.','Foods, Nutrition, and Related Services.','Ecology, Evolution, Systematics, and Population Biology.','Neurobiology and Neurosciences.','Genetics.','Animal Sciences.','Plant Sciences.','Food Science and Technology.','Chemistry.','Biology, General.','Biomedical/Medical Engineering.']:
        return "Biology and Life Sciences"
    elif column in ['Mathematics and Statistics, Other.','Mathematics and Computer Science.','Physics and Astronomy.','Statistics.','Mathematics.','Physics.','Astronomy and Astrophysics.','Applied Mathematics.']:
        return "Mathematics and Statistics"
    elif column in ['Military Science and Operational Studies.','Military Technologies and Applied Sciences, Other.','Air Force ROTC, Air Science and Operations.','Army ROTC, Military Science and Operations.','Intelligence, Command Control and Information Operations.','Naval Architecture and Marine Engineering.','Military Systems and Maintenance Technology.','Military Applied Sciences.','Security Science and Technology.']:
        return "Military Technologies"
    elif column in ['International and Comparative Education.','Systems Science and Theory.','Intercultural/Multicultural and Diversity Studies.','International/Global Studies.','Multi-/Interdisciplinary Studies, General.','Multi/Interdisciplinary Studies, Other.','Area Studies.']:
        return "Interdisciplinary and Multi-Disciplinary Studies (General)"
    elif column in ['Parks, Recreation and Leisure Facilities Management.','Movement and Mind-Body Therapies and Education.','Leisure and Recreational Activities.','Housing and Human Environments.','Landscape Architecture.','Outdoor Education.','Parks, Recreation, Leisure, and Fitness Studies, Other.','Health and Physical Education/Fitness.','Parks, Recreation and Leisure Studies.']:
        return "Physical Fitness, Parks, Recreation, and Leisure"
    elif column in ['Philosophy and Religious Studies, Other.','Philosophy and Religious Studies, General.','Religious Education.','Philosophy.','Bioethics/Medical Ethics.','Religious/Sacred Music.']:
        return "Philosophy and Religious Studies"
    elif column in ['Theology and Religious Vocations, Other.','Theological and Ministerial Studies.','Missions/Missionary Studies and Missiology.','Religion/Religious Studies.','Bible/Biblical Studies.','Pastoral Counseling and Specialized Ministries.']:
        return "Theology and Religious Vocations"
    elif column in ['Somatic Bodywork and Related Therapeutic Services.','Energy and Biologically Based Therapies.','Physical Science Technologies/Technicians.','Physiology, Pathology and Related Sciences.','Natural Sciences.','Physical Sciences.','Physical Sciences, Other.']:
        return "Physical Sciences"
    elif column in ['Nuclear and Industrial Radiologic Technologies/Technicians.','Nuclear Engineering.','Nuclear Engineering Technologies/Technicians.','Science Technologies/Technicians, Other.','Electromechanical Instrumentation and Maintenance Technologies/Technicians.']:
        return "Nuclear, Industrial Radiology, and Biological Technologies"
    elif column in ['Social Psychology.','Interpersonal and Social Skills.','Cognitive Science.','Biopsychology.','Research and Experimental Psychology.','Psychology, Other.','Clinical, Counseling and Applied Psychology.','Behavioral Sciences.','Clinical Psychology.','Human Development, Family Studies, and Related Services.']:
        return "Psychology"
    elif column in ['Homeland Security.','Homeland Security, Law Enforcement, Firefighting and Related Protective Services, Other.','International Relations and National Security Studies.','Fire Protection.','Criminal Justice and Corrections.','Criminology.']:
        return "Criminal Justice and Fire Protection"
    elif column in ['Security Policy and Strategy.','Taxation.','Citizenship Activities.','Peace Studies and Conflict Resolution.','Human Services, General.','Community Organization and Advocacy.','Mental and Social Health Services and Allied Professions.','Public Policy Analysis.','Public Administration and Social Service Professions, Other.','Public Administration.','Economics.','Rehabilitation and Therapeutic Professions.','City/Urban, Community and Regional Planning.','Social Work.','Political Science and Government.']:
        return "Public Affairs, Policy, and Social Work"
    elif column in ['Dispute Resolution.','Sociology and Anthropology.','Rural Sociology.','Social Sciences, General.','Communication Disorders Sciences and Services.','Human Development, Family Studies, and Related Services.','Sociology.','Psychology, General.','Ethnic, Cultural Minority, Gender, and Group Studies.','Anthropology.','Social Sciences, Other.']:
        return "Social Sciences"
    elif column in ['Carpenters.','Mason/Masonry.','Construction Trades, Other.','Construction Trades, General.','Woodworking.','Electrical and Power Transmission Installers.','Construction Management.','Building/Construction Finishing, Management, and Inspection.','Architectural Engineering Technologies/Technicians.','Heating, Air Conditioning, Ventilation and Refrigeration Maintenance Technology/Technician (HAC, HACR, HVAC, HVACR).','Construction Engineering.']:
        return "Construction Services"
    elif column in ['Heavy/Industrial Equipment Maintenance Technologies.','Vehicle Maintenance and Repair Technologies.','Electrical/Electronics Maintenance and Repair Technology.','Science Technologies/Technicians, General.','Energy Systems Technologies/Technicians.']:
        return "Electrical and Mechanic Repairs and Technologies"
    elif column in ['Paper Science and Engineering.','Precision Metal Working.','Materials Sciences.','Systems Engineering.','Manufacturing Engineering.','Quality Control and Safety Technologies/Technicians.','Industrial Production Technologies/Technicians.','Polymer/Plastics Engineering.','Apparel and Textiles.','Textile Sciences and Engineering.']:
        return "Precision Production and Industrial Arts"
    elif column in ['Mining and Petroleum Technologies/Technicians.','Marine Transportation.','Air Transportation.','Transportation and Materials Moving, Other.']:
        return "Transportation Sciences and Technologies"
    elif column in ['Crafts/Craft Design, Folk Art and Artisanry.','Visual and Performing Arts, Other.','Film/Video and Photographic Arts.','Visual and Performing Arts, General.','Design and Applied Arts.','Dance.','Rhetoric and Composition/Writing Studies.','Fine and Studio Arts.','Music.','Drama/Theatre Arts and Stagecraft.']:
        return "Fine Arts"
    elif column in ['Medical Clinical Sciences/Graduate Medical Studies.','Dentistry.','Alternative and Complementary Medical Support Services.','Optometry.','Health-Related Knowledge and Skills.','Funeral Service and Mortuary Science.','Gerontology.','Ophthalmic and Optometric Support Services and Allied Professions.','Alternative and Complementary Medicine and Medical Systems.','Chiropractic.','Podiatric Medicine/Podiatry.','Advanced/Graduate Dentistry and Oral Sciences.','Alternative and Complementary Medicine and Medical Systems.''Biological and Biomedical Sciences, Other.','Practical Nursing, Vocational Nursing and Nursing Assistants.','Pharmacy, Pharmaceutical Sciences, and Administration.','Medicine.','Medical Illustration and Informatics.','Allied Health and Medical Assisting Services.','Dental Support Services and Allied Professions.','Health/Medical Preparatory Programs.','Biological/Biosystems Engineering.','Biotechnology.','Nursing.','Health Professions and Related Clinical Sciences, Other.','Dietetics and Clinical Nutrition Services.','Registered Nursing, Nursing Administration, Nursing Research and Clinical Nursing.','Clinical/Medical Laboratory Science/Research and Allied Professions.','Public Health.','Health Services/Allied Health/Health Sciences, General.','Health and Medical Administrative Services.','Allied Health Diagnostic, Intervention, and Treatment Professions.']:
        return "Medical and Health Sciences and Services"
    elif column in ['Real Estate Development.','Operations Research.','Real Estate.','Insurance.','Specialized Sales, Merchandising and  Marketing Operations.','Arts, Entertainment,and Media Management.','Business Operations Support and Assistant Services.','Management Sciences and Quantitative Methods.','Business, Management, Marketing, and Related Support Services, Other.','Business/Commerce, General.','International Business.','Agricultural Business and Management.','Human Resources Management and Services.','General Sales, Merchandising and Related Marketing Operations.','Business/Managerial Economics.','Business/Corporate Communications.','Business Administration, Management and Operations.','Accounting and Related Services.','Entrepreneurial and Small Business Operations.','Finance and Financial Management Services.','Marketing.']:
        return "Business"
    elif column in ['History.','Holocaust and Related Studies.','Architectural History and Criticism.']:
        return "History"
    else:
        return "None"


In [11]:
new_df['major_category'] = new_df.major_name.apply(categorize_major)

In [22]:
new_df.head()

Unnamed: 0.1,Unnamed: 0,unit_id_institution,college_name,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,deg_percent_awarded_construction_trades,deg_percent_awarded_mechanic_repair,deg_percent_awarded_precision_production,deg_percent_awarded_transportation_materials,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking,major_category
0,0,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Agriculture
1,1,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Biology and Life Sciences
2,2,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Biology and Life Sciences
3,5,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Biology and Life Sciences
4,6,100654.0,Alabama A & M University,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Agriculture


In [23]:
new_df.shape

(71901, 102)

In [12]:
earnings_df = pd.read_csv('earnings_df.csv')

In [13]:
df_full = new_df.merge(earnings_df, how = 'inner', on='major_category')

: 

: 

In [None]:
df_full.head()

In [None]:
df_full.info()

# __________________________________________________________________________ #
# Initial Explore Work

**Note:**\
 Moved to end of this working notebook in order to clean explore_workbook

In [None]:
df = acquire.get_bach_df()
df = prepare.clean_college_df(df)

dataframe shape: (71901, 119)


In [None]:
df['major_category'] = df.major_name.apply(prepare.categorize_major)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71901 entries, 0 to 224838
Columns: 120 entries, unit_id_institution to major_category
dtypes: float64(85), int64(2), object(33)
memory usage: 66.4+ MB


In [None]:
df.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,deg_percent_awarded_construction_trades,deg_percent_awarded_mechanic_repair,deg_percent_awarded_precision_production,deg_percent_awarded_transportation_materials,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking,major_category
0,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Agriculture
1,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Biology and Life Sciences
2,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Biology and Life Sciences
5,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Biology and Life Sciences
6,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.16,0.0,153.0,Agriculture


# _______________________________________________________________________________________________ #
### Initial `earnings_df`

In [None]:
earnings_df = pd.read_csv('earnings_df.csv')

In [None]:
earnings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707462 entries, 0 to 707461
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   Unnamed: 0                707462 non-null  int64 
 1   earning_year              707462 non-null  int64 
 2   earnings_degree           707462 non-null  int64 
 3   earnings_school_type      707462 non-null  int64 
 4   earnings_major            707462 non-null  int64 
 5   earnings_wage/salary      707462 non-null  int64 
 6   EMPSTAT                   707462 non-null  int64 
 7   METRO                     707462 non-null  int64 
 8   SEX                       707462 non-null  int64 
 9   AGE                       707462 non-null  int64 
 10  earnings_race             707462 non-null  int64 
 11  earnings_speaks_english   707462 non-null  int64 
 12  LANGUAGE                  707462 non-null  int64 
 13  earnings_specific_degree  707462 non-null  int64 
 14  stat

In [None]:
earnings_df.head()

Unnamed: 0.1,Unnamed: 0,earning_year,earnings_degree,earnings_school_type,earnings_major,earnings_wage/salary,EMPSTAT,METRO,SEX,AGE,earnings_race,earnings_speaks_english,LANGUAGE,earnings_specific_degree,state_post_code,major_category
0,2762990,2017,101,1,61,38500,1,0,2,31,1,3,1,6100,AL,Medical and Health Sciences and Services
1,2763006,2017,101,1,62,120000,1,4,1,30,2,3,1,6203,AL,Business
2,2763007,2017,101,1,40,50000,1,4,1,26,2,3,1,4002,AL,Interdisciplinary and Multi-Disciplinary Studi...
3,2763029,2017,101,1,61,65000,1,4,2,49,1,3,1,6107,AL,Medical and Health Sciences and Services
4,2763031,2017,101,1,33,42000,1,4,2,34,1,3,1,3301,AL,"English Language, Literature, and Composition"


# _______________________________________________________________________________________________ #
### `earnings_df` pivot table to merge with main df
Newly created pivot table with median earnings by major_category, split by year

In [None]:
earnings_pivot_merge = pd.read_csv('2017_2018_2019_earning_by_major.csv')

In [None]:
earnings_pivot_merge

Unnamed: 0,major_category,2017,2018,2019,Grand Total
0,Agriculture,57605.69,55517.87,61388.93,174512.49
1,Architecture,68643.59,71344.26,75609.81,215597.67
2,"Area, Ethnic, and Civilization Studies",53999.93,56155.89,60997.54,171153.36
3,Biology and Life Sciences,48851.91,50004.54,53463.29,152319.74
4,Business,74813.94,76724.07,79608.39,231146.4
5,Communication Technologies,50630.58,53303.0,56882.68,160816.25
6,Communications,61311.56,63458.16,66997.75,191767.47
7,Computer and Information Sciences,83482.41,87552.61,91321.98,262357.0
8,Construction Services,85101.29,85776.3,91583.25,262460.84
9,Cosmetology Services and Culinary Arts,42217.78,45696.22,48408.92,136322.92


In [None]:
new_df = df.merge(earnings_pivot_merge, how='inner', on='major_category')

In [None]:
new_df.shape

(71422, 125)

In [None]:
new_df.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,pred_degree_0and4,...,deg_percent_awarded_visual_and_performing_arts,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history,non_deg_seeking,major_category,2017,2018,2019,Grand Total
0,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.02,0.0,0.16,0.0,153.0,Agriculture,57605.69,55517.87,61388.93,174512.49
1,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,3.0,...,0.02,0.0,0.16,0.0,153.0,Agriculture,57605.69,55517.87,61388.93,174512.49
2,100858.0,Auburn University,Public,AL,36849,Auburn,5.0,1.0,3.0,3.0,...,0.03,0.06,0.23,0.01,481.0,Agriculture,57605.69,55517.87,61388.93,174512.49
3,100858.0,Auburn University,Public,AL,36849,Auburn,5.0,1.0,3.0,3.0,...,0.03,0.06,0.23,0.01,481.0,Agriculture,57605.69,55517.87,61388.93,174512.49
4,101541.0,Judson College,"Private, nonprofit",AL,36756,Marion,5.0,1.0,3.0,3.0,...,0.07,0.24,0.1,0.04,9.0,Agriculture,57605.69,55517.87,61388.93,174512.49


In [None]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71422 entries, 0 to 71421
Columns: 124 entries, unit_id_institution to Grand Total
dtypes: float64(89), int64(2), object(33)
memory usage: 68.1+ MB


In [None]:
# new_df.to_csv('merged_df.csv')

In [None]:
new_df.avg_net_price_public.isnull().sum()

42789

In [None]:
71422 - 42789

28633

In [None]:
new_df.avg_net_price_private.isnull().sum()

33337

In [None]:
new_df.avg_net_price_public.value_counts()

15,020.00    153
15,970.00    140
30,996.00    138
12,566.00    136
17,704.00    125
            ... 
 6,526.00      1
 4,646.00      1
 8,324.00      1
 3,411.00      1
 4,433.00      1
Name: avg_net_price_public, Length: 759, dtype: int64

In [None]:
price_investigation = new_df.groupby(new_df['institution_control'])['avg_net_price_public'].value_counts()

In [None]:
priv_price_investigation = new_df.groupby(new_df['institution_control'])['avg_net_price_private'].value_counts()

In [None]:
price_investigation.info()

<class 'pandas.core.series.Series'>
MultiIndex: 759 entries, ('Public', 15020.0) to ('Public', 44661.0)
Series name: avg_net_price_public
Non-Null Count  Dtype
--------------  -----
759 non-null    int64
dtypes: int64(1)
memory usage: 14.2+ KB


In [None]:
priv_price_investigation.info()

<class 'pandas.core.series.Series'>
MultiIndex: 1547 entries, ('Private, for-profit', 21197.0) to ('Public', 27000.0)
Series name: avg_net_price_private
Non-Null Count  Dtype
--------------  -----
1547 non-null   int64
dtypes: int64(1)
memory usage: 28.8+ KB


In [None]:
price_investigation.to_csv('price_investigation.csv')

In [None]:
priv_price_investigation.to_csv('priv_price_investigation.csv')

In [None]:
# new_df.groupby(new_df['institution_control']=='Public')['avg_net_price_private'].value_counts()

In [None]:
new_df.avg_net_price_private

0                        NaN
1                        NaN
2                        NaN
3                        NaN
4                  16,619.00
                ...         
71417                    NaN
71418                    NaN
71419              29,386.00
71420              18,182.00
71421                    NaN
Name: avg_net_price_private, Length: 71422, dtype: float64

In [None]:
new_df.avg_net_price_program.isnull().sum()

71422

In [None]:
new_df.avg_net_price_other.isnull().sum()

71422

In [None]:
new_df.avg_net_price_public

0                  14,444.00
1                  14,444.00
2                  23,696.00
3                  23,696.00
4                        NaN
                ...         
71417              19,836.00
71418              22,282.00
71419                    NaN
71420                    NaN
71421              12,566.00
Name: avg_net_price_public, Length: 71422, dtype: float64

In [None]:
# new_df.replace(to_replace=new_df['avg_net_price_public'] == 'NaN', value=None)

In [None]:
new_df.avg_net_price_public

0                  14,444.00
1                  14,444.00
2                  23,696.00
3                  23,696.00
4                        NaN
                ...         
71417              19,836.00
71418              22,282.00
71419                    NaN
71420                    NaN
71421              12,566.00
Name: avg_net_price_public, Length: 71422, dtype: float64

For sake of expedition, we are not completing individual application of joining avg_net_price columns.
Instead, Chenchen is uploading her new csv file with appropriate changes to the Google Drive so we can press forward with exploration.

In [None]:
explore_df = pd.read_csv('merged_df_with_avg_price.csv')

In [None]:
explore_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71422 entries, 0 to 71421
Columns: 126 entries, Unnamed: 0 to avg_net_price
dtypes: float64(115), int64(3), object(8)
memory usage: 68.7+ MB


In [None]:
explore_df.head()

Unnamed: 0.1,Unnamed: 0,unit_id_institution,college_name,institution_control,state_post_code,zip_code,city,region_ipeds,title_IV_eligibility,pred_degree,...,deg_percent_awarded_health,deg_percent_awarded_business_management,deg_percent_awarded_history.1,non_deg_seeking,major_category,2017,2018,2019,Grand Total,avg_net_price
0,0,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,...,0.0,0.16,0.0,153.0,Agriculture,57605.69,55517.87,61388.93,174512.49,14444.0
1,1,100654.0,Alabama A & M University,Public,AL,35762,Normal,5.0,1.0,3.0,...,0.0,0.16,0.0,153.0,Agriculture,57605.69,55517.87,61388.93,174512.49,14444.0
2,2,100858.0,Auburn University,Public,AL,36849,Auburn,5.0,1.0,3.0,...,0.06,0.23,0.01,481.0,Agriculture,57605.69,55517.87,61388.93,174512.49,23696.0
3,3,100858.0,Auburn University,Public,AL,36849,Auburn,5.0,1.0,3.0,...,0.06,0.23,0.01,481.0,Agriculture,57605.69,55517.87,61388.93,174512.49,23696.0
4,4,101541.0,Judson College,"Private, nonprofit",AL,36756,Marion,5.0,1.0,3.0,...,0.24,0.1,0.04,9.0,Agriculture,57605.69,55517.87,61388.93,174512.49,16619.0


In [None]:
explore_df['deg_percent_awarded_history.1'].sum()

866.0209999999998

In [None]:
explore_df.groupby(explore_df['unit_id_institution'])['deg_percent_awarded_history.1'].sum()

unit_id_institution
100,654.00                   0.00
100,663.00                   0.46
100,690.00                   0.00
100,706.00                   0.41
100,724.00                   0.14
                     ...         
494,287.00                   0.00
494,630.00                   0.00
494,685.00                   0.00
494,737.00                   0.00
494,807.00                   0.00
Name: deg_percent_awarded_history.1, Length: 2706, dtype: float64

In [None]:
explore_df.deg_percent_awarded_engineering

0                       0.12
1                       0.12
2                       0.19
3                       0.19
4                       0.00
                ...         
71417                   0.00
71418                   0.00
71419                   0.00
71420                   0.00
71421                   0.10
Name: deg_percent_awarded_engineering, Length: 71422, dtype: float64

In [None]:
explore_df.groupby(explore_df['college_name'])['deg_percent_awarded_engineering'].agg([sum, 'mean', 'median'], axis=1)

Unnamed: 0_level_0,sum,mean,median
college_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AI Miami International University of Art and Design,0.00,0.00,0.00
ASA College,0.00,0.00,0.00
ATA College,0.00,0.00,0.00
ATI College-Norwalk,0.00,0.00,0.00
Aarhus University,0.00,,
...,...,...,...
York College of Pennsylvania,3.02,0.06,0.06
York St John University,0.00,,
York University,0.00,,
Young Harris College,0.00,0.00,0.00


In [None]:
explore_df.college_name.nunique()

3002

In [None]:
# explore_df.to_csv('df_exploration_ready_with_ROI.csv')