In [1]:
# notebook dependencies 
%matplotlib inline
import matplotlib as mlp
mlp.rcParams['figure.dpi'] = 300

import pandas as pd
import numpy as np
import os

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# time module
from time import time

from initial_acquire import get_majors_df
import acquire
import prepare

In [2]:
# import ipums table

df = pd.read_csv("earnings_df.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,earning_year,earnings_degree,earnings_school_type,earnings_degree.1,earnings_wage/salary,EMPSTAT,METRO,SEX,AGE,earnings_race,earnings_speaks_english,LANGUAGE,earnings_specific_degree,state_post_code,major_category
0,2762990,2017,101,1,61,38500,1,0,2,31,1,3,1,6100,AL,Medical and Health Sciences and Services
1,2763006,2017,101,1,62,120000,1,4,1,30,2,3,1,6203,AL,Business
2,2763007,2017,101,1,40,50000,1,4,1,26,2,3,1,4002,AL,Interdisciplinary and Multi-Disciplinary Studi...
3,2763029,2017,101,1,61,65000,1,4,2,49,1,3,1,6107,AL,Medical and Health Sciences and Services
4,2763031,2017,101,1,33,42000,1,4,2,34,1,3,1,3301,AL,"English Language, Literature, and Composition"


In [3]:
# let's rename the first columns and cache it

df = df.rename(columns = {"Unnamed: 0": "record_id"})
df.head() # checks out!

Unnamed: 0,record_id,earning_year,earnings_degree,earnings_school_type,earnings_degree.1,earnings_wage/salary,EMPSTAT,METRO,SEX,AGE,earnings_race,earnings_speaks_english,LANGUAGE,earnings_specific_degree,state_post_code,major_category
0,2762990,2017,101,1,61,38500,1,0,2,31,1,3,1,6100,AL,Medical and Health Sciences and Services
1,2763006,2017,101,1,62,120000,1,4,1,30,2,3,1,6203,AL,Business
2,2763007,2017,101,1,40,50000,1,4,1,26,2,3,1,4002,AL,Interdisciplinary and Multi-Disciplinary Studi...
3,2763029,2017,101,1,61,65000,1,4,2,49,1,3,1,6107,AL,Medical and Health Sciences and Services
4,2763031,2017,101,1,33,42000,1,4,2,34,1,3,1,3301,AL,"English Language, Literature, and Composition"


In [4]:
# let's get the info

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707462 entries, 0 to 707461
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   record_id                 707462 non-null  int64 
 1   earning_year              707462 non-null  int64 
 2   earnings_degree           707462 non-null  int64 
 3   earnings_school_type      707462 non-null  int64 
 4   earnings_degree.1         707462 non-null  int64 
 5   earnings_wage/salary      707462 non-null  int64 
 6   EMPSTAT                   707462 non-null  int64 
 7   METRO                     707462 non-null  int64 
 8   SEX                       707462 non-null  int64 
 9   AGE                       707462 non-null  int64 
 10  earnings_race             707462 non-null  int64 
 11  earnings_speaks_english   707462 non-null  int64 
 12  LANGUAGE                  707462 non-null  int64 
 13  earnings_specific_degree  707462 non-null  int64 
 14  stat

In [5]:
# shape?

df.shape

(707462, 16)

In [6]:
# any null values?

for col in df.columns:
    print(f'column name: {df[col].name}')
    print(f'percentage of nulls: {df[col].isnull().mean()}')
    print('-------------------------------------')

column name: record_id
percentage of nulls: 0.0
-------------------------------------
column name: earning_year
percentage of nulls: 0.0
-------------------------------------
column name: earnings_degree
percentage of nulls: 0.0
-------------------------------------
column name: earnings_school_type
percentage of nulls: 0.0
-------------------------------------
column name: earnings_degree.1
percentage of nulls: 0.0
-------------------------------------
column name: earnings_wage/salary
percentage of nulls: 0.0
-------------------------------------
column name: EMPSTAT
percentage of nulls: 0.0
-------------------------------------
column name: METRO
percentage of nulls: 0.0
-------------------------------------
column name: SEX
percentage of nulls: 0.0
-------------------------------------
column name: AGE
percentage of nulls: 0.0
-------------------------------------
column name: earnings_race
percentage of nulls: 0.0
-------------------------------------
column name: earnings_speaks_

In [15]:
# what values are contained in ea. column 

for col in df.columns:
    print(f'column name: {df[col].name}')
    print(f'top 10 unique values:\n')
    print(pd.Series(df[col].value_counts().head(10)))
    print('-------------------------------------')

column name: record_id
top 10 unique values:

2762990    1
5610124    1
5610076    1
5610096    1
5610104    1
5610107    1
5610108    1
5610114    1
5610121    1
5610123    1
Name: record_id, dtype: int64
-------------------------------------
column name: earning_year
top 10 unique values:

2019    242830
2018    235452
2017    229180
Name: earning_year, dtype: int64
-------------------------------------
column name: earnings_degree
top 10 unique values:

101    707462
Name: earnings_degree, dtype: int64
-------------------------------------
column name: earnings_school_type
top 10 unique values:

1    626725
2     53144
3     27593
Name: earnings_school_type, dtype: int64
-------------------------------------
column name: earnings_degree.1
top 10 unique values:

62    165165
61     55930
24     52421
23     50062
55     48004
19     40943
60     37403
52     32400
21     31846
36     27940
Name: earnings_degree.1, dtype: int64
-------------------------------------
column name: earnin

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707462 entries, 0 to 707461
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   record_id                 707462 non-null  int64 
 1   earning_year              707462 non-null  int64 
 2   earnings_degree           707462 non-null  int64 
 3   earnings_school_type      707462 non-null  int64 
 4   earnings_degree.1         707462 non-null  int64 
 5   earnings_wage/salary      707462 non-null  int64 
 6   EMPSTAT                   707462 non-null  int64 
 7   METRO                     707462 non-null  int64 
 8   SEX                       707462 non-null  int64 
 9   AGE                       707462 non-null  int64 
 10  earnings_race             707462 non-null  int64 
 11  earnings_speaks_english   707462 non-null  int64 
 12  LANGUAGE                  707462 non-null  int64 
 13  earnings_specific_degree  707462 non-null  int64 
 14  stat

In [17]:
df["earnings_wage/salary"].nunique()

1004

In [None]:
# incorrect dtype from IPUMS dataset



- language
- sex 
- 

In [198]:
# importing bach table

bach_df = acquire.get_bach_df()
bach_df.head()

dataframe shape: (71901, 125)


Unnamed: 0,UNITID,INSTNM_x,CONTROL_x,CITY,STABBR,ZIP,PFTFTUG1_EF,PPTUG_EF,PREDDEG,REGION,...,IND_DEBT_MDN,LO_INC_DEBT_MDN,MALE_DEBT_MDN,MD_INC_DEBT_MDN,NOPELL_DEBT_MDN,NOTFIRSTGEN_DEBT_MDN,HI_INC_DEBT_MDN,GRAD_DEBT_MDN,FTFTPCTFLOAN,FTFTPCTPELL
0,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057
1,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057
2,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057
5,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057
6,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057


In [199]:
# let's clean column names/nulls

bach_df = prepare.clean_college_df(bach_df)
bach_df.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,city,state_post_code,zip_code,share_entering_students_first_ft,share_of_part_time,pred_degree,religion_ipeds,...,median_debt_independent,median_debt_0_30000,median_debt_male,median_debt_30001_75000,median_debt_non_pell,median_debt_non_first_generation,median_debt_75001+,median_debt_completed,fedral_loan_full_time_first_time_undergraduate,pell_grant_full_time_first_time_undergraduate
0,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057
1,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057
2,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057
5,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057
6,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057


In [11]:
bach_df["major_name"].head()

0                                Agriculture, General.
1                                     Animal Sciences.
2                         Food Science and Technology.
5                                      Plant Sciences.
6    Agriculture, Agriculture Operations, and Relat...
Name: major_name, dtype: object

In [200]:
# group the bach df by major

bach_df['major_category'] = bach_df.major_name.apply(prepare.categorize_major)
bach_df.head()

Unnamed: 0,unit_id_institution,college_name,institution_control,city,state_post_code,zip_code,share_entering_students_first_ft,share_of_part_time,pred_degree,religion_ipeds,...,median_debt_0_30000,median_debt_male,median_debt_30001_75000,median_debt_non_pell,median_debt_non_first_generation,median_debt_75001+,median_debt_completed,fedral_loan_full_time_first_time_undergraduate,pell_grant_full_time_first_time_undergraduate,major_category
0,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,Agriculture
1,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,Biology and Life Sciences
2,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,Biology and Life Sciences
5,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,Biology and Life Sciences
6,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,Agriculture


In [151]:
majors_df = get_majors_df()
majors_df.head()

  df = pd.read_csv(filename)


dataframe shape: (224849, 3110)


Unnamed: 0.1,Unnamed: 0,UNITID,OPEID6_x,INSTNM_x,CONTROL_x,MAIN_x,CIPCODE,CIPDESC,CREDLEV,CREDDESC,...,COUNT_WNE_MALE1_P8,MD_EARN_WNE_MALE1_P8,GT_THRESHOLD_P10,MD_EARN_WNE_INC1_P10,MD_EARN_WNE_INC2_P10,MD_EARN_WNE_INC3_P10,MD_EARN_WNE_INDEP1_P10,MD_EARN_WNE_INDEP0_P10,MD_EARN_WNE_MALE0_P10,MD_EARN_WNE_MALE1_P10
0,0,100654.0,1002,Alabama A & M University,Public,1,100,"Agriculture, General.",3,Bachelors Degree,...,834.0,36639.0,0.6044,34076.0,35597.0,43145.0,40299.0,35424.0,36050.0,36377.0
1,1,100654.0,1002,Alabama A & M University,Public,1,109,Animal Sciences.,3,Bachelors Degree,...,834.0,36639.0,0.6044,34076.0,35597.0,43145.0,40299.0,35424.0,36050.0,36377.0
2,2,100654.0,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,3,Bachelors Degree,...,834.0,36639.0,0.6044,34076.0,35597.0,43145.0,40299.0,35424.0,36050.0,36377.0
3,3,100654.0,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,5,Master's Degree,...,834.0,36639.0,0.6044,34076.0,35597.0,43145.0,40299.0,35424.0,36050.0,36377.0
4,4,100654.0,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,6,Doctoral Degree,...,834.0,36639.0,0.6044,34076.0,35597.0,43145.0,40299.0,35424.0,36050.0,36377.0


In [218]:
bach_df = acquire.get_bach_df()
bach_df = prepare.clean_college_df(bach_df)
bach_df.head(3)

dataframe shape: (71901, 125)


Unnamed: 0,unit_id_institution,college_name,institution_control,city,state_post_code,zip_code,share_entering_students_first_ft,share_of_part_time,pred_degree,religion_ipeds,...,median_debt_independent,median_debt_0_30000,median_debt_male,median_debt_30001_75000,median_debt_non_pell,median_debt_non_first_generation,median_debt_75001+,median_debt_completed,fedral_loan_full_time_first_time_undergraduate,pell_grant_full_time_first_time_undergraduate
0,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057
1,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057
2,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,19000,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057


In [225]:
def avg_net_price(df):
    '''Function that castes a new 'average net price' column from 
    existing avg net public and private columns.
    
    This function takes in a dataframe and re-labels null values as 0 in order
    to add across the two avg net price observations.'''

    df['avg_net_price_public'] = df['avg_net_price_public'].fillna(0)

    df['avg_net_price_private'] = df['avg_net_price_private'].fillna(0)

    df['avg_net_price'] = df.avg_net_price_public + df.avg_net_price_private
    
    # drop redundant columns
    # df = df.drop(columns = ["avg_net_price_public", "avg_net_price_private"])

    # return the dataframe
    return df

In [226]:
# run the function 

bach_df = avg_net_price(bach_df)
bach_df.head(4)

Unnamed: 0,unit_id_institution,college_name,institution_control,city,state_post_code,zip_code,share_entering_students_first_ft,share_of_part_time,pred_degree,religion_ipeds,...,median_debt_0_30000,median_debt_male,median_debt_30001_75000,median_debt_non_pell,median_debt_non_first_generation,median_debt_75001+,median_debt_completed,fedral_loan_full_time_first_time_undergraduate,pell_grant_full_time_first_time_undergraduate,avg_net_price
0,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,14444.0
1,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,14444.0
2,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,14444.0
5,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,14444.0


In [228]:
# check the function works correctly 
# checks out!

bach_df[(bach_df['avg_net_price_private'] != 0) & (bach_df['avg_net_price'] != 0)][['avg_net_price_private', 'avg_net_price_public', 'avg_net_price']]

Unnamed: 0,avg_net_price_private,avg_net_price_public,avg_net_price
0,14444.0,0.0,14444.0
1,14444.0,0.0,14444.0
2,14444.0,0.0,14444.0
5,14444.0,0.0,14444.0
6,14444.0,0.0,14444.0
...,...,...,...
215314,10605.0,0.0,10605.0
215316,10605.0,0.0,10605.0
215317,10605.0,0.0,10605.0
215318,10605.0,0.0,10605.0


In [229]:
# check the function works correctly 
# checks out!

bach_df[(bach_df['avg_net_price_public'] != 0) & (bach_df['avg_net_price'] != 0)][['avg_net_price_private', 'avg_net_price_public', 'avg_net_price']]

Unnamed: 0,avg_net_price_private,avg_net_price_public,avg_net_price
221,0.0,15322.0,15322.0
226,0.0,15322.0,15322.0
231,0.0,15322.0,15322.0
235,0.0,15322.0,15322.0
239,0.0,15322.0,15322.0
...,...,...,...
215713,0.0,14061.0,14061.0
215714,0.0,14061.0,14061.0
215716,0.0,14061.0,14061.0
215718,0.0,14061.0,14061.0


In [215]:
bach_df = avg_net_price(bach_df)
bach_df.shape

(71901, 124)

In [230]:
bach_df.tail()

Unnamed: 0,unit_id_institution,college_name,institution_control,city,state_post_code,zip_code,share_entering_students_first_ft,share_of_part_time,pred_degree,religion_ipeds,...,median_debt_0_30000,median_debt_male,median_debt_30001_75000,median_debt_non_pell,median_debt_non_first_generation,median_debt_75001+,median_debt_completed,fedral_loan_full_time_first_time_undergraduate,pell_grant_full_time_first_time_undergraduate,avg_net_price
224794,,Regent's University London,Foreign,,,,,,,,...,,,,,,,,,,0.0
224797,,Regent's University London,Foreign,,,,,,,,...,,,,,,,,,,0.0
224812,,Morthland College,"Private, nonprofit",,,,,,,,...,,,,,,,,,,0.0
224813,,Morthland College,"Private, nonprofit",,,,,,,,...,,,,,,,,,,0.0
224838,,Ravensbourne University London,Foreign,,,,,,,,...,,,,,,,,,,0.0


In [186]:
public = bach_df.loc[bach_df["avg_net_price_public"].notnull()]["avg_net_price_public"]
nulls = bach_df.loc[(bach_df["avg_net_price_public"].isnull()) & (bach_df["avg_net_price_private"].isnull())]
private = bach_df.loc[bach_df["avg_net_price_private"].notnull()]["avg_net_price_private"]

test = pd.concat(
    [private, nulls, public],
    axis=0,
    join="outer",
    ignore_index=False)

len(test)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [176]:
test

0         14444.0
1         14444.0
2         14444.0
5         14444.0
6         14444.0
           ...   
215713    14061.0
215714    14061.0
215716    14061.0
215718    14061.0
215720    14061.0
Length: 71901, dtype: float64

In [181]:
bach_df.sort_index()

Unnamed: 0,unit_id_institution,college_name,institution_control,city,state_post_code,zip_code,share_entering_students_first_ft,share_of_part_time,pred_degree,religion_ipeds,...,median_debt_0_30000,median_debt_male,median_debt_30001_75000,median_debt_non_pell,median_debt_non_first_generation,median_debt_75001+,median_debt_completed,fedral_loan_full_time_first_time_undergraduate,pell_grant_full_time_first_time_undergraduate,major_category
0,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,Agriculture
1,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,Biology and Life Sciences
2,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,Biology and Life Sciences
5,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,Biology and Life Sciences
6,100654.0,Alabama A & M University,Public,Normal,AL,35762,0.8987,0.0587,3.0,5.0,...,15500,14250,16000,10250,16421,14518,33375,0.7143,0.7057,Agriculture
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224794,,Regent's University London,Foreign,,,,,,,,...,,,,,,,,,,Business
224797,,Regent's University London,Foreign,,,,,,,,...,,,,,,,,,,Business
224812,,Morthland College,"Private, nonprofit",,,,,,,,...,,,,,,,,,,"Area, Ethnic, and Civilization Studies"
224813,,Morthland College,"Private, nonprofit",,,,,,,,...,,,,,,,,,,Business


In [145]:
private = bach_df.iloc[bach_df["avg_net_price_private"].index.notnull()]["avg_net_price_private"]
public = bach_df.iloc[bach_df["avg_net_price_public"].index.notnull()]["avg_net_price_public"]

test = pd.concat(
    [private, public],
    axis=0,
    join="outer",
    ignore_index=False)

len(test)

143802

In [127]:
frame1 = bach_df.loc[bach_df["avg_net_price_public"].notnull()]["avg_net_price_public"]
frame2 = bach_df.loc[bach_df["avg_net_price_private"].notnull()]["avg_net_price_private"]


# a['copy_index'] = a.index
# a.merge(b, how='left')
frame2 = frame2.to_frame()

frame2["index_copy"] = frame2.index

frame1 = frame1.to_frame()
# df1.merge(df2, how="inner", left_index=True, right_index=True

frame2.merge(frame1, how = "inner", left_index=True, right_index=True)

Unnamed: 0,avg_net_price_private,index_copy,avg_net_price_public


In [87]:
bach_df["avg_net_price_public"].value_counts()

20989.0    123
31305.0    109
23889.0    104
28568.0    104
29386.0    103
          ... 
40973.0      1
31169.0      1
29808.0      1
32299.0      1
10854.0      1
Name: avg_net_price_public, Length: 1534, dtype: int64

In [97]:
test["avg_net_price_private"] = np.where(
    test["avg_net_price_private"].isnull(),
    test["avg_net_price_public"],
    test["avg_net_price_private"])

test

Unnamed: 0,avg_net_price_private,avg_net_price_public
0,14444.0,
1,14444.0,
2,14444.0,
5,14444.0,
6,14444.0,
...,...,...
224794,,
224797,,
224812,,
224813,,


In [98]:
frame1 = pd.DataFrame(bach_df["avg_net_price_private"])
frame2 = pd.DataFrame(bach_df["avg_net_price_public"])

frames = [frame1, frame2]

test = pd.concat(frames, axis = 0)
test

Unnamed: 0,avg_net_price_private,avg_net_price_public
0,14444.0,
1,14444.0,
2,14444.0,
5,14444.0,
6,14444.0,
...,...,...
224794,,
224797,,
224812,,
224813,,


In [None]:
indx_lst = test.index.tolist()
indx_lst

In [49]:
test = pd.DataFrame(test).rename(columns = {0: "test"})
test.head()

Unnamed: 0,test
0,14444.0
1,14444.0
2,14444.0
5,14444.0
6,14444.0


In [66]:
# add df/series to original df
test_df = bach_df[[
"deg_percent_awarded_agriculture_operations",
"deg_percent_awarded_natural_resources"
]]

test_df.head(20)

Unnamed: 0,deg_percent_awarded_agriculture_operations,deg_percent_awarded_natural_resources
0,0.0394,0.0237
1,0.0394,0.0237
2,0.0394,0.0237
3,0.0394,0.0237
4,0.0394,0.0237
5,0.0394,0.0237
6,0.0394,0.0237
7,0.0394,0.0237
8,0.0394,0.0237
9,0.0394,0.0237


In [68]:
bach_df.shape


(71901, 126)