In [12]:
## setup
import pandas as pd
import numpy as np
import os
import requests
import zipfile
import glob

In [13]:
### GLOBALS

# zip file factory - returns a pandas dataframe
def zip_parser(url=None, survey=None):
    # setup the tmp path and file name
    # thanks to https://stackoverflow.com/questions/55718917/download-zip-file-locally-to-tempfile-extract-files-to-tempfile-and-list-the-f/55719124#55719124
    path = "/tmp/"
    file = survey + ".zip"
    survey_lower = survey.lower()
    # get the data
    try:
      results = requests.get(url)
    except:
      pass
    with open(path + file, 'wb') as f:
        f.write(results.content)
    # extract the files to the path
    file = zipfile.ZipFile(path + file)
    file.extractall(path=path)
    # list the csv files for the surveys, most likely get one , but may get to with _rv for revised
    files = glob.glob(path + "*"+survey_lower+"*")
    # isolate the file name
    if len(files) > 1:
        raw_file = [s for s in files if 'rv' in s]
        raw_file = str(raw_file[0]) # just in case, take first
    else:
        raw_file = str(files[0])
    # return a string
    return(str(raw_file))

# utility to bring in an individual survey file and return a pandas dataframe
def read_survey(path):
    if isinstance(path, list):
        path = path[0]
    # assumes a path, presumably from zip_parser
    try:
        ## encoding option needed for h2017, at least, wasnt needed for IC2013
        survey_file = pd.read_csv(path, encoding='ISO-8859-1')
    except:
        survey_file = pd.DataFrame({'path':path})
    # column names to lower - helps later and assumes a survey varname is historically unique
    survey_file.columns = survey_file.columns.str.lower()
    # add the survey
    return(survey_file)


class IC(object):
    """docstring"""
    
    # init
    def __init__(self, years=[2017]):
        """Constructor"""
        assert isinstance(years, list), "year is not a list of integers representing 4-digit year for survey"
        self.years = years
        
    # testing
    def get_test(self):
        for year in self.years:
            print(year)


    # method to get the data and return a dataframe
    def get(self):
        # setup the df
        init_df = pd.DataFrame({'pypeds_init': [True]})
        for year in self.years:
            # assert that year is a int and length 1
            assert isinstance(year, int), "year is not an integer"
            assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017"
            # build the SURVEY id
            SURVEY = 'IC' + str(year)
            # build the url
            URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY)
            # return the bits as a dictionary for use later
            year_info = {'url': URL, 'survey': SURVEY}
            #year_info = get_efc(year)
            year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey'])
            tmp_df = read_survey(year_fpath)
            tmp_df.columns = tmp_df.columns.str.lower()
            tmp_df['survey_year'] = int(year)
            tmp_df['fall_year'] = int(year)
            init_df = init_df.append(tmp_df, ignore_index=True, sort=False)
            # print("finished hd for year {}".format(str(year)))
        # finish up
        # ignore pandas SettingWithCopyWarning, basically
        pd.options.mode.chained_assignment = None
        init_df = init_df.loc[init_df.pypeds_init != True, ]
        init_df.drop(columns=['pypeds_init'], inplace=True)
        return(init_df)
    
        
    
    

In [14]:
x = IC()

In [15]:
x

<__main__.IC at 0x115c38fd0>

In [16]:
x.years

[2017]

In [19]:
# should return an assertion error
x = IC(2014)
x

AssertionError: year is not a list of integers representing 4-digit year for survey

In [20]:
x.years

[2017]

In [21]:
x = IC(years=[2014])

In [22]:
x.years

[2014]

In [23]:
y = x.get_test()

2014


In [24]:
x = IC(years=[2015,2016])

In [25]:
x.get_test()

2015
2016


In [26]:
x = IC()

In [27]:
x.years

[2017]

In [28]:
y = x.get()

In [29]:
y.head()

Unnamed: 0,unitid,peo1istr,peo2istr,peo3istr,peo4istr,peo5istr,peo6istr,cntlaffi,pubprime,pubsecon,...,sport1,confno1,sport2,confno2,sport3,confno3,sport4,confno4,survey_year,fall_year
1,100654.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,...,1.0,133.0,1.0,133.0,1.0,133.0,1.0,200.0,2017.0,2017.0
2,100663.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,...,1.0,111.0,1.0,111.0,1.0,111.0,1.0,111.0,2017.0,2017.0
3,100690.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,-2.0,-2.0,...,2.0,-2.0,2.0,-2.0,2.0,-2.0,2.0,-2.0,2017.0,2017.0
4,100706.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,...,2.0,-2.0,1.0,146.0,1.0,146.0,1.0,146.0,2017.0,2017.0
5,100724.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,...,1.0,133.0,1.0,133.0,1.0,133.0,1.0,133.0,2017.0,2017.0


In [33]:
a = IC(years=[2016,2017])

In [34]:
b = a.get()

In [35]:
type(b)

pandas.core.frame.DataFrame

In [39]:
b.columns

Index(['unitid', 'peo1istr', 'peo2istr', 'peo3istr', 'peo4istr', 'peo5istr',
       'peo6istr', 'cntlaffi', 'pubprime', 'pubsecon',
       ...
       'confno4 ', 'survey_year', 'fall_year', 'dstnugc', 'dstnugp', 'dstnugn',
       'dstngc', 'dstngp', 'dstngn', 'distpgs'],
      dtype='object', length=123)

In [40]:
b.survey_year.value_counts()

2016.0    7047
2017.0    6882
Name: survey_year, dtype: int64

In [41]:
b.head(1).T

Unnamed: 0,1
unitid,100654
peo1istr,0
peo2istr,1
peo3istr,0
peo4istr,0
peo5istr,0
peo6istr,0
cntlaffi,1
pubprime,2
pubsecon,0
