In [1]:
import pandas as pd
import numpy as np
import os
import requests
import zipfile
import glob
import time

In [2]:
def read_survey(path):
    """Load an extracted survey CSV into a DataFrame and delete the file.

    Parameters
    ----------
    path : str or list of str
        Location of the CSV file. When a list is passed, only its first
        element is used.

    Returns
    -------
    pandas.DataFrame
        The parsed survey with lower-cased column names. If the file cannot
        be read or parsed, a one-row frame with a single ``path`` column
        holding the offending path is returned instead, so the caller can
        see which file failed.

    Side effects
    ------------
    Removes ``path`` from disk if it exists, whether or not parsing succeeded.
    """
    if isinstance(path, list):
        path = path[0]
    try:
        # The ISO-8859-1 encoding is needed for at least h2017; IC2013 parsed
        # without it.
        survey_file = pd.read_csv(path, encoding='ISO-8859-1')
    except Exception:
        # The value must be wrapped in a list: a dict of bare scalars makes
        # the DataFrame constructor raise ValueError, which would defeat the
        # purpose of this fallback.
        survey_file = pd.DataFrame({'path': [path]})
    # Clean up the extracted file; tolerate it being absent (e.g. the read
    # failed because the file never existed).
    if os.path.exists(path):
        os.remove(path)
    # Lower-case the column names - assumes a survey varname is historically
    # unique regardless of case.
    survey_file.columns = survey_file.columns.str.lower()
    return survey_file

In [3]:
def get_ic(year):
    """Build the survey id and download URL for the IC survey of a given year.

    Parameters
    ----------
    year : int
        Survey year; must satisfy 2002 <= year <= 2017.

    Returns
    -------
    dict
        ``{'url': <zip download URL>, 'survey': 'IC<year>'}``.

    Raises
    ------
    AssertionError
        If ``year`` is not an int or is outside the supported range.
    """
    assert isinstance(year, int), "year is not an integer"
    # The message matches the inclusive upper bound that is actually enforced.
    assert 2002 <= year <= 2017, "year must be >= 2002 and <= 2017"
    survey = 'IC' + str(year)
    url = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(survey)
    # Return the pieces as a dictionary for use by later cells.
    return {'url': url, 'survey': survey}



In [4]:
def get_adm(year):
    """Build the survey id and download URL for the ADM survey of a given year.

    Parameters
    ----------
    year : int
        Survey year; must satisfy 2014 <= year <= 2017.

    Returns
    -------
    dict
        ``{'url': <zip download URL>, 'survey': 'ADM<year>'}``.

    Raises
    ------
    AssertionError
        If ``year`` is not an int or is outside the supported range.
    """
    assert isinstance(year, int), "year is not an integer"
    # The message matches the inclusive upper bound that is actually enforced.
    assert 2014 <= year <= 2017, "year must be >= 2014 and <= 2017"
    survey = 'ADM' + str(year)
    url = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(survey)
    # Return the pieces as a dictionary for use by later cells.
    return {'url': url, 'survey': survey}

In [11]:
# Survey year to fetch, and the matching ADM survey id / download URL.
year = 2017
year_info = get_adm(year=year)

In [12]:
# Unpack the download URL and the survey id from the lookup result.
url, survey = year_info['url'], year_info['survey']

In [13]:
# Scratch directory named after the current Unix timestamp - a quick (hacky)
# way to get a unique location to extract into.
path = "".join(["/tmp/", str(int(time.time())), "/"])
# Lower-cased survey id, used later to find the extracted files.
survey_lower = survey.lower()
# File name of the downloaded archive.
file = "".join([survey, ".zip"])

In [14]:
# get the data
os.mkdir(path)
try:
  results = requests.get(url)
except:
  pass
with open(path + file, 'wb') as f:
    f.write(results.content)

In [15]:
# Unpack the downloaded archive into the scratch directory.
# The context manager closes the zip handle when extraction finishes, and
# using a separate name keeps `file` as the archive's file name so this cell
# can be re-run without `path + file` failing.
with zipfile.ZipFile(path + file) as archive:
    archive.extractall(path=path)

In [16]:
# Extracted files whose name contains the lower-cased survey id.
# glob returns matches in arbitrary order; sort so that any later
# "take the first match" choice is deterministic.
files = sorted(glob.glob(path + "*" + survey_lower + "*"))

In [17]:
# Display the list of matched file paths.
files

['/tmp/1557845558/adm2017.csv']

In [18]:
# Choose the file to load. Prefer one whose file name contains 'rv'
# (presumably the revised release - confirm against the IPEDS naming scheme);
# otherwise fall back to the first match.
if not files:
    raise FileNotFoundError("no extracted survey files were found")
# Test the file name only, so an 'rv' in the directory part cannot match.
rv_files = [s for s in files if 'rv' in os.path.basename(s)]
# Just in case there are several candidates, take the first.
raw_file = str(rv_files[0]) if rv_files else str(files[0])

In [19]:
# Display the path of the file selected for loading.
raw_file

'/tmp/1557845558/adm2017.csv'

In [20]:
# raw_file was already passed through str() when it was assigned, so this
# conversion is a no-op; the cell just re-displays the same path.
str(raw_file)

'/tmp/1557845558/adm2017.csv'