## Functions

In this notebook, we lay out functions that we will use in other notebooks to clean, sort, analyze and visualize data.

In [2]:
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
from scipy.interpolate import InterpolatedUnivariateSpline as Spline
import matplotlib as mpl
import seaborn as sns
import geopandas as gpd

In [3]:
# Load the two CSV files into module-level DataFrames.
# NOTE(review): both paths are absolute and machine-specific, so this cell only
# runs where the files exist at exactly these locations.
# index_col=0 makes the first column of the SIPP file the row index.
sipp_2014_2021 = pd.read_csv(r"C:\Users\shell\Python stuff\SIPP data\SIPP_2014_2021.txt", index_col=0)
NPSAS = pd.read_csv(r"C:\Users\shell\Python stuff\NCES DataLab Data\datalab_completed_2015_16.csv")

  sipp_2014_2021 = pd.read_csv(r"C:\Users\shell\Python stuff\SIPP data\SIPP_2014_2021.txt", index_col=0)


Functions intended for use with the SIPP data:

In [5]:
def median(dataframe, independent, dependent, point):
    '''
    Median of the dependent column over the rows whose independent column equals `point`.

    Returns that median when it is strictly positive. In every other case
    (zero, negative, or NaN because no row matched) the sentinel -1 is returned.
    '''
    matching_values = dataframe.loc[dataframe[independent] == point, dependent]
    middle = matching_values.median()
    return middle if middle > 0 else -1

In [6]:
def average(list):
    '''
    Arithmetic mean of a sized collection of numbers; 0 when the collection is empty.
    '''
    count = len(list)
    return sum(list) / count if count else 0

def list_avg(dataframe, independent, dependent, point):
    '''
    Mean of the dependent column over the rows whose independent column equals `point`.

    The mean is computed with `average`. It is returned when strictly positive;
    otherwise (including when no row matched) the sentinel -1 is returned.
    '''
    matching_values = dataframe[dataframe[independent] == point][dependent]
    mean_value = average(matching_values)
    if mean_value > 0:
        return mean_value
    return -1

In [58]:
def mean_sort_by(dataframe, independent, dependent, min_count=10):
    '''
    Summarises the dependent variable at every integer value of the independent variable.

    Rows are first restricted to MONTHCODE == 1, EEDUC == 43 and strictly positive
    values of both variables. Missing values are filled with -1 beforehand, so they
    are removed by that same filter. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'MONTHCODE', 'EEDUC' and the two named columns.
    independent : str
        Column that is cast to int and used as the grouping points.
    dependent : str
        Column that is totalled and averaged at each point.
    min_count : int, default 10
        Points backed by fewer than this many rows are dropped from the result.

    Returns
    -------
    pandas.DataFrame
        Columns: the independent column, 'total <dependent>', 'percent <dependent>'
        (share of the grand total, in percent) and 'mean <dependent>' (-1 where a
        point has no rows). One row per retained point between the smallest and
        largest observed value; the index is the point's offset from the smallest
        value. When no row passes the filter, an empty frame with these columns
        is returned.
    '''
    total_col = 'total ' + dependent
    percent_col = 'percent ' + dependent
    mean_col = 'mean ' + dependent

    # fillna returns a new frame, so the assignments below never touch the caller's data.
    df = dataframe.fillna(-1)
    df[independent] = df[independent].astype('int')
    df = df[(df['MONTHCODE'] == 1) & (df['EEDUC'] == 43) & (df[dependent] > 0) & (df[independent] > 0)]

    # With nothing left, min()/max() are NaN and cannot bound a range.
    if df.empty:
        return pd.DataFrame(columns=[independent, total_col, percent_col, mean_col])

    points = list(range(int(df[independent].min()), int(df[independent].max()) + 1))

    # One pass over the data; points without any rows get a count and total of 0.
    grouped = df.groupby(independent)[dependent]
    counts = grouped.size().reindex(points, fill_value=0)
    totals = grouped.sum().reindex(points, fill_value=0)
    means = (totals / counts).where(counts > 0, -1)

    new_df = pd.DataFrame({
        independent: points,
        total_col: totals.to_numpy(),
        percent_col: (totals / totals.sum() * 100).to_numpy(),
        mean_col: means.to_numpy(),
    })

    # Boolean selection keeps the original positional labels of the surviving rows.
    return new_df[counts.to_numpy() >= min_count]

In [12]:
def median_sort_by(dataframe, independent, dependent, min_count=10):
    '''
    Summarises the dependent variable at every integer value of the independent
    variable, reporting its median.

    Rows are first restricted to MONTHCODE == 1, EEDUC == 43 and strictly positive
    values of both variables. Missing values are filled with -1 beforehand, so they
    are removed by that same filter. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'MONTHCODE', 'EEDUC' and the two named columns.
    independent : str
        Column that is cast to int and used as the grouping points.
    dependent : str
        Column that is totalled and whose median is taken at each point.
    min_count : int, default 10
        Points backed by fewer than this many rows are dropped from the result.

    Returns
    -------
    pandas.DataFrame
        Columns: the independent column, 'total <dependent>', 'percent <dependent>'
        (share of the grand total, in percent) and 'median <dependent>'. One row per
        retained point; the index is the point's offset from the smallest observed
        value. Any cell equal to 0 or -1 is replaced with NaN. When no row passes
        the filter, an empty frame with these columns is returned.
    '''
    total_col = 'total ' + dependent
    percent_col = 'percent ' + dependent
    median_col = 'median ' + dependent

    # fillna returns a new frame, so the assignments below never touch the caller's data.
    df = dataframe.fillna(-1)
    df[independent] = df[independent].astype('int')
    df = df[(df['MONTHCODE'] == 1) & (df['EEDUC'] == 43) & (df[dependent] > 0) & (df[independent] > 0)]

    # With nothing left, min()/max() are NaN and cannot bound a range.
    if df.empty:
        return pd.DataFrame(columns=[independent, total_col, percent_col, median_col])

    points = list(range(int(df[independent].min()), int(df[independent].max()) + 1))

    # One pass over the data; points without any rows get a count and total of 0.
    grouped = df.groupby(independent)[dependent]
    counts = grouped.size().reindex(points, fill_value=0)
    totals = grouped.sum().reindex(points, fill_value=0)
    medians = grouped.median().reindex(points)
    # Points with no rows have a NaN median; they carry the -1 sentinel until the final masking.
    medians = medians.where(medians > 0, -1)

    new_df = pd.DataFrame({
        independent: points,
        total_col: totals.to_numpy(),
        percent_col: (totals / totals.sum() * 100).to_numpy(),
        median_col: medians.to_numpy(),
    })

    # Boolean selection keeps the original positional labels of the surviving rows.
    new_df = new_df[counts.to_numpy() >= min_count]

    # 0 and -1 are "no data" markers; expose them as NaN.
    return new_df.mask((new_df == 0) | (new_df == -1))

In [9]:
def mean_bar_graph(dataframe, independent, dependent, title, xlab, ylab):
    '''
    Bar chart of the mean of the dependent variable at each value of the independent variable.

    The data is summarised with mean_sort_by, drawn with the given title and axis
    labels, and shown. A second bar plot of the same data is then drawn and its
    container is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw data accepted by mean_sort_by.
    independent, dependent : str
        Column names for the x values and for the averaged variable.
    title, xlab, ylab : str
        Figure title and axis labels.

    Returns
    -------
    The object returned by the second plt.bar call.
    '''
    df = mean_sort_by(dataframe, independent, dependent)
    # mean_sort_by stores its averages under 'mean <dependent>'; it has no
    # 'median <dependent>' column, so reading that name raises KeyError.
    heights = df['mean ' + dependent]

    plt.bar(df[independent], heights)
    plt.title(title)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.show()

    # NOTE(review): this draws the bars a second time, after show() and without
    # title or labels, and returns that container. Presumably so callers can
    # restyle the bars afterwards; confirm before removing the second draw.
    return plt.bar(df[independent], heights)

In [10]:
def median_spline(dataframe, independent, dependent, title, xlab, ylab):
    '''
    Plots an interpolating spline through the per-point medians of the dependent variable.

    The medians come from median_sort_by. The curve is evaluated at 1000 evenly
    spaced x values between the smallest and largest point, and the axes are
    padded by 5% of each data range before the figure is shown.
    '''
    summary = median_sort_by(dataframe, independent, dependent)
    median_col = 'median ' + dependent
    xs = summary[independent]
    ys = summary[median_col]

    grid = np.linspace(xs.min(), xs.max(), 1000)
    curve = Spline(xs, ys)
    plt.plot(grid, curve(grid))

    # The padding is purely cosmetic: it keeps the curve off the frame edges.
    y_pad = (ys.max() - ys.min()) * 0.05
    x_pad = (xs.max() - xs.min()) * 0.05

    plt.ylim(ys.min() - y_pad, ys.max() + y_pad)
    plt.xlim(xs.min() - x_pad, xs.max() + x_pad)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.title(title)

    plt.show()

In [11]:
def median_regression(dataframe, independent, dependent, title, xlab, ylab, degree):
    '''
    Scatter-plots the median of the dependent variable at each value of the
    independent variable, using the summary built by median_sort_by.

    Only points with a positive median are kept. Marker size is the point's
    'total <dependent>' divided by 1000. A polynomial of the requested degree is
    fitted to the medians and evaluated on 1000 x values, but the call that would
    draw it is disabled, so only the scatter is shown.
    '''
    summary = median_sort_by(dataframe, independent, dependent)
    median_col = 'median ' + dependent
    summary = summary[summary[median_col] > 0]

    xs = summary[independent]
    ys = summary[median_col]

    fit = np.poly1d(np.polyfit(x=xs, y=ys, deg=degree))
    grid = np.linspace(xs.min(), xs.max(), 1000)
    fitted = fit(grid)

    # The padding is purely cosmetic: it keeps the markers off the frame edges.
    y_pad = (ys.max() - ys.min()) * 0.05
    x_pad = (xs.max() - xs.min()) * 0.05

    plt.ylim(ys.min() - y_pad, ys.max() + y_pad)
    plt.xlim(xs.min() - x_pad, xs.max() + x_pad)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.title(title)

    # Drawing of the fitted curve is switched off:
    # plt.plot(grid, fitted)
    plt.scatter(x=xs, y=ys, s=(summary['total ' + dependent] / 1000))

    plt.show()

In [16]:
def sipp_clean(df):
    '''
    Returns only the rows where EEDUC equals 43 and MONTHCODE equals 1.
    '''
    wanted = df['EEDUC'].eq(43) & df['MONTHCODE'].eq(1)
    return df.loc[wanted]

In [17]:
def group_age(dataframe):
    '''
    Replaces the numeric 'TAGE' column with age-bracket labels.

    Brackets: '<25' (below 25), '25 - 34', '35 - 49', '50 - 61' and '>61'
    (62 and above). A value that fails every comparison, such as NaN, also
    ends up in '>61'.

    The column is replaced on the frame that was passed in, and that same
    frame is returned, so callers that ignore the return value still see
    the change.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a numeric 'TAGE' column.

    Returns
    -------
    pandas.DataFrame
        The input frame, with 'TAGE' holding the bracket labels.
    '''
    ages = dataframe['TAGE'].to_numpy()

    # Conditions are checked in order and the first match wins, so each upper
    # bound only needs to exclude the brackets above it. The first bound is 25
    # to match its '<25' label (24-year-olds belong in this bracket).
    upper_bounds = [ages < 25, ages < 35, ages < 50, ages < 62]
    labels = ['<25', '25 - 34', '35 - 49', '50 - 61']

    dataframe['TAGE'] = np.select(upper_bounds, labels, default='>61')
    return dataframe

In [23]:
def age_percent_total(dataframe):
    '''
    Share of total education debt held by each age bracket, in percent.

    Only rows with TOEDDEBTVAL > 0 are counted. 'TAGE' is expected to already
    hold the bracket labels '<25', '25 - 34', '35 - 49', '50 - 61' and '>61';
    rows with any other 'TAGE' value count towards the total but not towards
    any bracket.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'TAGE' and 'TOEDDEBTVAL'.

    Returns
    -------
    pandas.DataFrame
        One row per bracket with columns 'TAGE' and 'percent of total debt'.
        When no row has positive debt the shares are undefined and reported
        as NaN instead of dividing by zero.
    '''
    age_groups = ['<25', '25 - 34', '35 - 49', '50 - 61', '>61']
    df = dataframe[dataframe['TOEDDEBTVAL'] > 0]

    if df.empty:
        shares = [float('nan')] * len(age_groups)
    else:
        total = df['TOEDDEBTVAL'].sum()
        shares = [
            (df.loc[df['TAGE'] == group, 'TOEDDEBTVAL'].sum() / total) * 100
            for group in age_groups
        ]

    return pd.DataFrame({'TAGE': age_groups, 'percent of total debt': shares})

Functions intended for use with the NCES's NPSAS 2015-2016 data:

In [3]:
def datalab_round(dataframe):
    '''
    Rounds every value from the third column onward to the nearest whole number.

    Missing values are first replaced with -1 across the whole frame. Each
    remaining cell in columns 2 and up is converted with float() and rounded
    with Python's round(), which sends exact halves to the nearest even
    integer. The first two columns are not rounded. The input frame is not
    modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Columns from index 2 onward must hold values that float() accepts.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rounded values.
    '''
    df = dataframe.fillna(-1)
    for x in range(len(df.index)):
        for y in range(2, len(df.columns)):
            # round() with no digits rounds to the nearest integer; rounding to
            # two decimals and then calling int() would cut off the fraction
            # instead (2.7 -> 2).
            df.iloc[x, y] = int(round(float(df.iloc[x, y])))
    return df

In [16]:
def datalab_bar_graph(dataframe, category, zeroes, title, xlab, ylab):
    '''
    Bar chart of one category from the NPSAS data.

    In every column from the third onward, cells equal to -1 or 0 are treated
    as missing and replaced with NaN before plotting. The input frame is not
    modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'Category', 'Value' and the two
        'Amount still owed on all undergraduate loans ( ... )' columns.
    category : value compared against the 'Category' column
        Rows of this category are plotted, with 'Value' on the x axis.
    zeroes : bool
        Chooses the plotted column: '( Average )' when true,
        '( Average Without Zeros )' otherwise.
    title, xlab, ylab : str
        Figure title and axis labels.
    '''
    values = dataframe.iloc[:, 2:]
    # -1 and 0 are leftovers of the rounding step; as NaN they are ignored by pandas.
    cleaned = values.where((values != -1) & (values != 0))
    # Build a new frame rather than writing the NaNs back into the caller's data.
    df = pd.concat([dataframe.iloc[:, :2], cleaned], axis=1)

    df = df[df['Category'] == category]
    if zeroes:
        heights = df['Amount still owed on all undergraduate loans ( Average )']
    else:
        heights = df['Amount still owed on all undergraduate loans ( Average Without Zeros )']
    plt.bar(df['Value'], heights)

    plt.title(title)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    # show needs the call parentheses; a bare `plt.show` is an expression that does nothing.
    plt.show()

Functions intended for general use

In [15]:
def color_one(x, num, color, bars=None):
    '''
    Highlights a single bar and greys out the others.

    Parameters
    ----------
    x : int
        Number of bars to recolor; indices 0 to x - 1 are visited.
    num : int
        Index of the bar that receives `color`.
    color : matplotlib color
        Color for the highlighted bar; every other visited bar becomes 'gray'.
    bars : sequence of objects with a set_color method, optional
        The bars to recolor. When omitted, the module-level variable `bars`
        is used.

    Raises
    ------
    NameError
        If `bars` is omitted, there is at least one bar to visit, and no
        module-level `bars` exists.
    '''
    if bars is None and x > 0:
        try:
            bars = globals()['bars']
        except KeyError:
            raise NameError("name 'bars' is not defined; pass bars=... or define a global `bars`") from None

    # A separate index name keeps the parameter x from being overwritten by the loop.
    for index in range(x):
        bars[index].set_color(color if index == num else 'gray')