In [1]:
import numpy as np
import pandas as pd
import pandas_profiling
import os
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

#from matplotlib import scatter_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from zipfile import ZipFile

## Create function to Read in Data

In [2]:
def read_in_CollegeScorecard(columns):
    """
    Read in columns from files from inside the zip file. 
    Also assign a year to each DataFrame.
    
    Parameters:
    -----------
    columns: list,
        columns is a list of strings matching the desired column headers
        
    Returns:
    --------
    sheets: dictionary,
        sheets is a dictionary of year:DataFrame pairs
    """
    
    zip_file = ZipFile('CollegeScorecard_Raw_Data.zip')
    
    sheets = {}
    for year in range(1996, 2018):
        acyear = str(year)+'_'+str(year+1)[-2:]
        sheets[year] = pd.read_csv(zip_file.open('CollegeScorecard_Raw_Data/MERGED'+acyear+'_PP.csv'), usecols=columns)
        sheets[year]['YEAR'] = year
        sheets[year]['YEAR'] = pd.to_datetime(sheets[year]['YEAR'], format='%Y')
    return sheets

In [3]:
columns = ['INSTNM', 'HIGHDEG', 'CONTROL', 'REGION', 'LOCALE', 'LO_INC_DEBT_N', 'MD_INC_DEBT_N', 'HI_INC_DEBT_N',
           'LOAN_EVER', 'PELL_EVER', 'PCTPELL', 'ICLEVEL', 'CURROPER', 'TUITFTE', 'CDR3', 'INEXPFTE']
sheets = read_in_CollegeScorecard(columns)

## Create concatenated df of all sheets

In [4]:
def concatenate_all_sheets(sheets):
    """
    Concatenates DataFrames in a dictionary of DataFrames.
    
    Parameters:
    -----------
    sheets: dictionary,
        key value pairs are year and DataFrame associated to that year
        
    Returns:
    --------
    full_df: DataFrame
    """
    for year, df in sheets.items():
        df['iyear'] = df['YEAR']
        if year==1996:
            full_df = df.set_index([df.index, 'iyear'])
        else:
            full_df = pd.concat([full_df, df.set_index([df.index, 'iyear'])])
    return full_df

In [5]:
full_df = concatenate_all_sheets(sheets)
full_df.shape

(154228, 17)

## Back Fill huge DataFrame

In [8]:
def back_fill_from_year(df, year):
    fill_values = {}
    for name in df.INSTNM.unique():
        fill_values[name] = {'LOCALE': df.loc[(df.INSTNM==name)].LOCALE.values[0], 
                             'CURROPER': df.loc[(df.INSTNM==name)].CURROPER.values[0],
                             'CONTROL': df.loc[(df.INSTNM==name)].CONTROL.values[0]}
    
    for name in df.loc[df.YEAR!=year].INSTNM.unique():
        fill_values[name] = fill_values.get(name, {'LOCALE': np.NaN, 'CURROPER': np.NaN, 'CONTROL': np.NaN})
    
    df.LOCALE = df.INSTNM.map(lambda name: fill_values[name]['LOCALE'])
    df.CURROPER = df.INSTNM.map(lambda name: fill_values[name]['CURROPER'])
    df.CONTROL = df.INSTNM.map(lambda name: fill_values[name]['CONTROL'])
    
    return df

In [250]:
print(pd.concat([sheets[2017], sheets[2016]]).info())
back_fill_from_year(pd.concat([sheets[2017], sheets[2016]]), 2017).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14233 entries, 0 to 7174
Data columns (total 18 columns):
INSTNM           14233 non-null object
HIGHDEG          14233 non-null int64
CONTROL          14233 non-null int64
REGION           14233 non-null int64
LOCALE           6614 non-null float64
CURROPER         7058 non-null float64
TUITFTE          13322 non-null float64
INEXPFTE         13323 non-null float64
PCTPELL          12718 non-null float64
CDR3             12307 non-null float64
LO_INC_DEBT_N    7156 non-null object
MD_INC_DEBT_N    7156 non-null object
HI_INC_DEBT_N    7156 non-null object
LOAN_EVER        6976 non-null object
PELL_EVER        6976 non-null object
ICLEVEL          14233 non-null int64
YEAR             14233 non-null datetime64[ns]
iyear            14233 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(6), int64(4), object(6)
memory usage: 2.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 14233 entries, 0 to 7174
Data columns (tota

## Clean full_df

In [6]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 154228 entries, (0, 1996-01-01 00:00:00) to (7057, 2017-01-01 00:00:00)
Data columns (total 17 columns):
INSTNM           154228 non-null object
HIGHDEG          154228 non-null int64
CONTROL          154205 non-null float64
REGION           154227 non-null float64
LOCALE           6614 non-null float64
CURROPER         7058 non-null float64
TUITFTE          133780 non-null float64
INEXPFTE         133775 non-null float64
PCTPELL          67384 non-null float64
CDR3             47815 non-null float64
LO_INC_DEBT_N    137486 non-null object
MD_INC_DEBT_N    137486 non-null object
HI_INC_DEBT_N    137486 non-null object
LOAN_EVER        137342 non-null object
PELL_EVER        137342 non-null object
ICLEVEL          154205 non-null float64
YEAR             154228 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(9), int64(1), object(6)
memory usage: 20.5+ MB


In [7]:
sheets[2017].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7058 entries, 0 to 7057
Data columns (total 18 columns):
INSTNM           7058 non-null object
HIGHDEG          7058 non-null int64
CONTROL          7058 non-null int64
REGION           7058 non-null int64
LOCALE           6614 non-null float64
CURROPER         7058 non-null int64
TUITFTE          6593 non-null float64
INEXPFTE         6593 non-null float64
PCTPELL          6291 non-null float64
CDR3             6055 non-null float64
LO_INC_DEBT_N    0 non-null float64
MD_INC_DEBT_N    0 non-null float64
HI_INC_DEBT_N    0 non-null float64
LOAN_EVER        0 non-null float64
PELL_EVER        0 non-null float64
ICLEVEL          7058 non-null int64
YEAR             7058 non-null datetime64[ns]
iyear            7058 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(10), int64(5), object(1)
memory usage: 992.6+ KB


## Make Logistic Model

In [None]:
df.dropna(subset=['CDR3', 'CONTROL'], inplace=True)
df = df.replace('PrivacySuppressed', np.NaN)
df.dropna(subset=['LO_INC_DEBT_N', 'MD_INC_DEBT_N', 'HI_INC_DEBT_N'], inplace=True)
df['Public_or_Private'] = df.CONTROL.map({2:0, 1:1, 3:1})

In [79]:
def create_logistic_regression(df, predictor_columns, predicted_column):
    """
    Creates a logistic regression from input column names to predictor column
    
    Parameters
    ----------
    df: pandas dataframe
        the dataframe's columns should include the predictor_columns and predicted_column
    
    predictor_columns: list
        Should be a subset of columns from df. 
        Should have empty intersection with predicted_column
    
    predicted_column: string
        Should be an element in the list of columns from df. 
        Should not be included in predictor_columns
        
    Returns
    -------
    logreg: logistic regression already trained on training data from predictor columns
    
    Note: This function will change the given dataframe
    """
    
    # Create dataframes( or series) with predictors and  predicted values
    X = df[predictor_columns]
    y = df[predicted_column]
    
    # Scale the data using Robust Scaler
    scale = RobustScaler()
    transformed = scale.fit_transform(X)
    X = pd.DataFrame(transformed, columns = X.columns)
    
    # Create Train and Test Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    
    # Create a logistic regression model
    logreg = LogisticRegression(fit_intercept=False, C=1e12, solver='lbfgs')
    
    # Fit the model to the training data
    try:
        model_log = logreg.fit(X_train, y_train)
    except:
        model_log = logreg.fit(np.array(X_train).reshape(-1,1), y_train)
        
    # Add new columns to the given data frame with predicted values and probability of correct predictions
    try:
        df['Predicted_'+predicted_column] = logreg.predict(X)
        df['ProbCorrect_Predicted_'+predicted_column] = logreg.predict_proba(X)[:,1]
    except:
        df['Predicted_'+predicted_column] = logreg.predict(np.array(X).reshape(-1,1))
        df['ProbCorrect_Predicted_'+predicted_column] = logreg.predict_proba(np.array(X).reshape(-1,1))[:,1]
    
    return logreg

In [80]:
# Test create_logistic_regression

create_logistic_regression(df, ['CDR3', 'HI_INC_DEBT_N'], 'Public_or_Private'
                          ).score(df[['CDR3', 'HI_INC_DEBT_N']], df['Public_or_Private'])

0.7426352841647279