In [1]:
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np
np.random.seed(42)
# Matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib
matplotlib.rcParams['font.size'] = 16
matplotlib.rcParams['figure.figsize'] = (9, 9)
import matplotlib.gridspec as gridspec
import matplotlib.pylab as pl

import seaborn as sns

from IPython.core.pylabtools import figsize

# Scipy helper functions
from scipy.stats import percentileofscore
from scipy import stats
# Standard ML Models for comparison
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

# Splitting data into training/testing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

# Distributions
import scipy
import warnings
warnings.filterwarnings('ignore')

# PyMC3 for Bayesian Inference
import pymc3 as pm

In [2]:
#Import both raw data frames into pandas
raw_df = pd.read_csv('Data/Raw/student-mat.csv', delimiter=';')
raw_df2 = pd.read_csv('Data/Raw/student-por.csv', delimiter=';')

In [3]:
#assign all of the dataframes as a variable for easier access
frames = [raw_df,raw_df2]

In [4]:
#combine the dataframes
#same features more instances
df = pd.concat(frames)

In [5]:
#save the merged dataframe as a csv into 
df.to_csv('Data/Processed/combined_raw.csv')

In [6]:
#drop students witch final grade of 0
df = df[~df.G3.isin([0,1])]

Previewing different parts of Data Frame

In [7]:
#develop a baseline neural network

import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [16]:
cat_cols = df[['school', 'sex','address', 'famsize', 'Pstatus',
             'Mjob', 'Fjob', 'reason', 'guardian','schoolsup', 
              'famsup', 'paid', 'activities', 'nursery','higher', 
              'internet', 'romantic',]]

In [17]:
df.dtypes

school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
dtype: object

In [27]:
#create label encoders for categorical features
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [22]:
df_processed = pd.DataFrame()

In [23]:
# For each categorical column
# We fit a label encoder, transform our column and 
# add it to our new dataframe
label_encoders = {}
for col in cat_cols:
    print("Encoding {}".format(col))
    new_le = LabelEncoder()
    df_processed[col] = new_le.fit_transform(cat_cols[col])
    label_encoders[col] = new_le

Encoding school
Encoding sex
Encoding address
Encoding famsize
Encoding Pstatus
Encoding Mjob
Encoding Fjob
Encoding reason
Encoding guardian
Encoding schoolsup
Encoding famsup
Encoding paid
Encoding activities
Encoding nursery
Encoding higher
Encoding internet
Encoding romantic


In [25]:
cat_columns_idx = [df_processed.columns.get_loc(col) 
                   for col in cat_cols]

In [28]:
ohe = OneHotEncoder(categorical_features=cat_columns_idx, 
                    sparse=False, handle_unknown="ignore")
df_processed_np = ohe.fit_transform(df_processed)

In [32]:
df_test_processed = df[[col for col in df.columns 
                             if col not in cat_cols]]

In [35]:
for col in cat_cols:
    print("Encoding {}".format(col))
    label_map = {val: label for label, val in enumerate(label_encoders[col].classes_)}
    print(label_map)
    df_test_processed[col] = df[col].map(label_map)
    # fillna and convert to int
    df_test_processed[col] = df_test_processed[col].fillna(9999).astype(int)

Encoding school
{'GP': 0, 'MS': 1}
Encoding sex
{'F': 0, 'M': 1}
Encoding address
{'R': 0, 'U': 1}
Encoding famsize
{'GT3': 0, 'LE3': 1}
Encoding Pstatus
{'A': 0, 'T': 1}
Encoding Mjob
{'at_home': 0, 'health': 1, 'other': 2, 'services': 3, 'teacher': 4}
Encoding Fjob
{'at_home': 0, 'health': 1, 'other': 2, 'services': 3, 'teacher': 4}
Encoding reason
{'course': 0, 'home': 1, 'other': 2, 'reputation': 3}
Encoding guardian
{'father': 0, 'mother': 1, 'other': 2}
Encoding schoolsup
{'no': 0, 'yes': 1}
Encoding famsup
{'no': 0, 'yes': 1}
Encoding paid
{'no': 0, 'yes': 1}
Encoding activities
{'no': 0, 'yes': 1}
Encoding nursery
{'no': 0, 'yes': 1}
Encoding higher
{'no': 0, 'yes': 1}
Encoding internet
{'no': 0, 'yes': 1}
Encoding romantic
{'no': 0, 'yes': 1}


In [37]:
df_processed = ohe.transform(df_test_processed)

In [46]:
X = df_test_processed

KeyError: 'Grades'

In [49]:
Y = df.G3

In [51]:
Y.shape

(990,)

In [59]:
# define base model
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(33, input_dim=33, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='adam')
	return model

In [60]:
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)

In [61]:
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(estimator, X, Y, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Instructions for updating:
Use tf.cast instead.
Results: -0.00 (0.00) MSE


In [62]:
# evaluate model with standardized dataset
numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Standardized: -0.14 (0.05) MSE


In [65]:
# define the model
def larger_model():
	# create model
	model = Sequential()
	model.add(Dense(33, input_dim=33, kernel_initializer='normal', activation='relu'))
	model.add(Dense(6, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='adam')
	return model

In [66]:
numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=larger_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Larger: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Larger: -0.16 (0.07) MSE


In [67]:
# define wider model
def wider_model():
	# create model
	model = Sequential()
	model.add(Dense(20, input_dim=33, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='adam')
	return model

In [68]:
numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=wider_model, epochs=100, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Wider: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Wider: -0.05 (0.03) MSE
