# Student Loan Risk with Deep Learning

In [1]:
# Imports
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

---

## Prepare the data to be used on a neural network model

### Step 1: Read the `student_loans.csv` file into a Pandas DataFrame. Review the DataFrame, looking for columns that could eventually define your features and target variables.   

In [2]:
# Read the csv into a Pandas DataFrame
file_path = "https://static.bc-edx.com/mbc/ai/m6/datasets/student_loans.csv"
csv = pd.read_csv(file_path)

# Review the DataFrame
print(csv.head())

# or

csv.head()

   payment_history  location_parameter  stem_degree_score  gpa_ranking  \
0              7.4                0.70               0.00          1.9   
1              7.8                0.88               0.00          2.6   
2              7.8                0.76               0.04          2.3   
3             11.2                0.28               0.56          1.9   
4              7.4                0.70               0.00          1.9   

   alumni_success  study_major_code  time_to_completion  \
0           0.076              11.0                34.0   
1           0.098              25.0                67.0   
2           0.092              15.0                54.0   
3           0.075              17.0                60.0   
4           0.076              11.0                34.0   

   finance_workshop_score  cohort_ranking  total_loan_score  \
0                  0.9978            3.51              0.56   
1                  0.9968            3.20              0.68   
2          

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Review the data types associated with the columns
data_types = csv.dtypes

print(data_types)

payment_history           float64
location_parameter        float64
stem_degree_score         float64
gpa_ranking               float64
alumni_success            float64
study_major_code          float64
time_to_completion        float64
finance_workshop_score    float64
cohort_ranking            float64
total_loan_score          float64
financial_aid_score       float64
credit_ranking              int64
dtype: object


### Step 2: Using the preprocessed data, create the features (`X`) and target (`y`) datasets. The target dataset should be defined by the preprocessed DataFrame column “credit_ranking”. The remaining columns should define the features dataset.

In [4]:
# Define the target set y using the credit_ranking column
y = csv['credit_ranking']

# Define the features set X using all columns except 'credit_ranking'
X = csv.drop(columns=['credit_ranking'])

# Display a sample of y
print(y.head())

0    5
1    5
2    5
3    6
4    5
Name: credit_ranking, dtype: int64


In [5]:
# Define features set X by selecting all columns but 'credit_ranking'
X = csv.drop(columns=['credit_ranking'])

# Review the features DataFrame
print(X.head())

# or

X.head() 

   payment_history  location_parameter  stem_degree_score  gpa_ranking  \
0              7.4                0.70               0.00          1.9   
1              7.8                0.88               0.00          2.6   
2              7.8                0.76               0.04          2.3   
3             11.2                0.28               0.56          1.9   
4              7.4                0.70               0.00          1.9   

   alumni_success  study_major_code  time_to_completion  \
0           0.076              11.0                34.0   
1           0.098              25.0                67.0   
2           0.092              15.0                54.0   
3           0.075              17.0                60.0   
4           0.076              11.0                34.0   

   finance_workshop_score  cohort_ranking  total_loan_score  \
0                  0.9978            3.51              0.56   
1                  0.9968            3.20              0.68   
2          

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


### Step 3: Split the features and target sets into training and testing datasets.


In [6]:
# Split the preprocessed data into a training and testing dataset
# Assign the function a random_state equal to 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


### Step 4: Use scikit-learn's `StandardScaler` to scale the features data.

In [7]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
scaler.fit(X_train)

# Transform both the training and testing features datasets
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


---

## Compile and Evaluate a Model Using a Neural Network

### Step 1: Create a deep neural network by assigning the number of input features, the number of layers, and the number of neurons on each layer using Tensorflow’s Keras.

> **Hint** You can start with a two-layer deep neural network model that uses the `relu` activation function for both layers.


In [8]:
# Define the the number of inputs (features) to the model
input_dim = X_train_scaled.shape[1]

# Review the number of features
print("Number of input features:", input_dim)

Number of input features: 11


In [9]:
# Define the number of neurons in the output layer
num_classes = 1

In [10]:
# Define the number of hidden nodes for the first hidden layer
hidden_nodes_layer1 = 64

# Review the number hidden nodes in the first layer
print("Number of hidden nodes in the first layer:", hidden_nodes_layer1)

Number of hidden nodes in the first layer: 64


In [11]:
# Define the number of hidden nodes for the second hidden layer
hidden_nodes_layer2 = 32

# Review the number hidden nodes in the second layer
print("Number of hidden nodes in the second layer:", hidden_nodes_layer2)

Number of hidden nodes in the second layer: 32


In [12]:
# Create the Sequential model instance
model = Sequential()

In [13]:
# Add the first hidden layer
model.add(Input(shape=(input_dim,)))
model.add(Dense(hidden_nodes_layer1, activation='relu'))

# Error received
# UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` 
# object as the first layer in the model instead.
# super().__init__(activity_regularizer=activity_regularizer, **kwargs)

# Resolved by importing "Input" from keras.layers to create an Input object to pass 
# instead of using input_shape as a variable.

In [14]:
# Add the second hidden layer
model.add(Dense(hidden_nodes_layer2, activation='relu'))

In [15]:
# Add the output layer to the model specifying the number of output neurons and activation function
model.add(Dense(num_classes, activation='sigmoid'))

In [16]:
# Display the Sequential model summary
model.summary()

### Step 2: Compile and fit the model using the `mse` loss function, the `adam` optimizer, and the `mse` evaluation metric.


In [17]:
# Compile the Sequential model
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])


In [18]:
# Fit the model using 50 epochs and the training data
history = model.fit(X_train_scaled, y_train, epochs=50, verbose=1)


Epoch 1/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 744us/step - loss: 26.7265 - mean_squared_error: 26.7265 
Epoch 2/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 656us/step - loss: 23.0597 - mean_squared_error: 23.0597
Epoch 3/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 802us/step - loss: 22.4095 - mean_squared_error: 22.4095
Epoch 4/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 622us/step - loss: 22.6024 - mean_squared_error: 22.6024
Epoch 5/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 586us/step - loss: 22.3245 - mean_squared_error: 22.3245
Epoch 6/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 640us/step - loss: 22.4463 - mean_squared_error: 22.4463
Epoch 7/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 581us/step - loss: 22.0507 - mean_squared_error: 22.0507
Epoch 8/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

### Step 3: Evaluate the model using the test data to determine the model’s loss and accuracy.


In [19]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
loss, accuracy = model.evaluate(X_test_scaled, y_test)

# Display the model loss and accuracy results
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 838us/step - loss: 21.7782 - mean_squared_error: 21.7782
Test Loss: 21.528409957885742
Test Accuracy: 21.528409957885742


### Step 4: Save and export your model to an HDF5 file, and name the file `student_loans.h5`.


In [20]:
# Set the model's file path
model_file_path = "student_loans.h5"

# Export your model to a HDF5 file
model.save(model_file_path)





# Error recieved
# WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. 
# This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` 
# or `keras.saving.save_model(model, 'my_model.keras')`.



# This error doesn't seem to have an impact on the performance, leaving as HDF5 file to adhear to instructions
# Here is the code block in the format recommended through error:

# model_file_path = "student_loans.keras"
# model.save(model_file_path)



---
## Predict Loan Repayment Success by Using your Neural Network Model

### Step 1: Reload your saved model.

In [21]:
# Set the model's file path
model_file_path = "student_loans.h5"

# Load the model to a new object
loaded_model = tf.keras.models.load_model(model_file_path)

# loaded_model.compile(optimizer='adam',
#                      loss='binary_crossentropy',
#                      metrics=['accuracy'])
# # Error received:
# # WARNING:absl:Compiled the loaded model, but the compiled metrics have yet to be built. 
# # `model.compile_metrics` will be empty until you train or evaluate the model.

# # In attempts to fix the error:
# # Compiling the loaded model with the same metrics as before
# loaded_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

# # Evaluating the model to build the compiled metrics
# loss, accuracy = loaded_model.evaluate(X_test_scaled, y_test)
# loss, mean_squared_error = loaded_model.evaluate(X_test_scaled, y_test)

# print("Loss:", loss)
# print("Mean Squared Error:", mean_squared_error)
# print("Accuracy:", accuracy)

# Cannot resolve: 
# WARNING:absl:Compiled the loaded model, but the compiled metrics have yet to be built.
# `model.compile_metrics` will be empty until you train or evaluate the model.



### Step 2: Make predictions on the testing data.

In [22]:
# Make predictions on the testing data
predictions = loaded_model.predict(X_test_scaled)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 733us/step


### Step 3: Create a DataFrame to compare the predictions with the actual values.

In [23]:
# Create a DataFrame to compare the predictions with the actual values
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions.flatten()})

### Step 4: Display a sample of the DataFrame you created in step 3.3.

In [24]:
# Display sample data
print(predictions_df.head())
predictions_df

      Actual  Predicted
75         5   0.999977
1283       6   0.998642
408        6   0.999992
1281       6   0.999944
1118       6   1.000000


Unnamed: 0,Actual,Predicted
75,5,0.999977
1283,6,0.998642
408,6,0.999992
1281,6,0.999944
1118,6,1.000000
...,...,...
890,5,0.999988
146,5,1.000000
1551,5,1.000000
1209,7,0.999993
