In [None]:
"""

Create a list of the original column names used in the training DataFrame.
Extract the coefficients of the logistic regression estimator.
Create a DataFrame of coefficients and variable names & view it.
Print out the top 3 'positive' variables based on the coefficient size.

"""

# Feature names, in the column order of the training DataFrame
original_variables = [column for column in X_train.columns]

# Take the first row of the fitted estimator's coefficient array
model_coefficients = log_reg_clf.coef_[0]

# Pair every feature name with its coefficient and show the full table
coefficient_df = pd.DataFrame(
    {
        "Variable": original_variables,
        "Coefficient": model_coefficients,
    }
)
print(coefficient_df)

# Order by coefficient, largest first, and keep the leading three rows
top_three_df = coefficient_df.sort_values(by="Coefficient", ascending=False).head(3)
print(top_three_df)

"""
 Variable  Coefficient
0     LIMIT_BAL   -3.926e-06
1           AGE   -3.170e-06
2         PAY_0    2.189e-07
3         PAY_2    1.129e-07
4         PAY_3    1.110e-07
5         PAY_4    1.264e-07
6         PAY_5    1.291e-07
7         PAY_6    1.235e-07
8     BILL_AMT1   -7.001e-06
9     BILL_AMT2   -4.343e-06
10    BILL_AMT3    4.402e-06
11    BILL_AMT4    1.599e-05
12    BILL_AMT5    3.373e-06
13    BILL_AMT6   -2.527e-06
14     PAY_AMT1   -6.498e-05
15     PAY_AMT2   -9.547e-05
16     PAY_AMT3   -5.436e-05
17     PAY_AMT4   -3.596e-05
18     PAY_AMT5   -3.400e-05
19     PAY_AMT6    3.083e-06
20        SEX_2   -7.633e-08
21  EDUCATION_1   -7.142e-09
22  EDUCATION_2   -6.246e-08
23  EDUCATION_3   -2.333e-08
24  EDUCATION_4   -1.172e-09
25  EDUCATION_5   -2.403e-09
26  EDUCATION_6   -2.595e-10
27   MARRIAGE_1   -2.480e-08
28   MARRIAGE_2   -7.474e-08
29   MARRIAGE_3    2.855e-09
     Variable  Coefficient
11  BILL_AMT4    1.599e-05
10  BILL_AMT3    4.402e-06
12  BILL_AMT5    3.373e-06

"""

**Hyperparameters Overview**

In [None]:
"""

Print out the hyperparameters of the existing random forest classifier by printing the estimator and then create a confusion matrix and accuracy score from it.

Assess the performance of the new random forest classifier. Create the confusion matrix and accuracy score and print them out

"""

# Baseline random forest with the library's default hyperparameters
rf_clf_old = RandomForestClassifier()

# Print out the random forest hyperparameters
print(rf_clf_old)

# Fit the baseline and predict inside this cell so that the scores below
# describe the estimator printed above. Previously `rf_old_predictions` was
# never assigned here: on a fresh kernel that is a NameError, and otherwise
# the scores came from whatever predictions were left over in the kernel.
rf_old_predictions = rf_clf_old.fit(X_train, y_train).predict(X_test)

# Get confusion matrix & accuracy for the old rf_model
print("Confusion Matrix: \n\n {} \n Accuracy Score: \n\n {}".format(
    confusion_matrix(y_test, rf_old_predictions),
    accuracy_score(y_test, rf_old_predictions)))

# Create a new random forest classifier with better hyperparameters
rf_clf_new = RandomForestClassifier(n_estimators=500)

# Fit this to the data and obtain predictions
rf_new_predictions = rf_clf_new.fit(X_train, y_train).predict(X_test)

# Assess the new model (using new predictions!)
print("Confusion Matrix: \n\n", confusion_matrix(y_test, rf_new_predictions))
print("Accuracy Score: \n\n", accuracy_score(y_test, rf_new_predictions))

In [None]:
"""

Build a knn estimator for the following values of n_neighbors [5,10,20].
Fit each to the training data and produce predictions.
Get an accuracy score for each model and print them out.

"""

# One unfitted estimator per candidate value of n_neighbors
knn_5, knn_10, knn_20 = (
    KNeighborsClassifier(n_neighbors=k) for k in (5, 10, 20)
)

# Train each estimator on the training split, then predict the test split
# (evaluated in order: 5, 10, 20)
knn_5_predictions, knn_10_predictions, knn_20_predictions = (
    estimator.fit(X_train, y_train).predict(X_test)
    for estimator in (knn_5, knn_10, knn_20)
)

# Score every set of predictions against the true test labels
knn_5_accuracy, knn_10_accuracy, knn_20_accuracy = (
    accuracy_score(y_test, predicted)
    for predicted in (knn_5_predictions, knn_10_predictions, knn_20_predictions)
)
print(
    "The accuracy of 5, 10, 20 neighbours was {}, {}, {}".format(
        knn_5_accuracy,
        knn_10_accuracy,
        knn_20_accuracy,
    )
)

**Hyperparameter Values**

In [None]:
"""

Create a learning_rates list for the learning rates, and a results_list to hold the accuracy score of your predictions.
Write a loop to create a GBM model for each learning rate mentioned and create predictions for each model.
Save the learning rate and accuracy score to a results_list.
Turn the results list into a DataFrame and print it out.

"""

# Candidate learning rates, plus a list that collects one
# [learning_rate, accuracy] pair per fitted model
learning_rates = [0.001, 0.01, 0.05, 0.1, 0.2, 0.5]
results_list = []

# Train one gradient boosting model per learning rate and score it on the test split
for learning_rate in learning_rates:
    model = GradientBoostingClassifier(learning_rate=learning_rate)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Record the rate together with the accuracy it achieved
    results_list.append([learning_rate, accuracy_score(y_test, predictions)])

# Turn the collected pairs into a two-column table and show it
results_df = pd.DataFrame(data=results_list, columns=['learning_rate', 'accuracy'])
print(results_df)

In [None]:
"""

Create a list of 30 learning rates evenly spread between 0.01 and 2.
Create a loop similar to the one in the last exercise, but save only the accuracy scores to a list.
Plot the learning rates against the accuracy score.

"""

# 30 evenly spaced learning rates from 0.01 to 2, and a list for their scores
learn_rates = np.linspace(start=.01, stop=2, num=30)
accuracies = []

# Fit one gradient boosting model per learning rate and keep its test accuracy
for learn_rate in learn_rates:
    model = GradientBoostingClassifier(learning_rate=learn_rate)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracies.append(accuracy_score(y_test, predictions))

# Draw accuracy as a function of learning rate
plt.plot(learn_rates, accuracies)
plt.xlabel('learning_rate')
plt.ylabel('Accuracy')
plt.title('Accuracy for different learning_rates')
plt.show()