# Data Modeling Assignment
### Brayan Gutierrez, Katie To, Jericka Ledezma
***

## Logistic Regression Model
### Created by: Katie To
References: https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8,
https://towardsdatascience.com/logistic-regression-model-tuning-with-scikit-learn-part-1-425142e01af5
***

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

### Data Reading and Extracting

In [None]:
# Load the ground-water CSV, drop incomplete rows and tidy the two column
# headers that carry stray whitespace.
ground_water = pd.read_csv("ground_water_quality_2022_post.csv")
ground_water_df = (
    pd.DataFrame(ground_water)
    .dropna()
    .rename(columns={'NO3 ': 'NO3', 'RSC  meq  / L': 'RSC'})
)

# Column labels split by dtype: numeric vs. everything else.
numeric = ground_water_df.select_dtypes(include=['number']).columns
nonnumeric = ground_water_df.select_dtypes(exclude=['number']).columns

In [None]:
# Outlier Treatment
# Cap each numeric column at Q1 - 1.5*IQR and Q3 + 1.5*IQR: values beyond a
# whisker are replaced by the whisker itself, everything else is untouched.

for num in numeric:
    column = ground_water_df[num]
    Q1, Q3 = column.quantile(0.25), column.quantile(0.75)
    IQR = Q3 - Q1
    whisker_width = 1.5
    lower_whisker = Q1 - whisker_width * IQR
    upper_whisker = Q3 + whisker_width * IQR
    # Raise low values to the lower whisker first, then cap the high ones.
    floored = np.where(column < lower_whisker, lower_whisker, column)
    ground_water_df[num] = np.where(column > upper_whisker, upper_whisker, floored)

In [None]:
# Standardizing Data
# Fit the scaler on the numeric columns, then write the scaled values back
# into the same columns.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split made later in this section, so test rows contribute to the scaling
# statistics - confirm this is acceptable.
numeric_block = ground_water_df[numeric]
scaler = StandardScaler().fit(numeric_block)
scaled_data = scaler.transform(numeric_block)

ground_water_df[numeric] = scaled_data

# Summary of the scaled columns (dtypes and non-null counts).
ground_water_df[numeric].info()

In [None]:
# Extracting Chosen Features to Dataframe
# .copy() gives an independent frame so the column assignments below never
# write through to ground_water_df.
working_gw_df = ground_water_df[['RSC', 'SAR', 'Na', 'E.C', 'TDS', 'HCO3', 'pH', 'mandal', 'village', 'Classification.1']].copy()

# Merge the 'MR' class into 'U.S.' in the target column only. A frame-wide
# replace would also rewrite any mandal/village whose name is exactly 'MR'.
working_gw_df['Classification.1'] = working_gw_df['Classification.1'].replace('MR', 'U.S.')

# Changing Classification.1 to Numeric Values. LabelEncoder numbers the
# sorted labels, so P.S. = 0 and U.S. = 1 when those are the only classes.
le = LabelEncoder()
working_gw_df['Classification.1'] = le.fit_transform(working_gw_df['Classification.1'])

# One-hot encode 'mandal' and 'village' columns separately
mandal_dummies = pd.get_dummies(working_gw_df['mandal'], prefix='mandal')
village_dummies = pd.get_dummies(working_gw_df['village'], prefix='village')

# Concatenate the encoded columns with the remaining (non-categorical) columns
working_gw_df_encoded = pd.concat([working_gw_df.drop(['mandal', 'village'], axis=1), mandal_dummies, village_dummies], axis=1)

# Display the modified DataFrame
working_gw_df_encoded.head()

In [None]:
# Splitting Data
X = working_gw_df_encoded.drop(['Classification.1'], axis = 1)
y = working_gw_df_encoded['Classification.1']

# random_state makes the 60/40 split reproducible across runs; stratify keeps
# the class ratio the same in both halves, which matters for the imbalanced
# target that SMOTE is compensating for below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 2, stratify = y)

# Oversample the minority class in the training portion only; the test set
# keeps its natural class distribution. The Series is passed directly because
# Series.ravel() is deprecated in recent pandas.
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [None]:
# Hyperparameter tuning: exhaustive grid search over solver and C (L2 penalty)
# NOTE(review): the search runs on the SMOTE-resampled training data, so
# synthetic samples can land in the validation folds - confirm intended.
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# Search space and resampling scheme: 10-fold stratified CV repeated 3 times
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(X_train_res, y_train_res)

# Report the winner, then mean/std accuracy for every candidate
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Logistic Regression
# Build the final model from the hyperparameters selected by the grid search
# above; a default-configured LogisticRegression() would discard that tuning.
logreg = LogisticRegression(**grid_result.best_params_)
logreg.fit(X_train_res, y_train_res)

# Predictions on the held-out test set and on the (resampled) training set
y_pred_test = logreg.predict(X_test)
y_pred_train = logreg.predict(X_train_res)

In [None]:
# Share of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
accuracy_message = 'Accuracy of Logistic Regression classifier: {:.2f}'.format(accuracy)
print(accuracy_message)

In [None]:
# Confusion matrix on the test set, with rows/columns ordered as logreg.classes_
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_test, labels=logreg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['P.S.', 'U.S.'])
cm_plot = disp.plot()
cm_plot.figure_.savefig('LOG_CM.png')

# Diagonal entries are correct predictions; off-diagonal entries are errors
correct = cm[0, 0] + cm[1, 1]
incorrect = cm[0, 1] + cm[1, 0]
print('Correctly Classified:', correct)
print('Incorrectly Classified:', incorrect)

In [None]:
# Precision / recall / F1 report: test set first, then the resampled training set
for y_true, y_hat in ((y_test, y_pred_test), (y_train_res, y_pred_train)):
    print(classification_report(y_true, y_hat))

In [None]:
# ROC Curve
# Score the test set once with the positive-class probability (column 1 of
# predict_proba) and use it for BOTH the curve and the AUC, so the area shown
# in the legend is the area under the plotted curve. Feeding hard 0/1
# predictions to roc_auc_score evaluates a single operating point instead.
positive_scores = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Dashed diagonal: the chance-level reference line
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

## Decision Tree Model
### Created by: Jericka Ledezma
***

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import cross_val_score,cross_val_predict, StratifiedKFold, train_test_split
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.tree import export_graphviz
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from six import StringIO  
from IPython.display import Image  
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import pydotplus

In [None]:
# Load the dataset, drop incomplete rows, and fold the 'MR' label into 'U.S.'
# in the target column.
ground_water = pd.read_csv("ground_water_quality_2022_post.csv")
ground_water_df = pd.DataFrame(ground_water).dropna()
target_labels = ground_water_df['Classification.1']
ground_water_df['Classification.1'] = target_labels.replace('MR', 'U.S.')
print(ground_water_df)

In [None]:
# Force pH to a numeric dtype; errors='coerce' turns unparseable entries into NaN.
# NOTE(review): this runs after dropna(), so any NaN created here stays in the
# frame - confirm that is acceptable.
ph_numeric = pd.to_numeric(ground_water_df['pH'], errors='coerce')
ground_water_df['pH'] = ph_numeric
ground_water_df.info(verbose=True)

In [None]:
# Tidy the two column headers that carry stray whitespace
ground_water_df = ground_water_df.rename(columns = {'NO3 ': 'NO3'})
ground_water_df = ground_water_df.rename(columns = {'RSC  meq  / L': 'RSC'})

# Column labels split by dtype: numeric vs. everything else
numeric = ground_water_df.select_dtypes(include=['number']).columns
nonnumeric = ground_water_df.select_dtypes(exclude=['number']).columns

# Instantiate the splitter: binding the bare StratifiedKFold class would fail
# as soon as it is passed as cv=... to a cross-validation helper. Five
# unshuffled stratified folds is what cv=5 means for a classifier.
cv = StratifiedKFold(n_splits=5)
ground_water_df.head()

In [None]:
# Split dataset into features and target variable

feature_cols = ['RSC', 'SAR', 'T.H', 'Ca', 'Na', 'Mg', 'HCO3', 'mandal', 'village', 'E.C', 'TDS']
y = ground_water_df['Classification.1']  # Target variable
X = ground_water_df[feature_cols]  # Features (still holding the raw categorical columns)

# One-hot encode the two categorical columns separately
mandal_dummies = pd.get_dummies(X['mandal'], prefix='mandal')
village_dummies = pd.get_dummies(X['village'], prefix='village')

# Replace the original 'mandal' and 'village' columns with their dummy columns
numeric_features = X.drop(columns=['mandal', 'village'])
working_gw_df_encoded = pd.concat([numeric_features, mandal_dummies, village_dummies], axis=1)

# From here on X is the fully encoded feature matrix
X = working_gw_df_encoded

working_gw_df_encoded.head()
# Split dataset into training set and test set
#you get to use all data to train model
#check out to use kfold, use accuracy for attribute scoring

### Outlier Treatment

In [None]:
# Cap each numeric column of ground_water_df at Q1 - 1.5*IQR and Q3 + 1.5*IQR.
# NOTE(review): working_gw_df_encoded was built from ground_water_df before
# this cell, so the capped values do not appear to reach the model's feature
# matrix unless the encoding cell is re-run afterwards - confirm intended order.
for num in numeric:
    column = ground_water_df[num]
    Q1, Q3 = column.quantile(0.25), column.quantile(0.75)
    IQR = Q3 - Q1
    whisker_width = 1.5
    lower_whisker = Q1 - whisker_width * IQR
    upper_whisker = Q3 + whisker_width * IQR
    # Raise low values to the lower whisker first, then cap the high ones.
    floored = np.where(column < lower_whisker, lower_whisker, column)
    ground_water_df[num] = np.where(column > upper_whisker, upper_whisker, floored)

In [None]:
# Flag near-constant columns: report any column whose most frequent value
# (NaN counted as a value) covers more than 99.9% of the rows.
num_rows = len(ground_water_df)

for col in ground_water_df.columns:
    cnts = ground_water_df[col].value_counts(dropna=False)
    # value_counts sorts by frequency, so the first entry is the modal value
    top_pct = cnts.iloc[0] / num_rows

    if not top_pct > 0.999:
        continue

    print('{0}: {1:.2f}%'.format(col, top_pct*100))
    print(cnts)
    print()

### Decision Tree Fitting, Cross Validation, and Evaluation Metric #1 (Confusion Matrices)

In [None]:
# Features (one-hot encoded) and target variable
X =  working_gw_df_encoded
y = ground_water_df['Classification.1']

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit one tree per candidate depth. After the loop, clf / y_pred / conf_matrix
# refer to the deepest tree (max_depth = 15), which the cells below reuse.
tree_depth = [3,7,11,15]
for i in tree_depth:
    # random_state pins feature tie-breaking so repeated runs build the same tree
    clf = DecisionTreeClassifier(max_depth = i, random_state = 42)

    # 5-fold CV accuracy on the training portion only: the held-out test rows
    # stay out of the number reported as training accuracy. cross_val_score
    # clones the estimator, so it does not affect the fit below.
    accuracy = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv = 5)

    # Train Decision Tree Classifier
    clf = clf.fit(X_train,y_train)

    print("Depth for Training: ",i)
    print("Accuracy for Training:",accuracy.mean() * 100)

    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Create the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Display the confusion matrix
    print(f"Testing Confusion Matrix Depth {i}:")
    print(conf_matrix)

In [None]:
# Class names in the order the fitted tree uses internally. export_graphviz
# expects them in ascending class order (clf.classes_); y.unique() returns
# order of first appearance, which can swap the labels in the rendered tree.
class_names = [str(label) for label in clf.classes_]

# Plain lists are accepted by every scikit-learn release, whereas a pandas
# Index for feature_names is rejected by 1.3.0's parameter validation.
feature_names = list(X.columns)

# Render the fitted tree to DOT, convert it to PNG, save it and show it inline
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_names, class_names=class_names)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('ds.png')
Image(graph.create_png())

### Displaying 1 Confusion Matrix

In [None]:
# Plot the confusion matrix held in conf_matrix and save it as a PNG
cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=["P.S.", 'U.S.'],
)
cm_display.plot()

plt.savefig('Decision Tree CM.png')
plt.show()

### Evaluation Metric #2: Precision, Recall, and F1 Score

In [None]:
# y_test / y_pred are the true and predicted labels from the fitting cell.

# Per-class precision, recall, F1-score and support (plus the averaged rows),
# returned as a nested dict rather than formatted text
report = classification_report(
    y_test,
    y_pred,
    target_names=['P.S.', 'U.S.'],
    output_dict=True,
)

# One row per class / average, one column per metric
classification_df = pd.DataFrame(report).transpose()

print(classification_df)

## Random Forest Model
### Created by: Brayan Gutierrez
***

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import export_graphviz
from IPython.display import Image
from sklearn.preprocessing import StandardScaler
import graphviz
import random
from scipy.stats import randint
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.tree import plot_tree

### Data Reading and Extracting

In [None]:
# Load the ground-water CSV, drop incomplete rows and tidy the two column
# headers that carry stray whitespace.
ground_water = pd.read_csv("ground_water_quality_2022_post.csv")
ground_water_df = (
    pd.DataFrame(ground_water)
    .dropna()
    .rename(columns={'NO3 ': 'NO3', 'RSC  meq  / L': 'RSC'})
)

# Column labels split by dtype: numeric vs. everything else.
numeric = ground_water_df.select_dtypes(include=['number']).columns
nonnumeric = ground_water_df.select_dtypes(exclude=['number']).columns

# Outlier Treatment: cap each numeric column at Q1 - 1.5*IQR and Q3 + 1.5*IQR
for num in numeric:
    column = ground_water_df[num]
    Q1, Q3 = column.quantile(0.25), column.quantile(0.75)
    IQR = Q3 - Q1
    whisker_width = 1.5
    lower_whisker = Q1 - whisker_width * IQR
    upper_whisker = Q3 + whisker_width * IQR
    # Raise low values to the lower whisker first, then cap the high ones.
    floored = np.where(column < lower_whisker, lower_whisker, column)
    ground_water_df[num] = np.where(column > upper_whisker, upper_whisker, floored)

In [None]:
# Extracting Chosen Features to Dataframe
chosen = ['RSC', 'SAR', 'Na', 'E.C', 'TDS', 'HCO3', 'pH', 'mandal', 'village', 'Classification.1']
# .copy() gives an independent frame so the assignment below never writes
# through to ground_water_df.
working_gw_df = ground_water_df[chosen].copy()

# Merge the 'MR' class into 'U.S.' in the target column only. A frame-wide
# replace would also rewrite any mandal/village whose name is exactly 'MR',
# and repeating it after encoding has nothing left to change.
working_gw_df['Classification.1'] = working_gw_df['Classification.1'].replace('MR', 'U.S.')

# One-hot encode 'mandal' and 'village' columns separately
mandal_dummies = pd.get_dummies(working_gw_df['mandal'], prefix='mandal')
village_dummies = pd.get_dummies(working_gw_df['village'], prefix='village')

# Concatenate the encoded columns with the remaining (non-categorical) columns
working_gw_df_encoded = pd.concat([working_gw_df.drop(['mandal', 'village'], axis=1), mandal_dummies, village_dummies], axis=1)

# Display the modified DataFrame
working_gw_df_encoded.head()

### Random Forest Algorithm Variants

#### 1. Cross Validating Best Hyperparameters all Features

In [None]:
# Splitting Data
# Build this variant's own all-features split from the Random Forest frame.
# Without it, X_train / y_train are whatever an earlier section left behind
# (a different feature set and split ratio).
X = working_gw_df_encoded.drop('Classification.1', axis = 1)
y = working_gw_df_encoded['Classification.1']

# Reproducibility comes from random_state: random.seed() only seeds Python's
# `random` module, which scikit-learn and scipy.stats do not draw from.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 10)

# Search space: number of trees and maximum tree depth
param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

# Create a random forest classifier
rf = RandomForestClassifier(random_state = 10)

# Use random search (5 sampled candidates, 5-fold CV) to find the best hyperparameters
rand_search = RandomizedSearchCV(rf,
                                 param_distributions = param_dist,
                                 n_iter=5,
                                 cv=5,
                                 random_state=10)

# Fit the random search object to the training data
rand_search.fit(X_train, y_train)

In [None]:
random.seed(10)
# Keep a handle on the estimator that the search refit with the winning
# parameter combination.
best_rf = rand_search.best_estimator_

# Show which hyperparameters won
print('Best hyperparameters:', rand_search.best_params_)

In [None]:
random.seed(10)
# Random Forest rebuilt from the hyperparameters chosen by the random search
tuned_params = rand_search.best_params_
rf = RandomForestClassifier(
    n_estimators=tuned_params['n_estimators'],
    max_depth=tuned_params['max_depth'],
    random_state=10,
)
rf.fit(X_train, y_train)

# Predicted labels for the held-out test set
y_pred = rf.predict(X_test)

In [None]:
# Share of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
random.seed(10)
# A plain list of column names is accepted by every scikit-learn release;
# passing the pandas Index directly is rejected by 1.3.0's parameter validation.
feature_names = list(X_train.columns)

# Draw the first three trees of the forest, truncated to depth 4 for readability
for i in range(3):
    tree = rf.estimators_[i]

    plt.figure(figsize=(10,8))
    plot_tree(tree, feature_names=feature_names, class_names=['P.S.', 'U.S.'], filled=True, max_depth=4)

    # Only the second tree is written to disk
    if i == 1:
        plt.savefig('All Feature CV.png')

    plt.show()

In [None]:
# Confusion Matrix, normalised per true class so every row sums to 1
matrix = confusion_matrix(y_test, y_pred)
matrix = matrix.astype('float') / matrix.sum(axis = 1)[:, np.newaxis]

plt.figure(figsize =  (10,5))
sns.set(font_scale = 1.4)
sns.heatmap(matrix, annot=True, annot_kws = {'size':10}, cmap = plt.cm.Greens, linewidths = 0.2)

class_names = ['P.S.', 'U.S.']
# Heatmap cell i spans [i, i + 1], so its centre is i + 0.5. Both axes use the
# centres; ticks at 0 and 1 would put the x labels on the cell edges.
tick_centers = np.arange(len(class_names)) + 0.5
plt.xticks(tick_centers, class_names, rotation=25)
plt.yticks(tick_centers, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for All Features Random Forest Model (Cross Validation)')
plt.savefig('Confusion Matrix for All Features Random Forest Model CV.png')
plt.show()

In [None]:
# Per-class precision, recall and F1 score on the test set
report_text = classification_report(y_test, y_pred)
print(report_text)

#### 2. Out of Bag Evaluation All Features

In [None]:
# Splitting Data (all features)
X = working_gw_df_encoded.drop('Classification.1', axis = 1)
y = working_gw_df_encoded['Classification.1']

# Reproducibility comes from random_state: random.seed() only seeds Python's
# `random` module, which scikit-learn does not draw from.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 10)

# Random Forest with out-of-bag scoring enabled
rf = RandomForestClassifier(oob_score = True, random_state = 10)
rf.fit(X_train, y_train)

# Out-of-bag accuracy: each training row is scored only by the trees whose
# bootstrap sample did not contain it. This is the estimate the section is
# named after, so report it.
print("OOB score:", rf.oob_score_)

y_pred = rf.predict(X_test)

# Accuracy Test on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# A plain list of column names is accepted by every scikit-learn release;
# passing the pandas Index directly is rejected by 1.3.0's parameter validation.
feature_names = list(X_train.columns)

# Draw the first three trees of the forest, truncated to depth 4 for readability
for i in range(3):
    tree = rf.estimators_[i]

    plt.figure(figsize=(10,8))
    plot_tree(tree, feature_names=feature_names, class_names=['P.S.', 'U.S.'], filled=True, max_depth=4)

    # Only the second tree is written to disk
    if i == 1:
        plt.savefig('All Feature OOB.png')

    # Show each figure as it is drawn, like the other variants do
    plt.show()

In [None]:
# Confusion Matrix, normalised per true class so every row sums to 1
matrix = confusion_matrix(y_test, y_pred)
matrix = matrix.astype('float') / matrix.sum(axis = 1)[:, np.newaxis]

plt.figure(figsize =  (10,5))
sns.set(font_scale = 2)
sns.heatmap(matrix, annot=True, annot_kws = {'size':10}, cmap = plt.cm.Greens, linewidths = 0.2)

class_names = ['P.S.', 'U.S.']
# Heatmap cell i spans [i, i + 1], so its centre is i + 0.5. Both axes use the
# centres; ticks at 0 and 1 would put the x labels on the cell edges.
tick_centers = np.arange(len(class_names)) + 0.5
plt.xticks(tick_centers, class_names, rotation=25)
plt.yticks(tick_centers, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for All Features Random Forest Model (OOB)')
plt.savefig('Confusion Matrix for All Features Random Forest Model OOB.png')
plt.show()

In [None]:
# Per-class precision, recall and F1 score on the test set
report_text = classification_report(y_test, y_pred)
print(report_text)

#### 3. Cross Validating Best Hyperparameters Numeric Features

In [None]:
# Splitting Data (numeric features only)
# Build this variant's own split restricted to the numeric predictors.
# Without it, the search runs on the all-features split left over from the
# previous variant.
chosen_numeric = ['RSC', 'SAR', 'Na', 'E.C', 'TDS', 'HCO3', 'pH', 'Classification.1']
X = working_gw_df_encoded[chosen_numeric].drop('Classification.1', axis = 1)
y = working_gw_df_encoded['Classification.1']

# Reproducibility comes from random_state: random.seed() only seeds Python's
# `random` module, which scikit-learn and scipy.stats do not draw from.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 10)

# Search space: number of trees and maximum tree depth
param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

# Create a random forest classifier
rf = RandomForestClassifier(random_state = 10)

# Use random search (5 sampled candidates, 5-fold CV) to find the best hyperparameters
rand_search = RandomizedSearchCV(rf,
                                 param_distributions = param_dist,
                                 n_iter=5,
                                 cv=5,
                                 random_state=10)

# Fit the random search object to the training data
rand_search.fit(X_train, y_train)

# Estimator refit by the search with the winning parameter combination
best_rf = rand_search.best_estimator_

# Print the best hyperparameters
print('Best hyperparameters:',  rand_search.best_params_)

In [None]:
random.seed(10)
# Random Forest rebuilt from the hyperparameters chosen by the random search
rf = RandomForestClassifier(n_estimators = rand_search.best_params_['n_estimators'], max_depth = rand_search.best_params_['max_depth'], random_state = 10)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# Accuracy Test on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# A plain list of column names is accepted by every scikit-learn release;
# passing the pandas Index directly is rejected by 1.3.0's parameter validation.
feature_names = list(X_train.columns)

# Draw the first three trees of the forest, truncated to depth 4 for readability
for i in range(3):
    tree = rf.estimators_[i]

    plt.figure(figsize=(10,8))
    plot_tree(tree, feature_names=feature_names, class_names=['P.S.', 'U.S.'], filled=True, max_depth=4)

    # Only the second tree is written to disk
    if i == 1:
        plt.savefig('Numeric Features CV.png')

    plt.show()

In [None]:
# Average the per-tree feature importances of the search's best estimator
per_tree_importances = [estimator.feature_importances_ for estimator in best_rf.estimators_]
importances = np.mean(per_tree_importances, axis=0)

# Order features from most to least important
sorted_indices = importances.argsort()[::-1]
sorted_importances = importances[sorted_indices]
sorted_features = X_train.columns[sorted_indices]

# Bar chart, one bar per feature, labelled with the feature name
bar_positions = range(len(sorted_importances))
plt.figure(figsize=(10, 6))
plt.bar(bar_positions, sorted_importances, align='center')
plt.xticks(bar_positions, sorted_features, rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importances CV')
plt.tight_layout()
plt.savefig('Feature Importances CV.png')
plt.show()

In [None]:
# Confusion Matrix, normalised per true class so every row sums to 1
matrix = confusion_matrix(y_test, y_pred)
matrix = matrix.astype('float') / matrix.sum(axis = 1)[:, np.newaxis]

plt.figure(figsize =  (10,5))
sns.set(font_scale = 1.4)
sns.heatmap(matrix, annot=True, annot_kws = {'size':10}, cmap = plt.cm.Greens, linewidths = 0.2)

class_names = ['P.S.', 'U.S.']
# Heatmap cell i spans [i, i + 1], so its centre is i + 0.5. Both axes use the
# centres; ticks at 0 and 1 would put the x labels on the cell edges.
tick_centers = np.arange(len(class_names)) + 0.5
plt.xticks(tick_centers, class_names, rotation=25)
plt.yticks(tick_centers, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Numerical Features Random Forest Model (Cross Validated)')
plt.savefig('Confusion Matrix for Numerical Features Random Forest Model CV.png')
plt.show()

In [None]:
# Per-class precision, recall and F1 score on the test set
report_text = classification_report(y_test, y_pred)
print(report_text)

#### 4. Out of Bag Evaluation Numeric Features

In [None]:
# Splitting Data (numeric features only)
chosen_numeric = ['RSC', 'SAR', 'Na', 'E.C', 'TDS', 'HCO3', 'pH', 'Classification.1']
X = working_gw_df_encoded[chosen_numeric].drop('Classification.1', axis = 1)
y = working_gw_df_encoded['Classification.1']

# Reproducibility comes from random_state: random.seed() only seeds Python's
# `random` module, which scikit-learn does not draw from.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 10)

# Random Forest with out-of-bag scoring enabled
rf = RandomForestClassifier(oob_score = True, random_state = 10)
rf.fit(X_train, y_train)

# Out-of-bag accuracy: each training row is scored only by the trees whose
# bootstrap sample did not contain it. This is the estimate the section is
# named after, so report it.
print("OOB score:", rf.oob_score_)

y_pred = rf.predict(X_test)

# Accuracy Test on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# A plain list of column names is accepted by every scikit-learn release;
# passing the pandas Index directly is rejected by 1.3.0's parameter validation.
feature_names = list(X_train.columns)

# Draw the first three trees of the forest, truncated to depth 4 for readability
for i in range(3):
    tree = rf.estimators_[i]

    plt.figure(figsize=(10,8))
    plot_tree(tree, feature_names=feature_names, class_names=['P.S.', 'U.S.'], filled=True, max_depth=4)

    # Only the second tree is written to disk
    if i == 1:
        plt.savefig('Numeric Features OOB.png')

    plt.show()

In [None]:
# Forest-level feature importances as exposed by the fitted model
importances = rf.feature_importances_

# Order features from most to least important
sorted_indices = importances.argsort()[::-1]
sorted_importances = importances[sorted_indices]
sorted_features = X_train.columns[sorted_indices]

# Bar chart, one bar per feature, labelled with the feature name
bar_positions = range(len(sorted_importances))
plt.figure(figsize=(10, 6))
plt.bar(bar_positions, sorted_importances, align='center')
plt.xticks(bar_positions, sorted_features, rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importances OOB')
plt.tight_layout()
plt.savefig('Feature Importances OOB.png')
plt.show()

In [None]:
# Confusion Matrix, normalised per true class so every row sums to 1
matrix = confusion_matrix(y_test, y_pred)
matrix = matrix.astype('float') / matrix.sum(axis = 1)[:, np.newaxis]

plt.figure(figsize =  (10,5))
sns.set(font_scale = 1.4)
sns.heatmap(matrix, annot=True, annot_kws = {'size':10}, cmap = plt.cm.Greens, linewidths = 0.2)

class_names = ['P.S.', 'U.S.']
# Heatmap cell i spans [i, i + 1], so its centre is i + 0.5. Both axes use the
# centres; ticks at 0 and 1 would put the x labels on the cell edges.
tick_centers = np.arange(len(class_names)) + 0.5
plt.xticks(tick_centers, class_names, rotation=25)
plt.yticks(tick_centers, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Numerical Features Random Forest Model (OOB)')
plt.savefig('Confusion Matrix for Numerical Features Random Forest Model OOB.png')
plt.show()

In [None]:
# Per-class precision, recall and F1 score on the test set
report_text = classification_report(y_test, y_pred)
print(report_text)