In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the sports-management dataset from the working directory
dataset_path = 'sports_management_dataset.csv'
df = pd.read_csv(dataset_path)

# Preview the first ten rows
df.head(10)

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Event Scale,Event Focus,Sustainability Score,Social Impact Level,Resource Efficiency,Event Type Classification
0,Moderate,High,High,Moderate Engagement,Moderate,Moderate Impact,Moderate,Moderate,High Efficiency,Local,Community Development,Low,High Engagement,Moderate Efficiency,Health-Oriented
1,High,Moderate,High,Moderate Engagement,Low,Low Impact,Moderate,Low,Moderate Efficiency,National,Community Development,Moderate,Low Engagement,Moderate Efficiency,Recreational
2,High,High,High,Low Engagement,High,Moderate Impact,Moderate,Moderate,Moderate Efficiency,National,Community Development,High,Low Engagement,High Efficiency,Recreational
3,High,High,High,Moderate Engagement,Moderate,Moderate Impact,Moderate,Moderate,Moderate Efficiency,Regional,Community Development,High,Low Engagement,High Efficiency,Recreational
4,Moderate,High,Low,Low Engagement,Low,Moderate Impact,High,High,High Efficiency,Regional,Community Development,Low,Moderate Engagement,Moderate Efficiency,Community Development
5,Moderate,Moderate,High,Moderate Engagement,High,Low Impact,Moderate,Low,Moderate Efficiency,Regional,Community Development,High,Low Engagement,Moderate Efficiency,Community Development
6,Low,High,Moderate,Moderate Engagement,Low,Moderate Impact,Moderate,Moderate,High Efficiency,Regional,Youth-Focused,High,Moderate Engagement,Moderate Efficiency,Youth-Focused
7,High,High,Moderate,High Engagement,Low,High Impact,Low,Low,High Efficiency,National,Community Development,Moderate,High Engagement,Low Efficiency,Recreational
8,High,High,Moderate,High Engagement,Moderate,High Impact,Low,Low,Moderate Efficiency,Local,Community Development,Moderate,Low Engagement,Low Efficiency,Youth-Focused
9,High,High,High,Moderate Engagement,High,Moderate Impact,Low,Low,High Efficiency,Local,Recreational,Low,Moderate Engagement,Moderate Efficiency,Community Development


In [3]:
# get an overview of the dataset: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102000 entries, 0 to 101999
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   Energy Consumption           102000 non-null  object
 1   Carbon Emissions             102000 non-null  object
 2   Waste Generation             102000 non-null  object
 3   Community Engagement         102000 non-null  object
 4   Volunteer Participation      102000 non-null  object
 5   Health Impact                102000 non-null  object
 6   Water Usage                  102000 non-null  object
 7   Material Recycling Rate      102000 non-null  object
 8   Operational Cost Efficiency  102000 non-null  object
 9   Event Scale                  102000 non-null  object
 10  Event Focus                  102000 non-null  object
 11  Sustainability Score         102000 non-null  object
 12  Social Impact Level          102000 non-null  object
 13  Resource Effic

In [4]:
# Share of rows in each Sustainability Score level
# (normalize=True reports proportions of the total instead of raw counts)
score_levels = df['Sustainability Score']
score_levels.value_counts(normalize=True)

Sustainability Score
High        0.497029
Moderate    0.302853
Low         0.200118
Name: proportion, dtype: float64

In [5]:
# check for number of unique values for each column to get an idea of how to encode the data
# (the result is one count per column)
df.nunique()

Energy Consumption             3
Carbon Emissions               3
Waste Generation               3
Community Engagement           3
Volunteer Participation        3
Health Impact                  3
Water Usage                    3
Material Recycling Rate        3
Operational Cost Efficiency    3
Event Scale                    3
Event Focus                    4
Sustainability Score           3
Social Impact Level            3
Resource Efficiency            3
Event Type Classification      4
dtype: int64

In [6]:
# Prototype on a single column: split each label on whitespace and keep only its first word
df['Energy Consumption'].str.split().str.get(0)

0         Moderate
1             High
2             High
3             High
4         Moderate
            ...   
101995        High
101996    Moderate
101997    Moderate
101998         Low
101999        High
Name: Energy Consumption, Length: 102000, dtype: object

In [7]:
# Columns whose labels carry a trailing qualifier word (e.g. "Moderate Engagement")
cols = ['Community Engagement', 'Health Impact', 'Operational Cost Efficiency', 'Social Impact Level', 'Resource Efficiency']

# Keep only the first word of every label in those columns, column by column
df[cols] = df[cols].apply(lambda labels: labels.str.split().str[0])

df.head()

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Event Scale,Event Focus,Sustainability Score,Social Impact Level,Resource Efficiency,Event Type Classification
0,Moderate,High,High,Moderate,Moderate,Moderate,Moderate,Moderate,High,Local,Community Development,Low,High,Moderate,Health-Oriented
1,High,Moderate,High,Moderate,Low,Low,Moderate,Low,Moderate,National,Community Development,Moderate,Low,Moderate,Recreational
2,High,High,High,Low,High,Moderate,Moderate,Moderate,Moderate,National,Community Development,High,Low,High,Recreational
3,High,High,High,Moderate,Moderate,Moderate,Moderate,Moderate,Moderate,Regional,Community Development,High,Low,High,Recreational
4,Moderate,High,Low,Low,Low,Moderate,High,High,High,Regional,Community Development,Low,Moderate,Moderate,Community Development


In [8]:
# create a list of columns to encode
encode_cols = ['Energy Consumption', 'Carbon Emissions', 'Waste Generation', 'Community Engagement',
               'Volunteer Participation', 'Health Impact', 'Water Usage', 'Material Recycling Rate',
               'Operational Cost Efficiency', 'Sustainability Score', 'Social Impact Level', 'Resource Efficiency']

# Every column in encode_cols uses the same ordered scale, so the category list
# is repeated once per column: Low -> 0.0, Moderate -> 1.0, High -> 2.0.
levels = ['Low', 'Moderate', 'High']
ordinal_encoder = OrdinalEncoder(categories=[levels] * len(encode_cols))

# Fit a single encoder on all columns at once instead of refitting a one-column
# encoder inside a loop. The fitted encoder then describes every encoded column,
# not just the last one visited, so it can be reused (e.g. inverse_transform).
df[encode_cols] = ordinal_encoder.fit_transform(df[encode_cols])

df.head(10)


Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Event Scale,Event Focus,Sustainability Score,Social Impact Level,Resource Efficiency,Event Type Classification
0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,Local,Community Development,0.0,2.0,1.0,Health-Oriented
1,2.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,National,Community Development,1.0,0.0,1.0,Recreational
2,2.0,2.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,National,Community Development,2.0,0.0,2.0,Recreational
3,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,Regional,Community Development,2.0,0.0,2.0,Recreational
4,1.0,2.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,Regional,Community Development,0.0,1.0,1.0,Community Development
5,1.0,1.0,2.0,1.0,2.0,0.0,1.0,0.0,1.0,Regional,Community Development,2.0,0.0,1.0,Community Development
6,0.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,Regional,Youth-Focused,2.0,1.0,1.0,Youth-Focused
7,2.0,2.0,1.0,2.0,0.0,2.0,0.0,0.0,2.0,National,Community Development,1.0,2.0,0.0,Recreational
8,2.0,2.0,1.0,2.0,1.0,2.0,0.0,0.0,1.0,Local,Community Development,1.0,0.0,0.0,Youth-Focused
9,2.0,2.0,2.0,1.0,2.0,1.0,0.0,0.0,2.0,Local,Recreational,0.0,1.0,1.0,Community Development


In [None]:
#NEW CODE

In [None]:
#Module 12 Regression

In [None]:
#12.1.5 Linear Regression Model
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load the electricity generation dataset from the remote CSV
file_path = "https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_1/datasets/electricity-generation.csv"
df_electricity = pd.read_csv(file_path)

# Preview the first rows
df_electricity.head()

# Scatter plot of total generation (y axis) against year (x axis)
scatter_options = {
    "x": "Year",
    "y": "Total",
    "title": "Total electricity generation by year (GHz)",
}
electricity_plot = df_electricity.plot.scatter(**scatter_options)
electricity_plot

## Prepare the Data to Fit the Linear Regression Model
# Create the X set. scikit-learn expects a two-dimensional feature array, so the
# single "Year" column is reshaped into one column with one row per sample.
# The data must come from df_electricity: `df` is a different DataFrame that
# has no "Year" or "Total" column.
X = df_electricity["Year"].values.reshape(-1, 1)

# Display sample data
X[:5]

# Create an array for the dependent variable y with the total electricity generation data
y = df_electricity["Total"]

## Build the Linear Regression Model
# Create the model and fit it to the data; every later step (predict,
# intercept_, coef_, score) needs this fitted estimator.
model = LinearRegression()
model.fit(X, y)

# Make predictions using the X set
predicted_y_values = model.predict(X)

# Create a copy of the original data
df_electricity_predicted = df_electricity.copy()

# Add a column with the predicted electricity values
df_electricity_predicted["electricity_predicted"] = predicted_y_values

# Display sample data
df_electricity_predicted.head()

# Create a line plot of the predicted total electricity generation values
best_fit_line = df_electricity_predicted.plot.line(
    x="Year",
    y="electricity_predicted",
    color="red")
best_fit_line

# Superpose the original data and the best fit line
# Create a scatter plot with the electricity information
# NOTE(review): GHz is a unit of frequency; confirm the unit intended for the title.
electricity_plot = df_electricity_predicted.plot.scatter(
    x="Year",
    y="Total",
    title="Electricity Generation by Year (GHz)")

# Draw the predicted values as a line on the same axes as the scatter plot
best_fit_line = df_electricity_predicted.plot.line(
    x="Year",
    y="electricity_predicted",
    color="red",
    ax=electricity_plot)
electricity_plot

## Make Manual Predictions
# Display the formula to predict the electricity generation for 2023
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * 2023")

# Predict the electricity generation for 2023 by evaluating
# intercept + slope * year by hand
y_2023 = model.intercept_ + model.coef_[0] * 2023

# Display the prediction
# (a stray trailing "." after this call was a syntax error and has been removed)
print(f"Predicted electricity generation for 2023: {y_2023:.2f}")

## Make Predictions Using the `predict` Function
# Create an array to predict electricity generation for the years 2020, 2021, 2022, and 2023
X_years = np.array([2020, 2021, 2022, 2023])

# Format the array as a one-column array, the layout `predict` expects
X_years = X_years.reshape(-1, 1)

# Display sample data
X_years

# Predict electricity generation for the years 2020, 2021, 2022, and 2023
# (a stray trailing "." after this call was a syntax error and has been removed)
predicted_electricity = model.predict(X_years)

# Create a DataFrame for the predicted electricity generation;
# the years are flattened back to one dimension for the "Year" column
df_predicted_electricity = pd.DataFrame(
    {"Year": X_years.reshape(1, -1)[0],
     "predicted_electricity": predicted_electricity})

# Display data
df_predicted_electricity

## Linear Regression Model Assessment
# Import relevant metrics from Scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

# Raw metric values for the fitted model on the data it was trained on
raw_score = model.score(X, y, sample_weight=None)
raw_r2 = r2_score(y, predicted_y_values)
raw_mse = mean_squared_error(y, predicted_y_values)

# Rounded values used for reporting
score = round(raw_score, 5)
r2 = round(raw_r2, 5)
mse = round(raw_mse, 4)
# the root is taken from the already-rounded mse
rmse = round(np.sqrt(mse), 4)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")

In [None]:
#12.1.6 Encoding
import pandas as pd

# Load the car dataset; this file writes missing values as "?", so treat "?" as NA on read
car_data = pd.read_csv("https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_1/datasets/car-data.csv", na_values="?")
car_data

# Inspect the column dtypes
car_data.dtypes

# num-of-doors and num-of-cylinders store numbers spelled out as words.
# Map each number word to its integer value.
str_to_int = {"two": 2,
              "three": 3,
              "four": 4,
              "five": 5,
              "six": 6,
              "eight": 8,
              "twelve": 12}

# Replace whole-cell matches (regex=False) in the two word-number columns
word_columns = ["num-of-doors", "num-of-cylinders"]
car_data[word_columns] = car_data[word_columns].replace(str_to_int, regex=False)
car_data

# Inspect the dtypes again after the replacement
car_data.dtypes

## Pandas encoding methods
# One-hot style encoding with pd.get_dummies()
car_data_dummies = pd.get_dummies(car_data)
car_data_dummies.head()

# Inspect the resulting column names
car_data_dummies.columns

# Columns to turn into integer category codes
columns_to_encode = ["make",
                     "fuel-type",
                     "aspiration",
                     "body-style",
                     "drive-wheels",
                     "engine-location",
                     "engine-type",
                     "fuel-system"]

# Work on a copy so car_data itself keeps its original labels
car_data_cat_codes = car_data.copy()

# Convert every listed column to its category codes in a single column-wise pass
car_data_cat_codes[columns_to_encode] = car_data_cat_codes[columns_to_encode].apply(
    lambda labels: labels.astype("category").cat.codes
)

car_data_cat_codes.head()

# Inspect the dtypes of the encoded copy
car_data_cat_codes.dtypes

## Scikit-learn encoding methods
# OneHotEncoder
# Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
from sklearn.preprocessing import OneHotEncoder

# Features that receive one-hot columns
categorical_features = car_data[columns_to_encode]

# Create the encoder and fit it in one step (fit returns the encoder itself)
enc = OneHotEncoder(handle_unknown='ignore').fit(categorical_features)

# Transform the data; with the default settings the result is a sparse matrix
car_data_ohe = enc.transform(categorical_features)
car_data_ohe

# Names of the generated features
enc.get_feature_names_out()

# Second encoder configured to return a dense pandas DataFrame instead
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.set_output(transform="pandas")

# Fit and transform in one call
car_data_ohe = ohe.fit_transform(categorical_features)
car_data_ohe.head()

# LabelEncoder
# Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
from sklearn.preprocessing import LabelEncoder

# Create an instance of the label encoder
le = LabelEncoder()

# Copy car_data so the original labels are left untouched
car_data_label_encoded = car_data.copy()

# Fit and transform the label encoder for each column.
# The loop body must be indented; without the indent the cell does not parse.
# The same encoder object is refit on every pass, so after the loop `le`
# only holds the classes of the last column.
for column in columns_to_encode:
    car_data_label_encoded[column] = le.fit_transform(car_data_label_encoded[column])

car_data_label_encoded.head()

# Check dtypes
car_data_label_encoded.dtypes

In [None]:
#12.1.9 Predict
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load the fuel-consumption CSV into a DataFrame
Lp100km = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_1/datasets/liters-per-100km.csv')
Lp100km.head()

## Visualize the Data to Find Any Linear Trends
# Scatter each candidate feature against L/100km to look for a linear trend
for candidate in ('cylinders', 'displacement'):
    Lp100km.plot.scatter(x=candidate, y='L/100km')

# X holds the two chosen features, y the target.
# scikit-learn works with two-dimensional arrays, hence the explicit reshape calls.
feature_names = ["weight (kg)", "displacement"]
X = Lp100km[feature_names].values.reshape(-1, 2)
y = Lp100km["L/100km"].values.reshape(-1, 1)

print("Shape: ", X.shape, y.shape)

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create the model and fit it to the training data (fit returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Metrics used to score the predictions on the testing data
from sklearn.metrics import mean_squared_error, r2_score

# Predict the held-out samples
predicted = model.predict(X_test)

# Score the predictions with mse and r2
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)

print(f"mean squared error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

# The model's own `score()` method also reports R2 on the testing data
model.score(X_test, y_test)

In [None]:
#12.3.1 Regression
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the label-encoded rent dataset
df = pd.read_csv("https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_3/datasets/rent-data-label-encoded.csv")
df.head()

# Discard rows that contain missing values
df = df.dropna()

## Split into training and testing sets
# X_full: every column except the price target
X_full = df.drop(columns = ['price'])
X_full.columns

# X_sel: only the hand-picked subset of features
select_features = ["square_feet", "Gated", "bathrooms", "bedrooms", "has_photo", "Pool", "AC"]
X_sel = df[select_features]
X_sel.head()

# Target variable y as a single-column array
y = df["price"].values.reshape(-1, 1)

# Split all three inputs in one call so both feature sets share the same rows;
# the function returns a train/test pair for each input, in input order
split_parts = train_test_split(X_full, X_sel, y, random_state=42)
X_full_train, X_full_test, X_sel_train, X_sel_test, y_train, y_test = split_parts

## Train the models
# lr1 is trained on all features, lr2 on the selected features (fit returns the estimator)
lr1 = LinearRegression().fit(X_full_train, y_train)
lr2 = LinearRegression().fit(X_sel_train, y_train)

## Evaluate the model
# Predict the testing data with each model
predicted1 = lr1.predict(X_full_test)
predicted2 = lr2.predict(X_sel_test)

# Mean squared error and R-squared for the all-features model
mse1 = mean_squared_error(y_test, predicted1)
r21 = r2_score(y_test, predicted1)

# Mean squared error and R-squared for the select-features model
mse2 = mean_squared_error(y_test, predicted2)
r22 = r2_score(y_test, predicted2)

print(f"All Features:")
print(f"mean squared error (MSE): {mse1}")
print(f"R-squared (R2): {r21}")
print("---------------------")
print(f"Select Features:")
print(f"mean squared error (MSE): {mse2}")
print(f"R-squared (R2): {r22}")

# Provided code to create the adjusted r-squared function
def r2_adj(x, y, model):
    """Return the adjusted R-squared of ``model`` scored on ``x`` and ``y``.

    The plain R-squared from ``model.score(x, y)`` is corrected for the number
    of feature columns: 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is
    ``len(y)`` and p is ``x.shape[1]``.

    Parameters:
        x: two-dimensional feature data exposing ``.shape``.
        y: target values; only its length is used here.
        model: object with a ``score(x, y)`` method.

    Returns:
        The adjusted R-squared value.
    """
    plain_r2 = model.score(x, y)
    n_features = x.shape[1]
    n_samples = len(y)
    unexplained = 1 - plain_r2
    return 1 - unexplained * (n_samples - 1) / (n_samples - n_features - 1)

# Adjusted r-squared of each model on the testing data
adj_score1 = r2_adj(X_full_test, y_test, lr1)
adj_score2 = r2_adj(X_sel_test, y_test, lr2)
print(f"All Features Adjusted R2: {adj_score1}")
print(f"Select Features Adjusted R2: {adj_score2}")

# Cross-validate a fresh linear regression on the full-feature training data,
# scoring each fold with r2
from sklearn.model_selection import cross_val_score
cv_estimator = LinearRegression()
cv_scores = cross_val_score(cv_estimator, X_full_train, y_train, scoring = "r2")
print(f"All scores: {cv_scores}")
print(f"Mean score: {cv_scores.mean()}")
print(f"Standard Deviation: {cv_scores.std()}")

In [None]:
#12.2.7 Ridge and Lasso regression model
# Ridge - A ridge regression model is a type of linear regression model that addresses the issue of multicollinearity 
#   (highly correlated independent variables) by adding a penalty term to the regression equation, effectively shrinking 
#   the coefficients towards zero and stabilizing the model, preventing overfitting; it's considered a form of regularization, 
#   specifically known as L2 regularization.
#   MSE (mean squared error) tells you how well the model is performing by measuring the average squared difference
#   between its predicted values and the actual values.
#   A good mse score is a low value, close to zero.
# Lasso - Lasso regression, or Least Absolute Shrinkage and Selection Operator, is a regularization technique that improves the 
#   accuracy of statistical models by preventing overfitting.
#   Lasso adds an L1 penalty, which simultaneously performs feature selection by shrinking some
#   coefficients to zero, making it useful for understanding which variables are most important in the prediction process.
#   MSE tells you how well the model is performing on average.
#   A good mse score is a low value, close to zero.

from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import pandas as pd

# Load the real-estate evaluation dataset
df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_2/datasets/real-estate-evaluation.csv')
df.head()

# Features are every column except the target; the target is the unit-area house price
target_column = 'Y house price of unit area'
X = df.drop(columns=target_column)
y = df[target_column]

# Inspect the shape of the feature set
X.shape

### Perform ridge regression
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the scaler on the training data only, then scale the training data with it
scaler = StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)

# Create and train the ridge model (fit returns the estimator)
model = Ridge(alpha=1).fit(X_train_transformed, y_train)

# Scale the testing data with the same scaler and predict
X_test_transformed = scaler.transform(X_test)
y_predicted = model.predict(X_test_transformed)

# MSE of the ridge predictions
mean_squared_error(y_test, y_predicted)

# Let RidgeCV choose alpha from a list of candidate values
from sklearn.linear_model import RidgeCV
candidate_alphas = [0.001, 0.01, 0.1, 1, 10]
model_cv = RidgeCV(alphas=candidate_alphas).fit(X_train_transformed, y_train)

# The alpha value RidgeCV selected
model_cv.alpha_

# Baseline for comparison: plain linear regression on the same scaled data,
# evaluated with the same MSE metric
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression().fit(X_train_transformed, y_train)
y_predicted_lr = lr_model.predict(X_test_transformed)
mean_squared_error(y_test, y_predicted_lr)

### Lasso regression
from sklearn.linear_model import Lasso

# Create the lasso model and train it on the scaled training data (fit returns the estimator)
lasso_model = Lasso(alpha=1).fit(X_train_transformed, y_train)

# Inspect the fitted coefficients
lasso_model.coef_

# Predict the scaled testing data
y_predicted_lasso = lasso_model.predict(X_test_transformed)

### Assess the lasso regression MSE and compare to ridge regression
# MSE of the lasso predictions
mean_squared_error(y_test, y_predicted_lasso)

In [None]:
## Do we want to create a pipeline? If so, see 12.3.3

In [None]:
#Module 13 Classification

In [None]:
#13.1.2 Logistic Regression Model
## Prepare the Data
# Import the required modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Read in the app-data.csv file into a Pandas DataFrame.
file_path = "https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_1/datasets/app-data.csv"
app_data = pd.read_csv(file_path)

# Review the DataFrame
app_data.head()

# The column 'Result' is the thing you want to predict.
# Class 0 indicates a benign app and class 1 indicates a malware app
# Using value_counts, how many malware apps are in this dataset?
app_data["Result"].value_counts()

## Split the data into training and testing sets
# Import Module
from sklearn.model_selection import train_test_split
# The target column `y` should be the binary `Result` column.
y = app_data["Result"]

# The `X` should be all of the features.
X = app_data.copy()
X = X.drop(columns="Result")

# Split the dataset using the train_test_split function.
# A fixed random_state makes the split, and therefore every score below,
# reproducible across runs, matching the other splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Model and Fit the Data to a Logistic Regression
# Import `LogisticRegression` from sklearn
from sklearn.linear_model import LogisticRegression

# Declare a logistic regression model.
# Apply a random_state of 7 and max_iter of 120 to the model
logistic_regression_model = LogisticRegression(random_state=7, max_iter=120)

# Fit and save the logistic regression model using the training data
# (fit returns the estimator, so lr_model and logistic_regression_model are the same object)
lr_model = logistic_regression_model.fit(X_train, y_train)

# Validate the model
print(f"Training Data Score: {lr_model.score(X_train, y_train)}")
print(f"Testing Data Score: {lr_model.score(X_test, y_test)}")

## Predict the Testing Labels
# Generate predictions from the model we just fit.
# Note: these first predictions are made on the TRAINING features.
predictions = logistic_regression_model.predict(X_train)

# Convert those predictions (and actual values) to a DataFrame
results_df = pd.DataFrame({"Prediction": predictions, "Actual": y_train})
results_df

# Make and save testing predictions with the saved logistic regression model using the test data
testing_predections = lr_model.predict(X_test)

# Review the predictions
testing_predections

## Calculate the Performance Metrics
# Import the accuracy_score function
from sklearn.metrics import accuracy_score

# Display the accuracy score for the test dataset.
accuracy_score(y_test, testing_predections)

In [None]:
#13.1.6 SVM Model
# Import required dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC 

# Load the app dataset
file_path = "https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_1/datasets/app-data.csv"
df = pd.read_csv(file_path)
df.head()

## Split the data into training and testing sets
# Target: the "Result" column
y = df["Result"]

# Features: a new frame holding every column except "Result"
X = df.drop(columns="Result")
X.head()

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Model and Fit to a Support Vector Machine
# Support vector classifier with a 'linear' kernel
model = SVC(kernel='linear')

# Train on the training split
model.fit(X_train, y_train)

# Accuracy on each split via model.score
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print('Train Accuracy: %.3f' % train_accuracy)
print('Test Accuracy: %.3f' % test_accuracy)

## Predict the Testing Labels
# Predict the testing data with the fitted SVM
testing_predictions = model.predict(X_test)

# Review the predictions
testing_predictions

## Evaluate the Model
# Accuracy score for the testing predictions
accuracy_score(y_test, testing_predictions)

In [None]:
#13.2.2 KNN Model
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Features: a new frame with every column of app_data except "Result"
X = app_data.drop("Result", axis=1)
X.head()

# Target vector
y = app_data["Result"]
y.head()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

# Fit the scaler on the training data only
X_scaler = StandardScaler().fit(X_train)

# Scale both splits with that fitted scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# K-nearest neighbors
# Try a range of k values and record train/test accuracy for each.
# Only odd k values are used so that votes cannot tie.
k_values = range(1, 20, 2)
train_scores, test_scores = [], []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Plot accuracy against k for both splits
plt.plot(k_values, train_scores, marker='o', label="training scores")
plt.plot(k_values, test_scores, marker="x", label="testing scores")
plt.xlabel("k neighbors")
plt.ylabel("accuracy score")
plt.legend()
plt.show()

# Refit the classifier with the chosen k.
# k = 9 was picked as the point where the scores start to stabilize.
knn = KNeighborsClassifier(n_neighbors=9).fit(X_train_scaled, y_train)

# Accuracy of the chosen model on the testing data
print('k=9 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

In [None]:
#13.2.4 Decision Tree Model
# Initial imports
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

# *Skipped loading data

# Features: a new frame with every column of app_data except "Result"
X = app_data.drop("Result", axis=1)
X.head()

# Target as a single-column array
y = app_data["Result"].values.reshape(-1, 1)
y[:5]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create the scaler and fit it on the training data only
# (fit returns the scaler itself, so both names refer to the same object)
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Scale both splits with the fitted scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Decision Tree Model
# Create the decision tree classifier and fit it on the scaled training data
model = tree.DecisionTreeClassifier().fit(X_train_scaled, y_train)

## Making Predictions Using the Tree Model
# Predict the scaled testing data
predictions = model.predict(X_test_scaled)

## Model Evaluation
# Accuracy of the predictions against the testing labels
acc_score = accuracy_score(y_test, predictions)

print(f"Accuracy Score : {acc_score}")

## Visualizing the Decision Tree
# Export the fitted tree as DOT text; max_depth=5 limits how deep the drawing goes
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=X.columns,
    class_names=["0", "1"],
    filled=True,
    max_depth=5,
)

# Build the graph from the DOT text
graph = pydotplus.graph_from_dot_data(dot_data)

# Render the graph as a PNG image
Image(graph.create_png())

# graph.write_<file_type>() must be given the target path as a string.
# Save the tree as PDF first, then as PNG.
for file_path, write_graph in (("malware_tree.pdf", graph.write_pdf),
                               ("malware_tree.png", graph.write_png)):
    write_graph(file_path)

In [None]:
#13.2.5 & 13.2.6 Random Forest
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# *Skipped loading data

## Loading and Preprocessing Malware Apps Data
# Define features set: every column except the "Result" label.
# The malware data is bound to `app_data` earlier in this notebook; the name
# `df_apps` is never assigned, so referencing it fails on a fresh kernel.
X = app_data.drop(columns="Result")
X.head()

# Define target set as a one-dimensional NumPy array.
# (Series.ravel() is deprecated in recent pandas releases; to_numpy() gives the same array.)
y = app_data["Result"].to_numpy()
y[:5]

## Fitting the Random Forest Model
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Scale the data: fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Random Forest model on the scaled training data
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)

# Evaluate the model
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

# Get the feature importance array
feature_importances = clf.feature_importances_

# List the top 10 most important features (highest importance first)
importances_sorted = sorted(zip(feature_importances, X.columns), reverse=True)
importances_sorted[:10]

# Plot the feature importances, sorted ascending by importance
features = sorted(zip(X.columns, feature_importances), key = lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

fig, ax = plt.subplots()

fig.set_size_inches(8,6)
plt.margins(y=0.001)

# Horizontal bars: one per feature, bar length = importance
ax.barh(y=cols, width=width)

plt.show()

## Making Predictions Using the Random Forest Model
# Make predictions using the testing data.
# The fitted forest is bound to `clf` (no `rf_model` is ever created), and it
# was trained on scaled features, so it must predict on X_test_scaled.
predictions = clf.predict(X_test_scaled)

## Model Evaluation
# Get the feature importance array
importances = clf.feature_importances_
# List the top 10 most important features (highest importance first)
importances_sorted = sorted(zip(importances, X.columns), reverse=True)
importances_sorted[:10]

## Analysis Questions

Finally, analyze the model's evaluation results and answer the following questions.

* **Question:** Would you trust this model to detect malware? 

    * **Sample Answer:** Yes. The model's high accuracy on the testing data indicates that it predicts malware well. 

* **Question:** Out of the following models, which one had the highest accuracy score: logistic regression, SVM, 
KNN, decision tree, or random forest?

    * **Sample Answer:** Random forest performed marginally better (about 0.007) than the other models, which all 
performed in a similar range. Other performance metrics should be calculated to determine the best model.

In [None]:
#13.2.7 Extra Trees Model, Gradient Boosting Model and Adaptive Boosting Model
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Load the forest cover dataset
df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_2/datasets/covtype.csv')

# Separate the 'cover' target from the feature columns
y = df['cover']
X = df.drop('cover', axis=1)
target_names = ["Spruce/Fir", "Lodgepole Pine"]

# Split with a fixed seed, then standardize using the training split's statistics
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit and score each ensemble in turn: extra trees, gradient boosting, AdaBoost.
# `clf` is rebound on every pass, so after the loop it holds the fitted
# AdaBoost model.
for ensemble_cls in (ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier):
    clf = ensemble_cls(random_state=1).fit(X_train_scaled, y_train)
    print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
    print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

In [None]:
#13.3.1 Runs through multiple models
# Import required dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Load the tic-tac-toe dataset
file_path = "https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_3/datasets/tic-tac-toe.csv"
df = pd.read_csv(file_path)
df.head()

## Preprocess the data
# Inspect the column data types
df.dtypes

# The "Class" target column is an object column, so convert it to numerical
# class labels with a LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df["Class"])
y

# The features are every column except "Class"
X = df.drop(columns="Class")
X.head()

# Split data into training and testing sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# All feature columns are objects: fit a one-hot encoder on the training split
# and apply that same fitted encoder to both splits
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype='int')
ohe.fit(X_train)
X_train_encoded = pd.DataFrame(data=ohe.transform(X_train), columns=ohe.get_feature_names_out())
X_test_encoded = pd.DataFrame(data=ohe.transform(X_test), columns=ohe.get_feature_names_out())
X_train_encoded

## Build the five classifiers to compare
# Logistic regression classifier with a random_state of 1
lr_model = LogisticRegression(random_state=1)
# Support vector machine classifier with a 'linear' kernel
svm_model = SVC(kernel='linear')
# KNN model with 5 neighbors
knn_model = KNeighborsClassifier(n_neighbors=5)
# Decision tree classifier with default settings
dt_model = DecisionTreeClassifier()
# Random forest classifier with n_estimators=128 and random_state=1
rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

## Fit each model to the encoded training data, then validate it by printing
## model.score on the training and testing sets (same order as built above)
for candidate in (lr_model, svm_model, knn_model, dt_model, rf_model):
    candidate.fit(X_train_encoded, y_train)
    print('Train Accuracy: %.3f' % candidate.score(X_train_encoded, y_train))
    print('Test Accuracy: %.3f' % candidate.score(X_test_encoded, y_test))


In [None]:
#13.3.2 Iris Decision Tree
from sklearn import tree
from sklearn.datasets import load_iris

import pydotplus
from IPython.display import Image

# Load the Iris dataset
iris = load_iris()

# Fit a decision tree on the full dataset and score it on that same data
clf = tree.DecisionTreeClassifier().fit(iris.data, iris.target)
clf.score(iris.data, iris.target)

# Export the fitted tree as Graphviz DOT data, labelled with the dataset's
# feature and class names
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Build the graph from the DOT data and write it to iris.png
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('iris.png')

# Show the rendered graph
Image(graph.create_png())

In [None]:
#13.3.4 Regressors
%matplotlib widget
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_regression, make_swiss_roll
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')

# Create a synthetic regression dataset, split it, and standardize the features
# using statistics learned from the training split
X, y = make_regression(random_state=1)
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
data = [X_train_scaled, X_test_scaled, y_train, y_test]

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR

# Fit and score each regressor on the same prepared split
for regressor in (
    LinearRegression(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    ExtraTreesRegressor(),
    AdaBoostRegressor(),
    SVR(C=1.0, epsilon=0.2),
):
    test_model(regressor, data)

# Create a noisy 500-sample swiss roll dataset, split it, and standardize the
# features using statistics learned from the training split
X, y = make_swiss_roll(random_state=1, n_samples=500, noise=1)
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
data = [X_train_scaled, X_test_scaled, y_train, y_test]

X_train_scaled

# Plot the result (requires matplotlib>=3.9)
# Each point is colored by its target value relative to the largest target
point_colors = plt.cm.jet(y / y.max())
ax = plt.figure().add_subplot(projection='3d')
ax.view_init(7, -80)
ax.scatter(X[0], X[1], X[2], color=point_colors, s=20, edgecolor='k')
plt.savefig("swiss_roll.png")
plt.show()

# Fit and score each regressor on the swiss roll split
for regressor in (
    LinearRegression(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    ExtraTreesRegressor(),
    AdaBoostRegressor(),
    SVR(C=1.0, epsilon=0.2),
):
    test_model(regressor, data)

In [None]:
#14.1.4 Metrics
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, roc_auc_score

# Load the imbalanced crowdfunding dataset
df = pd.read_csv("https://static.bc-edx.com/ai/ail-v-1-0/m14/lesson_1/datasets/crowdfunding-data-imbalanced.csv")
df.head()

# Count how many rows fall into each outcome class
df['outcome'].value_counts()

# 'outcome' is the target; every other column is a feature
y = df['outcome']
X = df.drop(columns=['outcome'])

# Fit a logistic regression model on the full dataset
# (no train/test split is made in this cell)
classifier = LogisticRegression().fit(X, y)

# Accuracy on the same data the model was fitted on
classifier.score(X, y)

# Predicted labels for that same data
predictions = classifier.predict(X)

# Confusion matrix, with label 1 listed before label 0
print(confusion_matrix(y, predictions, labels = [1,0]))

# Classification report, with the same label order
print(classification_report(y, predictions, labels = [1, 0]))

# Balanced accuracy score
print(balanced_accuracy_score(y, predictions))

# Predicted probabilities: one row per sample
pred_probas = classifier.predict_proba(X)
pred_probas

# Each row holds a probability for both the 0 class and the 1 class.
# Only the 1-class probability is needed, i.e. the second entry (index 1)
# of every row.
pred_probas_firsts = list(pred_probas[:, 1])

# First 5 of those probabilities
pred_probas_firsts[0:5]

# ROC AUC computed from the 1-class probabilities
print(roc_auc_score(y, pred_probas_firsts))

In [None]:
#14.1.7 Overfitting (another example is 14.3.1 to show the max depth if overfitted)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# Imported here so this cell does not rely on an import made by another cell
from sklearn.metrics import balanced_accuracy_score

# Import the data
df = pd.read_csv("https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_2/datasets/crowdfunding-data.csv")
df.info()

# Create an X and y variable
X = df.drop(columns=['outcome'])
y = df['outcome']

# Split the data into training and testing sets
# (seeded so the scores and the depth plot below are reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a Random Forest model
classifier = RandomForestClassifier(random_state=42)

# Fit (train) the model using the training data
classifier.fit(X_train, y_train)

# Check the model's balanced accuracy on the test set
# (uses `classifier`, the model fitted above)
y_pred = classifier.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

# Check the model's balanced accuracy on the training set
y_train_pred = classifier.predict(X_train)
print(balanced_accuracy_score(y_train, y_train_pred))

# Calculate the accuracy of the model on the testing data
classifier.score(X_test, y_test)

# Calculate the accuracy of the model on the training data
classifier.score(X_train, y_train)

# Create a loop to vary the max_depth parameter
# Make sure to record the train and test scores
# for each pass.

# range(1, 15) yields depths 1 through 14 in steps of 1
# (the stop value, 15, is exclusive)
depths = range(1, 15)

# The scores dictionary will hold depths and scores
# to make plotting easy
scores = {'train': [], 'test': [], 'depth': []}

# Loop through each depth
for depth in depths:
    # Same seed for every depth so max_depth is the only thing that varies
    clf = RandomForestClassifier(max_depth=depth, random_state=42)
    clf.fit(X_train, y_train)

    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)

    scores['depth'].append(depth)
    scores['train'].append(train_score)
    scores['test'].append(test_score)

# Create a dataframe from the scores dictionary and
# set the index to depth
scores_df = pd.DataFrame(scores).set_index('depth')

# Plot the scores dataframe with the plot method
scores_df.plot()

# Fit the model with the chosen max_depth
# NOTE(review): 6 is hard-coded; confirm it still matches the plot above
clf = RandomForestClassifier(max_depth=6, random_state=42)
clf.fit(X_train, y_train)

train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

# Print the train and test balanced accuracy scores
print(balanced_accuracy_score(y_train, train_pred))
print(balanced_accuracy_score(y_test, test_pred))

In [None]:
#14.2.3 Data Leakage and Correlation
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the crowdfunding dataset used for the data leakage exercise
df = pd.read_csv("https://static.bc-edx.com/ai/ail-v-1-0/m14/lesson_2/datasets/crowdfunding-data-leakage.csv")

# 'outcome' is the target; every other column is a feature
y = df['outcome']
X = df.drop(columns=['outcome'])

# Split into training and testing sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 13)

# Create a seeded Random Forest model and fit it to the training data
classifier = RandomForestClassifier(random_state=13).fit(X_train, y_train)

# Accuracy of the model on the training data
classifier.score(X_train, y_train)

# Accuracy of the model on the testing data
classifier.score(X_test, y_test)

# Look at the columns to identify any that could be leaking data
df.head()

# Correlation of every column with the outcome column, sorted ascending
outcome_corr = df.corr()['outcome']
outcome_corr.sort_values()

# Scatter plot of rewards_given against outcome
df.plot.scatter(x='rewards_given', y='outcome')

In [None]:
#14.2.5 Encoding
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

# Import the data
df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m14/lesson_2/datasets/text-data.csv')
df.head()

# Create X and y and split into training and testing sets
X = df.drop(columns='arrived')
y = df['arrived']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

# Decide how to encode the backpack_color column
X_train['backpack_color'].value_counts()

# Create a one-hot encoder for the backpack_color column
backpack_color_ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

# Train the encoder on the training split only
backpack_color_ohe.fit(X_train['backpack_color'].values.reshape(-1,1))

# Decide how to encode the grade column. Inspect the training split only, as
# with backpack_color, so test rows do not inform preprocessing choices.
X_train['grade'].value_counts()

# Create an ordinal encoder for the grade column, ordered F < D < C < B < A;
# missing and unknown values are both encoded as -1
grade_ord_enc = OrdinalEncoder(categories = [['F', 'D', 'C', 'B', 'A']], encoded_missing_value=-1, handle_unknown='use_encoded_value', unknown_value=-1)

# Train the encoder on the training split only
grade_ord_enc.fit(X_train['grade'].values.reshape(-1,1))

# Decide how to encode the favorite_creature column (training split only)
X_train['favorite_creature'].value_counts()

# Create a one-hot encoder for the favorite_creature column; with
# min_frequency=0.2, categories seen in less than 20% of the training rows
# are grouped together as infrequent
creature_ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False, min_frequency=0.2)

# Train the encoder on the training split only
creature_ohe.fit(X_train['favorite_creature'].values.reshape(-1,1))

# Create a function using the pretrained encoders to use on
# any new data (including the testing data)

def X_preprocess(X_data):
    """Encode the categorical columns of ``X_data`` with the pre-fitted encoders.

    Uses the module-level encoders ``backpack_color_ohe``, ``grade_ord_enc``
    and ``creature_ohe``; none of them is re-fitted here.

    Args:
        X_data: DataFrame containing the ``backpack_color``, ``grade`` and
            ``favorite_creature`` columns.

    Returns:
        A new DataFrame holding the one-hot backpack color columns, then the
        one-hot favorite creature columns, then a ``grade`` column with the
        ordinal codes. It is built from plain arrays, so it carries a default
        index rather than ``X_data``'s index.
    """
    def _as_column(name):
        # Single column reshaped to (n_rows, 1), as done when fitting the encoders
        return X_data[name].values.reshape(-1, 1)

    # Transform each column into a numpy array
    color_array = backpack_color_ohe.transform(_as_column('backpack_color'))
    grade_array = grade_ord_enc.transform(_as_column('grade'))
    creature_array = creature_ohe.transform(_as_column('favorite_creature'))

    # Put the one-hot blocks side by side, labelled with the encoders' feature names
    out_df = pd.concat(
        [
            pd.DataFrame(color_array, columns=backpack_color_ohe.get_feature_names_out()),
            pd.DataFrame(creature_array, columns=creature_ohe.get_feature_names_out()),
        ],
        axis=1,
    )

    # Append the ordinal grade codes as the last column
    out_df['grade'] = grade_array

    return out_df

# Preprocess the training data
# (the returned DataFrame is not assigned to a name, so it is discarded here)
X_preprocess(X_train)

# Preprocess the testing data with the same encoders fitted on the training data
# (also not assigned to a name)
X_preprocess(X_test)

In [None]:
#14.3.2 Hyperparameters

In [None]:
#14.3.4 Resampling
## Prepare the Data

# Import modules
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Load the bank dataset into a Pandas DataFrame
bank_data_df = pd.read_csv('../Resources/bank.csv')

# Review the DataFrame
bank_data_df.head()

# The target is the 'y' column
y = bank_data_df['y']

# The features are every other column, with the categorical variables
# encoded using get_dummies
X = pd.get_dummies(bank_data_df.drop(columns='y'))

# Review the features DataFrame
X.head()

# Split data into training and testing datasets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Review how many training rows fall into each class of y
y_train.value_counts()

# Fit a StandardScaler on the training data
# (`X_scaler` refers to the same fitted scaler object as `scaler`)
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Transform both splits with the scaler fitted on the training data
X_train_scaled, X_test_scaled = X_scaler.transform(X_train), X_scaler.transform(X_test)

## RandomForestClassifier
# Import the RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Instantiate a RandomForestClassifier instance; seeded so the baseline
# classification report is reproducible from run to run
model = RandomForestClassifier(random_state=1)

# Fit the training data to the model
model.fit(X_train_scaled, y_train)

# Predict labels for original scaled testing features
y_pred = model.predict(X_test_scaled)

## Random Undersampler
# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate a RandomUnderSampler instance
rus = RandomUnderSampler(random_state=1)

# Resample the scaled training data with the random undersampler
X_undersampled, y_undersampled = rus.fit_resample(X_train_scaled, y_train)

# Count distinct values for the resampled target data
y_undersampled.value_counts()

# Instantiate a new RandomForestClassifier model; seeded like the sampler so
# the classification report is reproducible from run to run
model_undersampled = RandomForestClassifier(random_state=1)

# Fit the undersampled data to the new model
model_undersampled.fit(X_undersampled, y_undersampled)

# Predict labels for the scaled testing features (the test set is not resampled)
y_pred_undersampled = model_undersampled.predict(X_test_scaled)

# Print classification reports
print("Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print("Classification Report - Undersampled Data")
print(classification_report(y_test, y_pred_undersampled))

## Random Oversampler
# Import RandomOverSampler from imblearn
from imblearn.over_sampling import RandomOverSampler

# Instantiate a RandomOverSampler instance
ros = RandomOverSampler(random_state=1)

# Resample the scaled training data with the `RandomOverSampler`
X_oversampled, y_oversampled = ros.fit_resample(X_train_scaled, y_train)

# Count distinct values
y_oversampled.value_counts()

# Instantiate a new RandomForestClassifier model; seeded like the sampler so
# the classification report is reproducible from run to run
model_oversampled = RandomForestClassifier(random_state=1)

# Fit the oversampled data to the new model
model_oversampled.fit(X_oversampled, y_oversampled)

# Predict labels for the scaled testing features (the test set is not resampled)
y_pred_oversampled = model_oversampled.predict(X_test_scaled)

# Print classification reports
print("Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print("Classification Report - Undersampled Data")
print(classification_report(y_test, y_pred_undersampled))
print("---------")
print("Classification Report - Oversampled Data")
print(classification_report(y_test, y_pred_oversampled))

## Cluster Centroids
# Import ClusterCentroids from imblearn
from imblearn.under_sampling import ClusterCentroids

# Instantiate a ClusterCentroids instance
cc_sampler = ClusterCentroids(random_state=1)

# Resample the scaled training data with the cluster centroids sampler
X_resampled, y_resampled = cc_sampler.fit_resample(X_train_scaled, y_train)

# Count distinct values for the resampled target data
y_resampled.value_counts()

# Instantiate a new RandomForestClassifier model; seeded like the sampler so
# the classification report is reproducible from run to run
cc_model = RandomForestClassifier(random_state=1)

# Fit the resampled data to the new model
cc_model.fit(X_resampled, y_resampled)

# Predict labels for the scaled testing features (the test set is not resampled)
cc_y_pred = cc_model.predict(X_test_scaled)

# Print classification reports
print("Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print("Classification Report - Resampled Data - CentroidClusters")
print(classification_report(y_test, cc_y_pred))

## SMOTE
# Import SMOTE from imblearn
from imblearn.over_sampling import SMOTE

# Instantiate the SMOTE instance
# Set the sampling_strategy parameter equal to auto
smote_sampler = SMOTE(random_state=1, sampling_strategy='auto')

# Resample the scaled training data with the smote_sampler
# (this rebinds X_resampled / y_resampled)
X_resampled, y_resampled = smote_sampler.fit_resample(X_train_scaled, y_train)

# Count distinct values for the resampled target data
y_resampled.value_counts()

# Instantiate a new RandomForestClassifier model; seeded like the sampler so
# the classification report is reproducible from run to run
smote_model = RandomForestClassifier(random_state=1)

# Fit the resampled data to the new model
smote_model.fit(X_resampled, y_resampled)

# Predict labels for the scaled testing features (the test set is not resampled)
smote_y_pred = smote_model.predict(X_test_scaled)

# Print classification reports
print("Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print("Classification Report - Resampled Data - SMOTE")
print(classification_report(y_test, smote_y_pred))

## SMOTEENN
# Import SMOTEENN from imblearn
from imblearn.combine import SMOTEENN

# Instantiate the SMOTEENN instance
smote_enn = SMOTEENN(random_state=1)

# Resample the scaled training data with SMOTEENN
# (this rebinds X_resampled / y_resampled)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)

# Instantiate a new RandomForestClassifier model; seeded like the sampler so
# the classification report is reproducible from run to run
smoteenn_model = RandomForestClassifier(random_state=1)

# Fit the resampled data to the new model
smoteenn_model.fit(X_resampled, y_resampled)

# Predict labels for the scaled testing features (the test set is not resampled)
smoteenn_y_pred = smoteenn_model.predict(X_test_scaled)

# Print classification reports
print("Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print("Classification Report - Resampled Data - SMOTEENN")
print(classification_report(y_test, smoteenn_y_pred))