In [None]:
# @title **Step 0:** Get Necessary Libraries
# Install required libraries
!pip install seaborn
!pip install wget pandas

# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import wget
import os
import shutil
from IPython.display import display, clear_output
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# Additional imports for data manipulation and analysis
from scipy.stats import zscore
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold



In [None]:
# @title **Step 1 :** Download From Github and Organize Data

# Function to download and organize data
def download_and_organize_data():
    """Fetch the concrete-strength CSV from GitHub into ``/content/train_data``.

    The destination folder is created when missing. If the CSV is already on
    disk the download is skipped, so re-running the cell is cheap and does not
    hit the network again.
    """
    # URLs of the files on GitHub
    url_concrete_data_yeh = "https://raw.githubusercontent.com/Black-randy/Concrete-Strength-Prediction/main/Concrete_Data_Yeh.csv"
    # Specify the destination paths in Google Colab
    destination_path_train = "/content/train_data/Concrete_Data_Yeh.csv"
    # Create folders for train data
    os.makedirs(os.path.dirname(destination_path_train), exist_ok=True)
    # Download only when the file is not there yet.
    # NOTE(review): the `wget` package appears to save under a new, suffixed
    # name when the target already exists, which would pile up copies on every
    # re-run - confirm against the wget package source.
    if not os.path.exists(destination_path_train):
        wget.download(url_concrete_data_yeh, destination_path_train)
    # Clear the output (hides the wget progress bar)
    clear_output()
    # Print a message indicating the completion of the download and organization
    print("Files downloaded and organized into train_data folder.")
# Run the download step so the CSV is available to the loading cell below
download_and_organize_data()


In [None]:
# @title **Step 2 :** Load Data into Pandas DataFrames

# Function to load data into Pandas DataFrames
def load_data_into_dataframes(train_data_path):
    """Load ``Concrete_Data_Yeh.csv`` from ``train_data_path`` into a DataFrame.

    Args:
        train_data_path: Directory that contains ``Concrete_Data_Yeh.csv``.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the file is missing
        or any other error occurs (the error is printed, not raised).
    """
    try:
        # Build the full path once so the error message can name the exact file
        csv_path = os.path.join(train_data_path, "Concrete_Data_Yeh.csv")
        train_df = pd.read_csv(csv_path)

        # Clear previous outputs
        clear_output()

        # Echo the directory that was used
        print(f"train_data_path: {train_data_path}")

        return train_df

    except FileNotFoundError:
        # Report the file that was actually looked for, not just its folder
        print(f"Error: File not found at {csv_path}")
        return None
    except Exception as e:
        print(f"Error: An unexpected error occurred - {e}")
        return None

# Folder populated by the download step
train_data_path = "/content/train_data"

# Read the CSV from that folder (train_df is None if loading failed)
train_df = load_data_into_dataframes(train_data_path=train_data_path)


In [None]:
# @title **Step 2.1 :** Train DataFrame info

def display_dataframe_info(df, df_name="Train"):
    """Print and display a quick overview of ``df``.

    Shown in order: shape, the first four rows, column dtypes, the columns
    that contain missing values (with their counts) and ``describe()``
    statistics. Section headings are wrapped in ANSI escape codes.

    Args:
        df: DataFrame to summarise.
        df_name: Label used in the section headings.
    """
    # ANSI escape sequences used to build the headings
    bold, blue, red, reset = "\033[1m", "\033[94m", "\033[91m", "\033[0m"

    # Shape
    n_rows, n_cols = df.shape
    print(f"\n{bold}{blue}Shape of the {df_name} DataFrame:{reset}\n")
    print(f'{bold}Total Number of rows:{reset} {n_rows}')
    print(f'{bold}Total Number of columns:{reset} {n_cols}')

    # Preview
    print(f"\n{bold}{blue}Preview of {df_name} DataFrame:{reset}\n")
    display(df.head(4))

    # Column dtypes
    print(f"\n{bold}{blue}Data Types of Attributes in {df_name} DataFrame:{reset}\n")
    print(df.dtypes)

    # Only columns that actually have missing values are listed
    null_counts = df.isnull().sum()
    print(f"\n{bold}{red}Missing Values in {df_name} DataFrame:{reset}\n")
    print(null_counts[null_counts > 0])

    # Descriptive statistics
    print(f"\n{bold}{blue}Descriptive Statistics for {df_name} DataFrame:{reset}")
    display(df.describe())

# Summarise the training DataFrame loaded in Step 2
display_dataframe_info(train_df, "Train")

# Display information for the test DataFrame
# display_dataframe_info(test_df, "Test")


In [None]:
# @title **Step 3 :** Visualize Feature Importances

# Constants
RANDOM_STATE = 42

def concatenate_dataframes(train_df, test_df=None):
    """Return the train rows, optionally followed by the test rows.

    Args:
        train_df: Training DataFrame.
        test_df: Optional second DataFrame appended below ``train_df``.

    Returns:
        A new DataFrame: a copy of ``train_df`` when ``test_df`` is ``None``,
        otherwise both frames stacked with a fresh 0..n-1 index.
    """
    if test_df is None:
        return train_df.copy()
    return pd.concat([train_df, test_df], ignore_index=True)

def visualize_feature_importances(X, y, random_state=RANDOM_STATE):
    """Fit a 100-tree random forest on ``(X, y)`` and plot its feature importances.

    Args:
        X: Feature DataFrame; its column names label the bars.
        y: Target values.
        random_state: Seed passed to ``RandomForestRegressor``.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=random_state)
    forest.fit(X, y)

    # One row per feature, most important first
    ranked = (
        pd.DataFrame({'Feature': X.columns, 'Importance': forest.feature_importances_})
        .sort_values(by='Importance', ascending=False)
    )

    # Horizontal bar chart of the ranked importances
    plt.figure(figsize=(10, 6))
    plt.barh(ranked['Feature'], ranked['Importance'])
    plt.xlabel('Importance')
    plt.title('Feature Importances')
    plt.show()

# Work on a copy of the training data (pass test_df as well if one exists)
combined_df = concatenate_dataframes(train_df)

# Target is the compressive strength column; everything else is a feature
y = combined_df['csMPa']
X = combined_df.drop(columns=['csMPa'])

# Rank the features with a random forest and plot the result
visualize_feature_importances(X, y)


In [None]:
# @title Pair Plot of All Columns

#Set the style of seaborn
# sns.set(style="ticks")

#Create a pair plot for all columns
# plt.figure(figsize=(18, 15))
# sns.pairplot(combined_df, markers="h", diag_kind='kde')
# plt.suptitle("Pair Plot of All Columns", y=1.02, fontsize=20)
# plt.show()


In [None]:
# @title Visualize Outliers

def visualize_outliers(data, columns_of_interest, palette="Set3"):
    """Draw one horizontal box per selected column to expose outliers.

    Args:
        data: DataFrame holding the columns to plot.
        columns_of_interest: Column names to include.
        palette: Seaborn palette name used to colour the boxes.
    """
    sns.set(style="whitegrid")

    selected = data[columns_of_interest]

    # All columns share one axis, so boxes are on the original (unscaled) units
    plt.figure(figsize=(8, 5))
    sns.boxplot(data=selected, orient="h", palette=palette, dodge=False)
    plt.title('Horizontal Box Plot of Features to Visualize Outliers')
    plt.show()

# Columns to inspect: the eight mix features plus the strength target
columns_of_interest = [
    'cement', 'slag', 'flyash', 'water', 'superplasticizer',
    'coarseaggregate', 'fineaggregate', 'age', 'csMPa',
]

# Box plots of those columns, coloured with the "Set3" palette
visualize_outliers(data=combined_df, columns_of_interest=columns_of_interest, palette="Set3")


# **Outliers Visualization**
---

In [None]:
# Disabled code: the whole cell body is a bare triple-quoted string, so none of
# it executes (the "# @title" line inside the string is not a comment either).
# It holds a 3x3 grid version of the per-feature histogram/box plots below.
# NOTE(review): as the only expression in the cell, the string is presumably
# echoed as the cell output - confirm, and consider deleting the cell.
'''
# @title Outliers Visualization
features_of_interest = ['cement', 'slag', 'flyash', 'water', 'superplasticizer', 'coarseaggregate', 'fineaggregate', 'age', 'csMPa']

# Set the style of seaborn
sns.set(style="whitegrid")

# Create subplots
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(6,6))

# Flatten the axes for easy iteration
axes = axes.flatten()

# Loop through each feature and create histograms and boxplots
for i, feature in enumerate(features_of_interest):
    sns.histplot(train_df[feature], kde=True, ax=axes[i])
    axes[i].set_xlabel(feature, fontsize=8)
    axes[i].set_title(f"{feature} Distribution Plot", fontsize=8)

# Adjust layout
plt.tight_layout()
plt.show()

# Box plots for each feature
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(6,6))

# Flatten the axes for easy iteration
axes = axes.flatten()

# Loop through each feature and create boxplots
for i, feature in enumerate(features_of_interest):
    sns.boxplot(train_df[feature], ax=axes[i])
    axes[i].set_xlabel(feature, fontsize=8)
    axes[i].set_title(f"{feature} Box Plot", fontsize=8)

# Adjust layout
plt.tight_layout()
plt.show()
'''

In [None]:
# @title Cement Outliers Visualization
# Histogram (with KDE) and box plot of the cement column, side by side.
# The figure is created directly at its final size (previously it was created
# at 13x5 and immediately resized to 10x3).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))

# Distribution Plot
sns.histplot(train_df['cement'], ax=ax1, kde=True)
ax1.tick_params(labelsize=12)
ax1.set_xlabel('Cement', fontsize=12)
ax1.set_title("Distribution Plot", fontsize=15)

# Box Plot - pass the column as x= so the values lie on the x-axis that is
# labelled below; a positional Series is read as `data` by seaborn >= 0.12
# and would be drawn as a vertical box instead.
sns.boxplot(x=train_df['cement'], ax=ax2)
ax2.set_title("Box Plot", fontsize=15)
ax2.set_xlabel('Cement', fontsize=12)

plt.show()


In [None]:
# @title Slag Outliers Visualization
# Histogram (with KDE) and box plot of the slag column, side by side.
# figsize already sets 12x5, so no separate resize call is needed.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Distribution Plot
sns.histplot(train_df['slag'], ax=ax1, kde=True)
ax1.set_xlabel('Slag', fontsize=12)
ax1.set_title("Distribution Plot", fontsize=15)

# Box Plot - pass the column as x= so the values lie on the x-axis that is
# labelled below; a positional Series is read as `data` by seaborn >= 0.12
# and would be drawn as a vertical box instead.
sns.boxplot(x=train_df['slag'], ax=ax2)
ax2.set_xlabel('Slag', fontsize=12)
ax2.set_title("Box Plot", fontsize=15)

plt.show()


In [None]:
# @title  Flyash Outliers Visualization
# Histogram (with KDE) and box plot of the flyash column, side by side.
# figsize already sets 12x5, so no separate resize call is needed.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Distribution Plot
sns.histplot(train_df['flyash'], ax=ax1, kde=True)
ax1.set_xlabel('Flyash', fontsize=15)
ax1.set_title("Distribution Plot")

# Box Plot - pass the column as x= so the values lie on the x-axis that is
# labelled below; a positional Series is read as `data` by seaborn >= 0.12
# and would be drawn as a vertical box instead.
sns.boxplot(x=train_df['flyash'], ax=ax2)
ax2.set_xlabel('Flyash', fontsize=15)
ax2.set_title("Box Plot")

# Render the figure explicitly so the cell does not echo a Text(...) repr
plt.show()


In [None]:
# @title  Water Outliers Visualization
# Water: histogram (with KDE) and box plot, side by side.
# figsize already sets 12x5, so no separate resize call is needed.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Distribution Plot
sns.histplot(train_df['water'], ax=ax1, kde=True)
ax1.set_xlabel('Water', fontsize=15)
ax1.set_title("Distribution Plot")

# Box Plot - pass the column as x= so the values lie on the x-axis that is
# labelled below; a positional Series is read as `data` by seaborn >= 0.12
# and would be drawn as a vertical box instead.
sns.boxplot(x=train_df['water'], ax=ax2)
ax2.set_xlabel('Water', fontsize=15)
ax2.set_title("Box Plot")

# Render the figure explicitly so the cell does not echo a Text(...) repr
plt.show()

In [None]:
# @title  Superplasticizer Outliers Visualization
# Superplasticizer: histogram (with KDE) and box plot, side by side.
# figsize already sets 12x5, so no separate resize call is needed.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Distribution Plot
sns.histplot(train_df['superplasticizer'], ax=ax1, kde=True)
ax1.set_xlabel('Superplasticizer', fontsize=15)
ax1.set_title("Distribution Plot")

# Box Plot - pass the column as x= so the values lie on the x-axis that is
# labelled below; a positional Series is read as `data` by seaborn >= 0.12
# and would be drawn as a vertical box instead.
sns.boxplot(x=train_df['superplasticizer'], ax=ax2)
ax2.set_xlabel('Superplasticizer', fontsize=15)
ax2.set_title("Box Plot")

# Render the figure explicitly so the cell does not echo a Text(...) repr
plt.show()


In [None]:
# @title  Coarseagg Outliers Visualization
# Coarse aggregate: histogram (with KDE) and box plot, side by side.
# figsize already sets 12x5, so no separate resize call is needed.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Distribution Plot
sns.histplot(train_df['coarseaggregate'], ax=ax1, kde=True)
ax1.set_xlabel('Coarseagg', fontsize=15)
ax1.set_title("Distribution Plot")

# Box Plot - pass the column as x= so the values lie on the x-axis that is
# labelled below; a positional Series is read as `data` by seaborn >= 0.12
# and would be drawn as a vertical box instead.
sns.boxplot(x=train_df['coarseaggregate'], ax=ax2)
ax2.set_xlabel('Coarseagg', fontsize=15)
ax2.set_title("Box Plot")

# Render the figure explicitly so the cell does not echo a Text(...) repr
plt.show()

In [None]:
# @title  Fineaggregate Outliers Visualization
# Fine aggregate: histogram (with KDE) and box plot, side by side.
# figsize already sets 12x5, so no separate resize call is needed.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Distribution Plot
sns.histplot(train_df['fineaggregate'], ax=ax1, kde=True)
ax1.set_xlabel('Fineagg', fontsize=15)
ax1.set_title("Distribution Plot")

# Box Plot - pass the column as x= so the values lie on the x-axis that is
# labelled below; a positional Series is read as `data` by seaborn >= 0.12
# and would be drawn as a vertical box instead.
sns.boxplot(x=train_df['fineaggregate'], ax=ax2)
ax2.set_xlabel('Fineagg', fontsize=15)
ax2.set_title("Box Plot")

# Render the figure explicitly so the cell does not echo a Text(...) repr
plt.show()

In [None]:
# @title  Age Outliers Visualization
# Age: histogram (with KDE) and box plot, side by side.
# figsize already sets 12x5, so no separate resize call is needed.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Distribution Plot
sns.histplot(train_df['age'], ax=ax1, kde=True)
ax1.set_xlabel('Age', fontsize=15)
ax1.set_title("Distribution Plot")

# Box Plot - pass the column as x= so the values lie on the x-axis that is
# labelled below; a positional Series is read as `data` by seaborn >= 0.12
# and would be drawn as a vertical box instead.
sns.boxplot(x=train_df['age'], ax=ax2)
ax2.set_xlabel('Age', fontsize=15)
ax2.set_title("Box Plot")

plt.show()

# 1.5 * IQR fences for the age column (lower / upper threshold values).
# NOTE(review): LTV_age and UTV_age are not read anywhere in the visible
# notebook - kept in case a later cell uses them.
Q1_age = train_df['age'].quantile(0.25)
Q3_age = train_df['age'].quantile(0.75)
IQR_age = Q3_age - Q1_age
LTV_age = Q1_age - 1.5 * IQR_age
UTV_age = Q3_age + 1.5 * IQR_age

In [None]:
# @title  Strength Outliers Visualization
# Histogram (with KDE) and box plot of the strength target (csMPa), side by side.
# figsize already sets 12x5, so no separate resize call is needed.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Distribution Plot
sns.histplot(train_df['csMPa'], ax=ax1, kde=True)
ax1.tick_params(labelsize=15)
ax1.set_xlabel('Strength', fontsize=15)
ax1.set_title("Distribution Plot")

# Box Plot - pass the column as x= so the values lie on the x-axis that is
# labelled below; a positional Series is read as `data` by seaborn >= 0.12
# and would be drawn as a vertical box instead.
sns.boxplot(x=train_df['csMPa'], ax=ax2)
ax2.set_title("Box Plot")
ax2.set_xlabel('Strength', fontsize=15)

# Render the figure explicitly so the cell does not echo a Text(...) repr
plt.show()

# **Fixing Outliers**
---

In [None]:
# @title Fix Outliers and Visualize

def fix_outliers(df):
    """Return a copy of ``df`` with IQR outliers replaced by the column median.

    Every column except the last one (assumed to be the target) is processed.
    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``; the replacement is the column's median computed before
    any value in that column is replaced. The input frame is not modified.
    """
    cleaned = df.copy()

    # The last column is skipped on the assumption that it is the target
    for column in cleaned.columns[:-1]:
        values = cleaned[column]
        lower_quartile = values.quantile(0.25)
        upper_quartile = values.quantile(0.75)
        spread = upper_quartile - lower_quartile
        floor = lower_quartile - 1.5 * spread
        ceiling = upper_quartile + 1.5 * spread

        # Flag everything outside the fences and overwrite it with the median
        is_outlier = (values < floor) | (values > ceiling)
        cleaned.loc[is_outlier, column] = values.median()

    return cleaned

# Outlier-corrected copy of the training data; train_df itself is left untouched
train_df_new = fix_outliers(train_df)

# Seaborn style has to be set before the figure is created
sns.set(style="whitegrid")

# One horizontal box per column of the corrected frame
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=train_df_new, orient="h", palette="Set2", dodge=False, ax=ax)
ax.set_title('Box Plot after Fixing Outliers')
plt.show()


In [None]:
# @title Calculate and Visualize Correlation Matrix

def visualize_correlation_matrix(df, cmap="RdPu", annot=True, mask_upper=True):
    """Plot the pairwise correlation matrix of ``df`` as a heatmap.

    Args:
        df: DataFrame whose column correlations (``df.corr()``) are plotted.
        cmap: Matplotlib/seaborn colormap name.
        annot: Whether to write the correlation value in each cell.
        mask_upper: When True (default) the upper triangle, including the
            diagonal, is hidden because it mirrors the lower triangle. Pass
            False to draw the full square matrix.
    """
    correlation_matrix = df.corr()

    # Boolean mask that is True on and above the diagonal. It used to be
    # computed but never handed to the heatmap.
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool)) if mask_upper else None

    # Set up the matplotlib figure with seaborn style
    with sns.axes_style("white"):
        plt.figure(figsize=(7, 6))

        # Draw the heatmap with the mask; linewidths is numeric (it was the
        # string ".25" before)
        sns.heatmap(
            correlation_matrix,
            mask=mask,
            cmap=cmap,
            annot=annot,
            fmt=".2f",
            linewidths=0.25,
        )

        # Set the title with a smaller font size
        plt.title("Correlation Matrix", fontsize=10)

        # Show the plot
        plt.show()

# Correlations are computed on the outlier-corrected frame (train_df_new), not the raw data
visualize_correlation_matrix(train_df_new)


In [None]:
# @title KMeans Clustering for Optimal Number of Clusters

def find_optimal_clusters(data, cluster_range, random_state=42):
    """Fit KMeans for each cluster count and collect the inertia of each fit.

    Args:
        data: Samples to cluster (passed straight to ``KMeans.fit``).
        cluster_range: Iterable of cluster counts to try.
        random_state: Seed for KMeans so the elbow curve is the same on every
            run; pass ``None`` for the previous unseeded behaviour.

    Returns:
        DataFrame with columns ``num_clusters`` and ``cluster_errors``
        (the KMeans ``inertia_`` for that cluster count).
    """
    # Materialise once so a one-shot iterable can be both looped over and
    # stored in the result frame
    cluster_counts = list(cluster_range)

    cluster_errors = []
    for num_clusters in cluster_counts:
        kmeans = KMeans(n_clusters=num_clusters, n_init=5, random_state=random_state)
        kmeans.fit(data)
        cluster_errors.append(kmeans.inertia_)

    return pd.DataFrame({"num_clusters": cluster_counts, "cluster_errors": cluster_errors})

def plot_elbow_plot(clusters_df):
    """Plot inertia against the number of clusters (elbow plot).

    Args:
        clusters_df: DataFrame with ``num_clusters`` and ``cluster_errors``
            columns, as returned by ``find_optimal_clusters``.
    """
    counts = clusters_df["num_clusters"]
    errors = clusters_df["cluster_errors"]

    plt.figure(figsize=(4, 3))
    plt.plot(counts, errors, marker="o")
    plt.title('Elbow Plot for Optimal Number of Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Cluster Errors (Inertia)')
    plt.show()

# Candidate cluster counts: 2, 3, 4 and 5
cluster_range = range(2, 6)

# Inertia per candidate, shown as a table and then as an elbow plot
clusters_df = find_optimal_clusters(data=train_df_new, cluster_range=cluster_range)
display(clusters_df)
plot_elbow_plot(clusters_df=clusters_df)


In [None]:
# @title Assign cluster IDs to each data point
num_clusters = 3

# Cluster on the original columns only. Dropping any existing "Cluster_id"
# makes this cell safe to re-run: without it, a second run would fit KMeans on
# data that already contains the labels from the first run.
# NOTE(review): the clustered columns still include the target csMPa, and
# Cluster_id is later scaled and used as a model feature, so the feature is
# derived from the target (leakage). Confirm whether csMPa should be excluded.
cluster_features = train_df_new.drop(columns=["Cluster_id"], errors="ignore")

# Create a KMeans model with the specified number of clusters
kmeans_model = KMeans(n_clusters=num_clusters, random_state=2354)

# Fit the model to the training data
kmeans_model.fit(cluster_features)

# Predict the cluster labels for each data point
cluster_labels = kmeans_model.predict(cluster_features)

# Store the labels on train_df_new (downstream cells read this column from it)
train_df_new["Cluster_id"] = cluster_labels

# Create a deep copy of the DataFrame with cluster assignments
train_df_new_clustered = train_df_new.copy(deep=True)

In [None]:
# @title Display Cluster Centers
# Cluster centre coordinates from the fitted model: one row per cluster
centroids = kmeans_model.cluster_centers_

# Heading and array on separate lines
print("Cluster Centers:", centroids, sep="\n")


In [None]:
# @title Z-Score Scaling

def scale_dataframe(df):
    """Return ``df`` with ``scipy.stats.zscore`` applied to every column.

    All columns are scaled, whatever they hold; the input is not modified.
    """
    return df.apply(func=zscore, axis=0)

# Z-score every column of the outlier-corrected frame
train_df_scaled = scale_dataframe(train_df_new)

# Seaborn style has to be set before the figure is created
sns.set(style="whitegrid")

# One horizontal box per scaled column
fig, ax = plt.subplots(figsize=(6, 3))
sns.boxplot(data=train_df_scaled, orient="h", palette="Set2", dodge=False, ax=ax)
ax.set_title('Box Plot after Z-Score Scaling')
plt.show()


# **Neglect these**

In [None]:
# Scratch preview: first rows of the raw data
train_df.head()

In [None]:
# Scratch preview: first rows after outlier fixing (includes Cluster_id once the clustering cell has run)
train_df_new.head()

In [None]:
# Scratch preview: first rows of the z-scored data
train_df_scaled.head()

# **DATA** Test Splitting

In [None]:
# @title <s> **Step xx :** Separating  csMPa  </s> --- not in use
# x = train_df_new.drop(columns=['csMPa'])
# y = train_df_new['csMPa']

In [None]:
# @title **Step xx :** Separating **csMPa** (Scaled) and Train-Test Split


def perform_train_test_split(data, target_column='csMPa', test_size=0.2, random_state=7):
    """Split ``data`` into train and test features/targets and print their shapes.

    Args:
        data: DataFrame containing the features and the target column.
        target_column: Name of the column to predict.
        test_size: Share of rows placed in the test set.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = data.drop(columns=[target_column])
    target = data[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Report the size of each part
    for label, part in (
        ('X_train data shape: ', X_train),
        ('y_train data shape: ', y_train),
        ('X_test data shape : ', X_test),
        ('y_test data shape : ', y_test),
    ):
        print(f'{label}{part.shape}')

    return X_train, X_test, y_train, y_test

# Hold out 20% of the scaled rows as the final test set.
# NOTE(review): train_df_scaled was z-scored (and outlier-fixed) over the full
# dataset before this split, so the test rows influenced the scaling
# statistics; the target csMPa is z-scored too, so scores/RMSE are in scaled units.
x_model_train, x_test, y_model_train, y_test = perform_train_test_split(train_df_scaled)


In [None]:
# @title **Step xx :** Separating **csMPa** (Scaled) and Train-Validation Split
def perform_train_validation_split(X, y, test_size=0.3, random_state=7):
    """Split ``(X, y)`` into training and validation parts and print their shapes.

    Args:
        X: Feature rows.
        y: Matching target values.
        test_size: Share of rows placed in the validation set.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_validate, y_train, y_validate)``.
    """
    X_train, X_validate, y_train, y_validate = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Report the size of each part
    for label, part in (
        ('X_train data shape: ', X_train),
        ('y_train data shape: ', y_train),
        ('X_validate data shape: ', X_validate),
        ('y_validate data shape: ', y_validate),
    ):
        print(f'{label}{part.shape}')

    return X_train, X_validate, y_train, y_validate

# Split the non-test rows again: 70% for fitting, 30% for validation
x_train, x_validate, y_train, y_validate = perform_train_validation_split(x_model_train, y_model_train)


# **Models**

In [None]:
# @title Defining the kFold function for cross-validation


def kfold_cross_validation(model, X, y, n_splits=10, random_state=7):
    """Evaluate ``model`` with unshuffled K-fold cross-validation.

    The same ``model`` object is refitted on every fold, so after the call it
    holds the fit from the last fold. ``random_state`` only seeds NumPy's
    global random generator; the folds themselves are consecutive, unshuffled
    blocks of rows.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``score``.
        X: Feature DataFrame (indexed with ``.iloc``).
        y: Target Series (indexed with ``.iloc``).
        n_splits: Number of folds.
        random_state: Seed handed to ``np.random.seed``.

    Returns:
        Three lists with one entry per fold: ``model.score`` values, RMSE
        values and R-squared values.
    """
    np.random.seed(random_state)

    splitter = KFold(n_splits=n_splits, shuffle=False)

    fold_metrics = []
    for fit_rows, eval_rows in splitter.split(X):
        # Fit on the training part of this fold
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        # Score on the held-out part
        X_eval, y_eval = X.iloc[eval_rows], y.iloc[eval_rows]
        predictions = model.predict(X_eval)
        fold_metrics.append((
            model.score(X_eval, y_eval),
            np.sqrt(mean_squared_error(y_eval, predictions)),
            r2_score(y_eval, predictions),
        ))

    # Turn the per-fold tuples into one list per metric
    model_scores = [metrics[0] for metrics in fold_metrics]
    model_RMSEs = [metrics[1] for metrics in fold_metrics]
    model_R2s = [metrics[2] for metrics in fold_metrics]

    return model_scores, model_RMSEs, model_R2s

# Example: 10-fold evaluation of a plain linear regression on all non-test rows
linear_model = LinearRegression()
(linear_model_scores,
 linear_model_RMSEs,
 linear_model_R2s) = kfold_cross_validation(model=linear_model, X=x_model_train, y=y_model_train)

# Print or store the results as needed


In [None]:
# @title **Model X.1 :** Linear Regression Model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold

# Seed shared by the cross-validation splitter below
random_state = 7

# Fit a plain linear regression on the training part
regression_model = LinearRegression()
regression_model.fit(x_train, y_train)

# Section banner
dashes = "-" * 13
rule = "-" * 50
print(f"\n{dashes}Linear Regression Model{dashes}\n")

# One coefficient per feature column
for col_name, coefficient in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is: {coefficient}")

print(f"\n{rule}\n")

# Intercept of the fitted model
intercept = regression_model.intercept_
print(f"Model intercept is {intercept}")

# Score on the validation set
lr_score = regression_model.score(x_validate, y_validate)
print(f"Linear Regression Model Score: {lr_score}")

# Shuffled 10-fold splitter; later model cells reuse this `kfold` object
kfold = KFold(n_splits=10, shuffle=True, random_state=random_state)
y_train_flat = y_train.values.ravel()

# RMSE: square root of the mean cross-validated MSE
neg_mse_per_fold = cross_val_score(regression_model, x_train, y_train_flat, cv=kfold, scoring='neg_mean_squared_error')
lr_rmse = np.sqrt(-neg_mse_per_fold.mean())
print(f"Linear Regression Model RMSE : {lr_rmse}")

# Mean cross-validated R-squared
r2_per_fold = cross_val_score(regression_model, x_train, y_train_flat, cv=kfold, scoring='r2')
lr_r2 = r2_per_fold.mean()
print(f"Linear Regression Model R-Square Value: {lr_r2}")

# Closing rule
print(f"\n{rule}\n")


In [None]:
# @title **Model X.2 :** Multilinear Regression Model

from sklearn.linear_model import LinearRegression

def display_model_information(model, X, y, X_validate, y_validate, model_name, cv=None):
    """Fit a linear model and print its coefficients and evaluation metrics.

    Args:
        model: Estimator exposing ``fit``, ``score`` and ``coef_``.
        X: Training feature DataFrame (column names label the coefficients).
        y: Training target values.
        X_validate: Validation features.
        y_validate: Validation target values.
        model_name: Label used in the printed headings.
        cv: Cross-validation splitter or fold count passed to
            ``cross_val_score``. Defaults to a shuffled 10-fold split seeded
            with 7, the same configuration as the notebook's ``kfold`` object,
            so the function no longer needs that global to exist.
    """
    if cv is None:
        cv = KFold(n_splits=10, shuffle=True, random_state=7)

    # Initialize the model
    mlr_model = model

    # Fit the model on the training data
    mlr_model.fit(X, y)

    # Add a space
    print(f"\n{'-'*13}{model_name}{'-'*13}\n")

    # Display coefficients for each independent attribute
    print("Coefficients for each independent attribute:")
    for idx, col_name in enumerate(X.columns):
        print(f"The coefficient for {col_name} is: {mlr_model.coef_[idx]}")

    print("\n" + "-"*50 + "\n")

    # Evaluate the model on the validation set
    mlr_score = mlr_model.score(X_validate, y_validate)
    print(f"{model_name} Score:", mlr_score)

    # 1-D target for cross_val_score; np.ravel accepts Series and arrays alike
    y_flat = np.ravel(y)

    # Calculate RMSE using cross-validation (root of the mean fold MSE)
    mlr_rmse = np.sqrt((-1) * cross_val_score(mlr_model, X, y_flat, cv=cv, scoring='neg_mean_squared_error').mean())
    print(f"{model_name} RMSE: {mlr_rmse}")

    # Calculate R-squared using cross-validation
    mlr_r2 = cross_val_score(mlr_model, X, y_flat, cv=cv, scoring='r2').mean()
    print(f"{model_name} R-Square Value: {mlr_r2}")

    # Add a space
    print("\n" + "-"*50 + "\n")

# Fit and report a second LinearRegression under the "Multilinear" label
mlr_model_instance = LinearRegression()
display_model_information(
    model=mlr_model_instance,
    X=x_train,
    y=y_train,
    X_validate=x_validate,
    y_validate=y_validate,
    model_name="Multilinear Regression Model",
)


In [None]:
# @title <del> **Model xx :**  Train Random Forest Classifier. </del>

# Disabled code: the cell body is a bare triple-quoted string, so the random
# forest classifier experiment below does not execute.
# NOTE(review): as the only expression in the cell, the string is presumably
# echoed as the cell output - confirm, and consider deleting the cell.
'''
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Convert the target variable to binary labels for classification
threshold = 1
y_train_class = (y_train > threshold).astype(int)
y_test_class = (y_test > threshold).astype(int)

# Create and fit the Random Forest Classifier model
rf_classifier_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_model.fit(x_train, y_train_class)

# Make predictions on the test data
y_predict_class = rf_classifier_model.predict(x_test)

# Evaluate the Random Forest Classifier performance
accuracy = accuracy_score(y_test_class, y_predict_class)
classification_rep = classification_report(y_test_class, y_predict_class)
conf_matrix = confusion_matrix(y_test_class, y_predict_class)

# Print performance metrics
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', xticklabels=['Not High Strength', 'High Strength'],
            yticklabels=['Not High Strength', 'High Strength'])
plt.title('Confusion Matrix for Random Forest Classifier')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()'''


In [None]:
# @title **Model X.3 :** Random Forest Regressor Model
from sklearn.ensemble import RandomForestRegressor

# Define a range of n_estimators for hyperparameter tuning
# NOTE(review): this list is not read anywhere in the visible notebook; it is
# kept in case a later cell uses it.
n_estimators = [int(x) for x in np.linspace(start=10, stop=100, num=3)]

# Baseline Random Forest with default hyper-parameters. The seed is pinned
# (same value the tuned model uses) so the scores below are identical on
# every run instead of changing with each execution.
rfTree = RandomForestRegressor(random_state=7)

# Fit the Random Forest Regressor on the training data
rfTree.fit(x_train, y_train.values.ravel())

# Add a space
print("\n" + "-"*13 + "Random Forest Regressor Model" + "-"*13 + "\n")

# Evaluate the model on the training set
rfTree_train_score = rfTree.score(x_train, y_train)
print(f"Training Set Score: {rfTree_train_score:.4f}")

# Evaluate the model on the validation set
rfTree_score = rfTree.score(x_validate, y_validate)
print(f"Validation Set Score: {rfTree_score:.4f}")

# Calculate RMSE using cross-validation (`kfold` comes from the Linear Regression cell)
rfTree_rmse = np.sqrt((-1) * cross_val_score(rfTree, x_train, y_train.values.ravel(), cv=kfold, scoring='neg_mean_squared_error').mean())
print(f"RMSE: {rfTree_rmse:.4f}")

# Calculate R-squared using cross-validation
rfTree_r2 = cross_val_score(rfTree, x_train, y_train.values.ravel(), cv=kfold, scoring='r2').mean()
print(f"R-Square Value: {rfTree_r2:.4f}")

# Add a space
print("\n" + "-"*50 + "\n")

# Create a DataFrame with model metrics
rfTree_model_df = pd.DataFrame({'Training Score': [rfTree_train_score],
                                'Validation Score': [rfTree_score],
                                'RMSE': [rfTree_rmse],
                                'R Squared': [rfTree_r2]})
display(rfTree_model_df)

# Add a space
print("\n" + "-"*50 + "\n")

# Evaluate the model on the test set
rfTree_test_score = rfTree.score(x_test, y_test)
print(f"Test Data Set Score: {rfTree_test_score:.4f}")


In [None]:
# @title **Model X.3.1 :** Hyper-tuning Random Forest Regressor - **Randomized Search CV**
from contextlib import redirect_stdout
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
import sys

# Define the parameter distribution: lists are sampled uniformly, the
# sp_randint ranges are half-open (5..10 and 50..70)
param_dist = {
    'bootstrap': [True],
    'max_depth': [10],
    'max_features': ['log2'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': sp_randint(5, 11),
    'n_estimators': sp_randint(50, 71)
}

# Create a Random Forest Regressor
rf_model = RandomForestRegressor(random_state=7)

# Create RandomizedSearchCV (10 sampled candidates, evaluated with the shared
# `kfold` splitter from the Linear Regression cell)
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=kfold,
    n_jobs=1,
    verbose=0,
    return_train_score=True,
    random_state=7
)

# Send anything printed during the search to stderr. The context manager
# restores sys.stdout even if fit() raises; the previous manual swap left
# stdout redirected for the rest of the session after a failure.
with redirect_stdout(sys.stderr):
    print("Fitting RandomizedSearchCV...")
    random_search.fit(x_train, y_train.values.ravel())

# Get the best parameters and model
best_params = random_search.best_params_
best_rf_model = random_search.best_estimator_

# Print the best parameters
print("Best Parameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

# Add a space
print("\n" + "-"*50 + "\n")

# No manual refit here: with the default refit=True the search has already
# fitted best_estimator_ on the full (x_train, y_train) passed to fit().

# Evaluate the best model on the validation set
best_rf_score_val = best_rf_model.score(x_validate, y_validate)
print(f"Validation Set Score: {best_rf_score_val:.4f}")

# Calculate RMSE using cross-validation
best_rf_rmse = np.sqrt((-1) * cross_val_score(best_rf_model, x_train, y_train.values.ravel(), cv=kfold, scoring='neg_mean_squared_error').mean())
print(f"RMSE: {best_rf_rmse:.4f}")

# Calculate R-squared using cross-validation
best_rf_r2 = cross_val_score(best_rf_model, x_train, y_train.values.ravel(), cv=kfold, scoring='r2').mean()
print(f"R-Square Value: {best_rf_r2:.4f}")

# Create a DataFrame with model metrics
best_rf_model_df = pd.DataFrame({'Training Score': [best_rf_model.score(x_train, y_train)],
                                  'Validation Score': [best_rf_score_val],
                                  'RMSE': [best_rf_rmse],
                                  'R Squared': [best_rf_r2]})
display(best_rf_model_df)

# Test score of the tuned model. This line used to print rfTree_test_score,
# i.e. the untuned baseline's score, under the tuned model's results.
best_rf_test_score = best_rf_model.score(x_test, y_test)
print(f"Test Data Set Score: {best_rf_test_score:.4f}")


In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from IPython.display import display


# Side-by-side summary of the baseline and the tuned Random Forest, built from
# the metrics computed in the two model cells above.

# Baseline Random Forest metrics (column key was misspelled 'Trainng Score',
# which made it differ from the tuned model's table)
rfTree_model_df = pd.DataFrame({'Training Score': [rfTree_train_score],
                                'Validation Score': [rfTree_score],
                                'RMSE': [rfTree_rmse],
                                'R Squared': [rfTree_r2]})
display(rfTree_model_df)
# Evaluate the model on the test set
rfTree_test_score = rfTree.score(x_test, y_test)
print(f"\nTest Data Set Score (Random Forest): {rfTree_test_score:.4f}")

# Add a space
print("\n" + "-"*50 + "\n")

# Tuned Random Forest metrics
best_rf_model_df = pd.DataFrame({'Training Score': [best_rf_model.score(x_train, y_train)],
                                  'Validation Score': [best_rf_score_val],
                                  'RMSE': [best_rf_rmse],
                                  'R Squared': [best_rf_r2]})
display(best_rf_model_df)



# Evaluate the best model on the test set
best_rf_test_score = best_rf_model.score(x_test, y_test)
print(f"\nTest Data Set Score (Best Random Forest): {best_rf_test_score:.4f}")
