In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Function to load and prepare dataset from Excel
def load_dataset(file_path, target_column):
    """Load the experiment workbook and build preprocessed train/test splits.

    The sheet is read with ``pandas.read_excel``, reduced to a fixed list of
    experiment columns, and split 70/30 with ``random_state=42``.  Mean
    imputation and standard scaling are fitted on the *training* features
    only and then applied to both partitions, so no statistic computed from
    held-out rows influences preprocessing.

    Rows whose target value is missing are dropped instead of being filled
    with the column mean: an imputed label is not an observation and would
    distort both the fitted model and the reported test metrics.

    Args:
        file_path: Path of the Excel workbook to read.
        target_column: Name of the column to predict.  Must be one of the
            columns selected below.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, df)`` where the ``X``
        items are the imputed and scaled feature arrays, the ``y`` items are
        float Series of target values, and ``df`` is the loaded frame
        restricted to the selected columns (not imputed, not scaled).

    Raises:
        KeyError: If a selected column is absent from the workbook, or if
            ``target_column`` is not one of the selected columns.
        ValueError: If a feature column has no observed value in the
            training partition, so no mean can be computed for it.
    """
    df = pd.read_excel(file_path)

    # Select relevant columns for prediction
    columns_to_keep = ['S04 (ml)', 'H2O (ml)', 'Cathode Weight (g)', 'Volts', 'Amps 0 Mins',
                       'Amps 15 Mins', 'Amps 30 Mins', 'Amps 45 Mins', 'Amps 60 Mins', 'End Copper (g)',
                       'Remaining Solution (ml)', 'Reaction Time Duration (mins)', 'Copper Sulfate Yield']

    # Check if all columns in columns_to_keep exist in df
    for col in columns_to_keep:
        if col not in df.columns:
            raise KeyError(f"Column '{col}' not found in the dataset.")

    # Fail early with a clear message instead of a generic drop() error later.
    if target_column not in columns_to_keep:
        raise KeyError(f"Target column '{target_column}' is not one of the selected columns.")

    df = df[columns_to_keep]

    # Keep only rows with an observed target; reset the index so the returned
    # Series are labelled 0..n-1 regardless of which rows were dropped.
    labelled = df.dropna(subset=[target_column]).reset_index(drop=True)
    X = labelled.drop(columns=[target_column])
    y = labelled[target_column].astype(float)

    # Split BEFORE fitting any preprocessing so the test rows stay unseen.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # SimpleImputer silently discards features that are entirely missing at
    # fit time, which would desynchronise the arrays from the column names.
    unobserved = [col for col in X_train.columns if X_train[col].isna().all()]
    if unobserved:
        raise ValueError(f"No observed training values for feature columns: {unobserved}")

    # Fit on the training partition only, then reuse the fitted transformers.
    imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()
    X_train = scaler.fit_transform(imputer.fit_transform(X_train))
    X_test = scaler.transform(imputer.transform(X_test))

    return X_train, X_test, y_train, y_test, df

# Function to train Decision Tree model
def train_decision_tree(X_train, y_train):
    """Fit a ``DecisionTreeRegressor`` on the training data and return it.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted regressor, built with ``random_state=42`` and otherwise
        default hyper-parameters.
    """
    # fit() returns the estimator itself, so construct and train in one go.
    return DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Function to evaluate Decision Tree model
def evaluate_decision_tree(model, X_test, y_test):
    """Score a fitted model on the test data and return its predictions.

    Prints the mean squared error and the R^2 score, each rounded to two
    decimals, before returning.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True target values for ``X_test``.

    Returns:
        The predictions produced by ``model.predict(X_test)``.
    """
    predictions = model.predict(X_test)

    # Both metrics share the (y_true, y_pred) signature; compute them in order.
    mse, r2 = (metric(y_test, predictions) for metric in (mean_squared_error, r2_score))

    print(f'Mean Squared Error: {mse:.2f}')
    print(f'R^2 Score: {r2:.2f}')

    return predictions

# Function to plot decision tree
def plot_decision_tree(model, feature_names):
    """Draw the fitted decision tree in a 20x10 inch matplotlib figure.

    Args:
        model: Fitted decision tree estimator to visualise.
        feature_names: Names of the features, in the column order used for
            training.  Any iterable of names is accepted (for example a
            pandas ``Index``); ``None`` leaves the naming to ``plot_tree``.
    """
    # Some scikit-learn releases validate ``feature_names`` as a plain list
    # and reject a pandas Index, so normalise before handing it over.
    if feature_names is not None:
        feature_names = list(feature_names)

    plt.figure(figsize=(20, 10))
    plot_tree(model, filled=True, feature_names=feature_names, rounded=True)
    plt.title('Decision Tree for Copper Sulfate Yield Prediction')
    plt.show()

if __name__ == '__main__':
    # End-to-end run: load the workbook, train the tree, report test metrics,
    # and draw the fitted tree.

    # File path and target column name
    file_path = r'C:\Users\anthony.stewart\Downloads\Copper Sulfate (14).xlsx'
    target_column = 'Copper Sulfate Yield'  # Adjust based on your actual target column name

    # Load dataset
    X_train, X_test, y_train, y_test, df = load_dataset(file_path, target_column)

    # Print sizes of training and test sets
    print(f"Size of X_train: {X_train.shape}")
    print(f"Size of X_test: {X_test.shape}")
    print(f"Size of y_train: {y_train.shape}")
    print(f"Size of y_test: {y_test.shape}")

    # Train Decision Tree model
    dt_model = train_decision_tree(X_train, y_train)

    # Evaluate Decision Tree model and get predictions
    y_pred = evaluate_decision_tree(dt_model, X_test, y_test)

    # Print predicted yields
    print("Predicted Yields:")
    print(y_pred)

    # Plot decision tree.  The feature names are the columns of the returned
    # frame minus the target, in the same order as the training matrix.
    # Pass them as a plain list: some scikit-learn releases reject a pandas
    # Index for ``feature_names``.
    feature_names = list(df.drop(target_column, axis=1).columns)
    plot_decision_tree(dt_model, feature_names)