# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Read the training set and derive the regression target.
train_data = pd.read_csv('main_dataset.csv')

# Values of 'Id' below 1e-18 are replaced by 1e-18 so that log10 stays
# finite for zero or negative entries.
raw_id = train_data['Id']
train_data['Id'] = np.where(raw_id < 1e-18, 1e-18, raw_id)
train_data['Log_Id'] = np.log10(train_data['Id'])

# Model inputs (six columns) and target (log10 of the floored 'Id').
feature_columns = ['Tox', 'Nc', 'Nd', 'Ns', 'Vds', 'Vgs']
X = train_data[feature_columns]
y = train_data['Log_Id']

# Polynomial features and scaling.
# The split happens BEFORE any transformer is fitted: fitting StandardScaler
# on the full data set would let the held-out rows influence the mean and
# variance used to scale the training rows (data leakage).  The partition
# itself depends only on the number of rows, test_size and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

poly = PolynomialFeatures(degree=3, include_bias=False)
scaler = StandardScaler()

# Fit on the training rows only, then apply the fitted transforms to the
# held-out rows.
X_train = scaler.fit_transform(poly.fit_transform(X_train_raw))
X_test = scaler.transform(poly.transform(X_test_raw))

# Whole-data-set versions, produced with the train-fitted transformers and
# kept under these names for any code that reads them.
X_poly = poly.transform(X)
X_scaled = scaler.transform(X_poly)

# Hyperparameter search space for DecisionTreeRegressor.
# None appears only where the estimator accepts it: max_depth (no depth
# limit) and max_features (consider all features).  min_samples_split,
# min_samples_leaf and min_impurity_decrease require numeric values, so a
# None candidate would make every fit that samples it fail.
param_grid = {
    'max_depth': [3, 5, 10, 15, None],
    'min_samples_split': [2, 5, 10, 50, 100],
    'min_samples_leaf': [1, 2, 4, 10],
    'max_features': [None, 'sqrt', 'log2'],
    'min_impurity_decrease': [0.0, 0.001, 0.01, 0.1],
}

# Empty results table: one column per searched hyperparameter followed by
# the two evaluation metrics.
_result_columns = [
    'max_depth',
    'min_samples_split',
    'min_samples_leaf',
    'max_features',
    'min_impurity_decrease',
    'R2',
    'MAE',
]
results_df = pd.DataFrame(columns=_result_columns)


# Randomized hyperparameter search over param_grid: 40 sampled candidates,
# n_jobs=-1 for parallel fitting, scikit-learn's default cross-validation
# and scoring.
tree_estimator = DecisionTreeRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=tree_estimator,
    param_distributions=param_grid,
    n_iter=40,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)


def _plot_id_vs_vgs(vgs, predicted, actual, scale_name, y_label, params):
    """Draw predicted and actual curves against Vgs on a new figure and show it.

    Args:
        vgs: x-axis values (the 'Vgs' column of the test set).
        predicted: model output to plot in green.
        actual: reference values to plot in blue.
        scale_name: word inserted in the title ('Log' or 'Linear').
        y_label: label for the y axis.
        params: hyperparameter dict echoed in the title.
    """
    plt.figure(figsize=(10, 5))
    plt.plot(vgs, predicted, color="green", label="Predicted")
    plt.plot(vgs, actual, color="blue", label="Actual")
    plt.title(f'Vgs vs Id ({scale_name} scale) - Params: {params}')
    plt.xlabel('Vgs')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()


# Load and preprocess the second test data ONCE: it does not depend on the
# hyperparameters, so re-reading the CSV on every iteration is wasted work.
# .copy() makes the row slice an independent frame so the column
# assignments below do not trigger SettingWithCopyWarning.
test_data = pd.read_csv('test_data.csv').iloc[0:203].copy()
test_data['Id'] = np.where(test_data['Id'] < 1e-18, 1e-18, test_data['Id'])
test_data['Log_Id'] = np.log10(test_data['Id'])
X_test_1 = test_data[['Tox', 'Nc', 'Nd', 'Ns', 'Vds', 'Vgs']]
y_test_1 = test_data['Log_Id']
# Reuse the already-fitted transformers; they must not be refitted here.
X_test_1_transformed = poly.transform(X_test_1)
X_test_1_scaled = scaler.transform(X_test_1_transformed)
# Linear-scale reference curve, also independent of the model.
actual_id_linear = np.maximum(np.power(10, y_test_1), 1e-18)

# Rows are collected in a list and turned into a DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0.
result_rows = []

# Loop through each hyperparameter set sampled by the randomized search
for params in random_search.cv_results_['params']:
    # Refit a tree on the full training split with the current parameters
    model = DecisionTreeRegressor(random_state=42, **params)
    model.fit(X_train, y_train)

    # Metrics on the hold-out split (computed but not stored in results_df)
    y_pred_test = model.predict(X_test)
    r2 = r2_score(y_test, y_pred_test)
    mae = mean_absolute_error(y_test, y_pred_test)

    # Predict on the second test dataset and evaluate
    y_pred_test_1 = model.predict(X_test_1_scaled)
    r2_test_1 = r2_score(y_test_1, y_pred_test_1)
    mae_test_1 = mean_absolute_error(y_test_1, y_pred_test_1)

    # Log scale plot
    _plot_id_vs_vgs(
        X_test_1['Vgs'], y_pred_test_1, y_test_1, 'Log', 'Log10(Id)', params
    )

    # Linear scale plot: undo the log10 and apply the same 1e-18 floor
    _plot_id_vs_vgs(
        X_test_1['Vgs'],
        np.maximum(np.power(10, y_pred_test_1), 1e-18),
        actual_id_linear,
        'Linear',
        'Id',
        params,
    )

    result_rows.append({
        'max_depth': params['max_depth'],
        'min_samples_split': params['min_samples_split'],
        'min_samples_leaf': params['min_samples_leaf'],
        'max_features': params['max_features'],
        'min_impurity_decrease': params['min_impurity_decrease'],
        'R2': r2_test_1,
        'MAE': mae_test_1,
    })

# Build the table in one go, keeping the column order declared for results_df
results_df = pd.DataFrame(result_rows, columns=results_df.columns)

# Display results
print(results_df)
