In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib

# Load the dataset with the correct delimiter
data = pd.read_csv('ev_charging.csv', delimiter=',')

# Display the column names
print("Columns in the dataset:", data.columns)

# Check if only one column is present and split it if necessary
if len(data.columns) == 1:
    data = data['vehicle_ID,Session_ID,Energy_Consumed_KWh,Charging_Duration,Cost'].str.split(',', expand=True)
    data.columns = ['Vehicle_ID', 'Session_ID', 'Energy_Consumed_KWh', 'Charging_Duration', 'Cost']

# Convert relevant columns to numeric types
data['Energy_Consumed_KWh'] = pd.to_numeric(data['Energy_Consumed_KWh'], errors='coerce')
data['Charging_Duration'] = pd.to_numeric(data['Charging_Duration'], errors='coerce')
data['Cost'] = pd.to_numeric(data['Cost'], errors='coerce')

# Define features and target
X = data[['Energy_Consumed_KWh', 'Charging_Duration']]
y = data['Cost']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'SVR': SVR()
}

# Define hyperparameters with correct prefixes
param_grids = {
    'LinearRegression': {},
    'RandomForest': {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [None, 10, 20, 30]
    },
    'SVR': {
        'regressor__C': [0.1, 1, 10],
        'regressor__kernel': ['linear', 'rbf']
    }
}

# Initialize variables to track the best model
best_model = None
best_score = float('inf')
best_model_name = ""

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Create a pipeline with scaler and model
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', model)
    ])

    # GridSearch with prefixed parameter names
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Evaluate on test set
    y_pred = grid_search.best_estimator_.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{model_name} - Best MSE: {mse}, R^2: {r2}")

    # Track the best performing model
    if mse < best_score:
        best_score = mse
        best_model = grid_search.best_estimator_
        best_model_name = model_name

print(f"\nBest Model: {best_model_name} with MSE: {best_score}")

# Save the best model
joblib.dump(best_model, 'best_model.pkl')
print("Best model saved as best_model.pkl")


Columns in the dataset: Index(['vehicle_ID,Session_ID,Energy_Consumed_KWh,Charging_Duration,Cost'], dtype='object')
Training LinearRegression...
LinearRegression - Best MSE: 0.6085502995475772, R^2: 0.9748095419012587
Training RandomForest...
RandomForest - Best MSE: 0.39813374999999984, R^2: 0.9835195684653744
Training SVR...
SVR - Best MSE: 0.545075165639117, R^2: 0.9774370448409858

Best Model: RandomForest with MSE: 0.39813374999999984
Best model saved as best_model.pkl
