In [None]:
# Step 1: Install TPOT (if not already installed)
!pip install tpot



In [None]:
# Step 2: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [None]:
# Step 3: Load the dataset
# NOTE(review): the file sits under a Google Drive path and no drive-mount cell
# is visible in this notebook -- presumably /content/drive must be mounted first.
csv_path = '/content/drive/MyDrive/ML/AP Research/supermarket_sales - Sheet1.csv'
data = pd.read_csv(csv_path)

In [None]:
# Step 4: Preprocess the data
# Remove columns that should not be used as predictors.
# 'Invoice ID' is likely just a row identifier, so it is dropped here.
data = data.drop(['Invoice ID'], axis='columns')

In [None]:
# Handle missing values (if any)
# A blanket data.fillna('Unknown') would also write the string 'Unknown' into
# numeric columns (including the target), turning them into object dtype so that
# the later one-hot encoding step treats every distinct number as a category.
# Instead: non-numeric columns get the 'Unknown' placeholder, numeric columns
# get their column median.
numeric_cols = data.select_dtypes(include='number').columns
other_cols = data.columns.difference(numeric_cols)

fill_values = {col: 'Unknown' for col in other_cols}
fill_values.update(data[numeric_cols].median().to_dict())

# fillna with a dict applies each value only to the column of the same name.
data = data.fillna(fill_values)

In [None]:
# One-hot encode the categorical columns into indicator columns;
# drop_first=True omits the first level of each encoded column.
data = pd.get_dummies(data=data, drop_first=True)

In [None]:
# Step 5: Define features (X) and target (y)
# 'Total' is the regression target.
TARGET_COLUMN = 'Total'

# NOTE(review): the recorded test metrics in this notebook (R^2 of 1.0, RMSE on
# the order of 1e-13) indicate that the target can be reconstructed exactly from
# the features, i.e. target leakage. The columns listed here are presumably
# arithmetic derivatives of 'Total' in this dataset -- TODO confirm the names
# against the CSV header. errors='ignore' keeps the cell working when any of
# them is absent (in which case nothing extra is dropped).
LEAKY_COLUMNS = ['Tax 5%', 'cogs', 'gross income']

y = data[TARGET_COLUMN]
X = data.drop(columns=[TARGET_COLUMN]).drop(columns=LEAKY_COLUMNS, errors='ignore')

In [None]:

# Step 6: Split the data into training and testing sets
# 20% of the rows are held out for testing; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Step 7: Initialize TPOTRegressor
# Use compatible parameters for Google Colab
tpot = TPOTRegressor(
    generations=5,          # Number of generations the search may run
    population_size=20,     # Number of individuals in each generation
    random_state=42,        # For reproducibility
    n_jobs=1,               # Run on a single process (not all CPU cores)
    max_time_mins=15,       # Time budget for the search, in minutes
)


In [None]:

# Step 8: Fit TPOT on the training data
# NOTE(review): the recorded output of this cell shows the progress bar still at
# 0/5 generations after 15:19, so the 15-minute max_time_mins budget appears to
# have ended the search before one generation completed -- consider a larger
# budget or a smaller search.
print("Training TPOT...")
tpot.fit(X_train, y_train)

Training TPOT...


Perhaps you already have a cluster running?
Hosting the HTTP server on port 35979 instead
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:37391
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:35979/status
INFO:distributed.scheduler:Registering Worker plugin shuffle
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:44911'
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:39709 name: 0
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:39709
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:34008
INFO:distributed.scheduler:Receive client connection: Client-67d3eba4-01c5-11f0-8099-0242ac1c000c
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:41362

  0%|          | 0/5 [00:00<?, ?it/s][A
Generation:   0%|          | 0/5 [15:19<?, ?it/s]
INFO:distributed.scheduler:Client Client-67d3eba4-01c5-11f0-8099-0242ac1c000c requests t

In [None]:

# Step 9: Predict on the held-out test set with the fitted TPOT object
y_pred = tpot.predict(X_test)

In [None]:

# Score the test-set predictions: R^2, mean squared error, and the square
# root of the latter (RMSE).
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [None]:
# Report the test-set metrics, one per line.
print(
    f"Test MSE: {mse}",
    f"Test RMSE: {rmse}",
    f"Test R^2: {r2}",
    sep="\n",
)

Test MSE: 2.0531209463644143e-26
Test RMSE: 1.4328715735767859e-13
Test R^2: 1.0
