In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score

# Load the dataset
df = pd.read_csv("../data/fuel.csv")

# Drop unnecessary columns
df.drop(["temp_inside", "temp_outside", "specials", "refill liters", "refill gas"], inplace=True, axis=1)

# Convert 'distance' and 'consume' to numerical data types
df['distance'] = pd.to_numeric(df['distance'], errors='coerce')
df['consume'] = pd.to_numeric(df['consume'], errors='coerce')

# Handle missing data if any
df.dropna(inplace=True)

# Convert categorical variables to the appropriate data type
df['gas_type'] = df['gas_type'].astype('category')
df['AC'] = df['AC'].astype('category')
df['rain'] = df['rain'].astype('category')
df['sun'] = df['sun'].astype('category')

# Split features and target variable
X = df.drop(columns=['consume'])
y = df['consume']

# One-hot encode categorical variables
X = pd.get_dummies(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define regression models
models = {
    'SVR': SVR(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Polynomial': make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    'Linear': LinearRegression()
}

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R-squared Score: {r2:.2f}")


R-squared Score: 0.92


In [5]:
import joblib

# Assuming you have already trained and named your Gradient Boosting model as 'gb_model'
# You can save the model using joblib
joblib.dump(model, './model_checkpoints/gradient_boosting_model.pkl')


FileNotFoundError: [Errno 2] No such file or directory: './model_checkpoints/gradient_boosting_model.pkl'

In [6]:
df.dtypes

distance     float64
consume      float64
speed          int64
gas_type    category
AC          category
rain        category
sun         category
dtype: object

In [7]:
df

Unnamed: 0,distance,consume,speed,gas_type,AC,rain,sun
0,28.0,5.0,26,E10,0,0,0
87,13.0,5.0,45,SP98,0,1,0
220,29.0,4.0,27,SP98,0,0,0
258,2.0,6.0,22,E10,0,0,0
259,14.0,5.0,41,E10,0,0,0
266,16.0,4.0,40,E10,0,0,0
301,19.0,5.0,35,SP98,1,0,1
340,16.0,4.0,43,E10,0,0,0
345,16.0,4.0,42,E10,0,0,0
368,16.0,4.0,43,SP98,1,0,0
