# Model Training and Evaluation

## Import Required Libraries

In [1]:
import joblib
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

## Load and Inspect Dataset

In [2]:
# Read the dataset from disk into a DataFrame
csv_path = 'clean_df.csv'
df = pd.read_csv(csv_path)
# Preview the leading rows as this cell's output
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [3]:
# Report the smallest and largest values found in the 'horsepower' column
horsepower = df['horsepower']
print("Horsepower range:", horsepower.min(), "to", horsepower.max())

Horsepower range: 48.0 to 288.0


## Train-Test Split

In [None]:
# Goal: regress 'price' on the remaining numeric columns; non-numeric
# (categorical/text) columns are left out for simplicity.

# Keep only rows whose 'price' can be parsed as a number. Taking an explicit
# .copy() of the filtered rows means the assignment below writes to an
# independent frame instead of a slice, which avoids SettingWithCopyWarning.
price_is_numeric = pd.to_numeric(df['price'], errors='coerce').notna()
df = df.loc[price_is_numeric].copy()
df['price'] = pd.to_numeric(df['price'])

# Features (X): every numeric column except the target. Target (y): 'price'.
# NOTE(review): the df preview shows an 'Unnamed: 0' column, which looks like a
# row index that was saved into the CSV. If it is numeric it is used as a
# feature here -- confirm whether it should be dropped before training.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols.remove('price')
X = df[numeric_cols]
y = df['price']

# Hold out a fixed fraction of rows for testing. The seed is an explicit
# integer: a boolean False is coerced to the integer 0 when used as a seed, so
# 0 yields the same split while making the intent (a fixed seed) clear.
TEST_SIZE = 0.2
RANDOM_STATE = 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

## Choose and Apply Model (Regressor)

In [5]:
# Create a linear regression estimator and fit it on the training split
# (fit() returns the estimator itself, so the calls can be chained)
model = LinearRegression().fit(X_train, y_train)
# Echo the fitted estimator as this cell's output
model

## Model Testing and Prediction

In [6]:
# Predict the target for the held-out test features using the fitted model
y_pred = model.predict(X=X_test)

## Display R² Score

In [7]:
# Compute R² between the true and predicted test targets, then report it
score = r2_score(y_true=y_test, y_pred=y_pred)
message = f"R² score on test set: {score:.4f}"
print(message)

R² score on test set: 0.7472


## Export Model

In [None]:
# Persist the fitted regression model (not the DataFrame) to disk with joblib.
# joblib is imported in the notebook's import cell at the top.
# joblib.dump returns the list of file names written, which is shown as this
# cell's output.
MODEL_PATH = 'linear_regression_model.joblib'
joblib.dump(model, MODEL_PATH)

['linear_regression_model.joblib']