## Load File

In [None]:
import pandas as pd
# Read the raw automobile dataset from a CSV file located in the working directory.
df = pd.read_csv('Automobile.csv')
# Quick sanity check: report the (rows, columns) shape and preview the first rows.
print(f"Shape of the DataFrame: {df.shape}")
display(df.head())
# NOTE: rows with missing values are not dropped here (the earlier
# `df.dropna(inplace=True)` step is disabled); gaps are imputed in the Cleaning cell.

## Cleaning

In [None]:
# Columns that must be numeric for modelling but may contain non-numeric
# placeholders and/or missing values.
numeric_cols_with_gaps = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']

# Step 1: coerce to numeric FIRST. Anything that cannot be parsed becomes NaN,
# so it is picked up by the imputation below (median() also needs numeric data).
df[numeric_cols_with_gaps] = df[numeric_cols_with_gaps].apply(pd.to_numeric, errors='coerce')

# Step 2: impute the remaining gaps with each column's median.
df[numeric_cols_with_gaps] = df[numeric_cols_with_gaps].fillna(df[numeric_cols_with_gaps].median())

# Categorical column: fill gaps with the most frequent value. Assign the result
# back instead of using a chained `inplace=True`, which does not reliably modify
# `df` (deprecated, and a no-op under pandas copy-on-write).
df['num-of-doors'] = df['num-of-doors'].fillna(df['num-of-doors'].mode()[0])

## Splitting

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Feature selection: chosen by intuition and earlier exploration. Attributes such
# as 'make', 'fuel-type', etc. are likely to influence the insurance risk rating.
features = ['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'num-of-cylinders', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price', 'normalized-losses']

X = df[features]
y = df['symboling']  # target variable

# Columns to one-hot encode vs. columns to standardise.
categorical_features = ['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'num-of-cylinders']
numerical_features = ['wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price', 'normalized-losses']

# Split the RAW features first so that the scaler/encoder statistics are learned
# from the training rows only (fitting on the full data leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': categories unseen during fit encode as all zeros.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    sparse_threshold=0,  # always return a dense array so it can be wrapped in a DataFrame
)

# Fit on the training split only, then apply the same transformation everywhere.
preprocessor.fit(X_train_raw)
processed_columns = preprocessor.get_feature_names_out()

# Keep the original row index so features stay aligned with y_train / y_test.
X_train = pd.DataFrame(preprocessor.transform(X_train_raw), columns=processed_columns, index=X_train_raw.index)
X_test = pd.DataFrame(preprocessor.transform(X_test_raw), columns=processed_columns, index=X_test_raw.index)

# Processed view of the full feature matrix, kept for inspection.
X_processed_df = pd.DataFrame(preprocessor.transform(X), columns=processed_columns, index=X.index)

display(X_processed_df.head())

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

## Training

In [None]:
from sklearn.linear_model import LinearRegression

# Build the linear regression estimator and fit it on the training split in a
# single chained call (fit() hands back the estimator itself).
linear_regression_model = LinearRegression().fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
linear_regression_model

## Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt  # required by the plots below; was never imported

# Make predictions on the held-out test set
y_pred = linear_regression_model.predict(X_test)

# Compute MSE once and derive RMSE from it
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Coefficient of determination
r2 = r2_score(y_test, y_pred)

# Residuals (actual - predicted), reused by the residual plot and the histogram
residuals = y_test - y_pred

# Print the evaluation metrics along with formulas and explanations
print("Evaluation Metrics:")

print("\n1. Root Mean Squared Error (RMSE)")
print("Formula: RMSE = sqrt(MSE) = sqrt(1/n * Σ(yi - ŷi)^2)")
print("Meaning: RMSE measures the average difference between the predicted and actual values in the same unit as the target variable.  Lower RMSE indicates better model performance.  A good score is close to 0, indicating that the model's predictions are very accurate.  A bad score is a high value, meaning the model's predictions are significantly off.")
print(f"RMSE: {rmse}")

# Actual vs. predicted: points on the red diagonal are perfect predictions
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred)
plt.xlabel("Actual Symboling")
plt.ylabel("Predicted Symboling")
plt.title("Actual vs Predicted Symboling")
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red')  # diagonal reference line
plt.show()

print("\n2. Mean Squared Error (MSE)")
print("Formula: MSE = 1/n * Σ(yi - ŷi)^2")
print("Meaning: MSE calculates the average squared difference between predicted and actual values.  It penalizes larger errors more heavily than smaller errors.  Similar to RMSE, a lower MSE signifies better performance.  A good score is close to 0, and a bad score is a large value.")
print(f"MSE: {mse}")

# Residuals vs. predictions: a structureless band around 0 is the desired pattern
plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals)
plt.xlabel("Predicted Symboling")
plt.ylabel("Residuals")
plt.title("Residual Plot")
plt.axhline(y=0, color='red', linestyle='--')  # horizontal reference line at y=0
plt.show()

print("\n3. R-squared (R2)")
print("Formula: R^2 = 1 - (SSres / SStot) = 1 - [Σ(yi - ŷi)^2 / Σ(yi - ȳ)^2]")
# R^2 is not bounded below by 0 on held-out data, so the explanation says so explicitly.
print("Meaning: R-squared represents the proportion of variance in the target variable explained by the model.  Its maximum is 1, which indicates that the model perfectly fits the data.  A good score is close to 1, indicating that a high proportion of the variance is explained.  A score close to 0 suggests the model doesn't explain much of the variance, and on test data it can even be negative when the model performs worse than always predicting the mean.")
print(f"R-squared: {r2}")

# Distribution of residuals
plt.figure(figsize=(8, 6))
plt.hist(residuals, bins=15)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Distribution of Residuals")
plt.show()
