In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



In [None]:
# Read the car-price CSV (path is relative to the notebook's working directory)
data_path = "car_price.csv"
df = pd.read_csv(data_path)

# Preview the first rows, then summarise columns, dtypes and non-null counts
print("Dataset Head:", df.head(), sep="\n")
print("\nDataset Info:")
df.info()


Dataset Head:
   Unnamed: 0     name  year  selling_price  km_driven    fuel seller_type  \
0           0   Maruti  2014         450000     145500  Diesel  Individual   
1           2  Hyundai  2010         225000     127000  Diesel  Individual   
2           4  Hyundai  2017         440000      45000  Petrol  Individual   
3           7   Toyota  2011         350000      90000  Diesel  Individual   
4           8     Ford  2013         200000     169000  Diesel  Individual   

  transmission        owner  seats  max_power (in bph) Mileage Unit  Mileage  \
0       Manual  First Owner      5               74.00         kmpl    23.40   
1       Manual  First Owner      5               90.00         kmpl    23.00   
2       Manual  First Owner      5               81.86         kmpl    20.14   
3       Manual  First Owner      5               67.10         kmpl    23.59   
4       Manual  First Owner      5               68.10         kmpl    20.00   

   Engine (CC)  
0         1248  
1 

In [None]:

# Report how many null entries each column contains
print("\nMissing Values:", df.isna().sum(), sep="\n")



Missing Values:
Unnamed: 0            0
name                  0
year                  0
selling_price         0
km_driven             0
fuel                  0
seller_type           0
transmission          0
owner                 0
seats                 0
max_power (in bph)    0
Mileage Unit          0
Mileage               0
Engine (CC)           0
dtype: int64


In [None]:

# Preprocessing: encode categoricals, split, select features, standardize.
#
# NOTE: this cell overwrites the object-typed columns of `df` in place.
# Re-running it without reloading `df` finds no object columns left, so
# `label_encoders` comes back empty; re-run the load cell first if the fitted
# encoders are needed again.
# NOTE(review): the CSV carries an 'Unnamed: 0' column that looks like a saved
# row index; it is still a candidate feature below - confirm that is intended.

# Encode categorical features as integer codes, keeping one fitted encoder per
# column in `label_encoders`.
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Feature-target split: 'selling_price' is the target column.
y = df['selling_price']
X = df.drop(columns=['selling_price'])

# Train-test split comes BEFORE feature selection so the held-out rows play no
# part in deciding which features the models get to see. The split depends only
# on the number of rows and random_state, not on which columns are present.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection using correlation with the target, measured on the
# training rows only. `high_corr_features` still contains 'selling_price'
# itself (it correlates perfectly with itself), so it is dropped before
# indexing the feature frames.
corr_matrix = df.loc[X_train.index].corr()
high_corr_features = corr_matrix.index[corr_matrix["selling_price"].abs() > 0.3]
selected_features = high_corr_features.drop("selling_price")
if selected_features.empty:
    # Fail here with a clear message instead of inside the scaler.
    raise ValueError("No feature has |correlation| > 0.3 with 'selling_price'.")

X = X[selected_features]
X_train = X_train[selected_features]
X_test = X_test[selected_features]

# Standardization: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:

# Candidate regressors: two regularized linear models and one tree ensemble
models = {
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
}

# Fit every model on the training split and score it on the held-out split
results = []
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    scores = {"MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r2}

    results.append({"Model": name, **scores})

    print(f"\n{name} Performance:")
    for metric, value in scores.items():
        print(f"{metric}:", value)

# One row per model, so the metrics can be compared side by side
results_df = pd.DataFrame(results)
print("\nModel Performance Comparison:")
print(results_df)

# Insights:
# - Ridge and Lasso are linear models with a regularization penalty, which
#   helps guard against overfitting.
# - Lasso can additionally act as a feature selector, because its penalty can
#   shrink some coefficients all the way to zero.
# - Random Forest can capture non-linear relationships and may therefore
#   predict better than the linear models.



Ridge Regression Performance:
MAE: 87461.92683799348
MSE: 13211697124.78236
RMSE: 114942.146859985
R2 Score: 0.7205055556597223

Lasso Regression Performance:
MAE: 87466.53627305046
MSE: 13214410049.376007
RMSE: 114953.94751541161
R2 Score: 0.7204481635363125

Random Forest Performance:
MAE: 53415.79585736254
MSE: 6430123187.492591
RMSE: 80188.04890688257
R2 Score: 0.8639702613257279

Model Performance Comparison:
              Model           MAE           MSE           RMSE  R2 Score
0  Ridge Regression  87461.926838  1.321170e+10  114942.146860  0.720506
1  Lasso Regression  87466.536273  1.321441e+10  114953.947515  0.720448
2     Random Forest  53415.795857  6.430123e+09   80188.048907  0.863970
