In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load the dataset
file_path = 'cars.csv'
df = pd.read_csv(file_path)

# Inspect the data
print(df.head())
print(df.describe())
print(df.dtypes)

# Check for missing values
print(df.isnull().sum())

# Prepare the data
# Assuming 'price' is the target variable
X = df.drop(columns=['Price'])
y = df['Price']

# Convert categorical variables to numerical (if any)
X = pd.get_dummies(X, drop_first=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets with test size 0.1
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42)

# Initialize and train the models with hyperparameter tuning
# Using GridSearchCV for hyperparameter tuning

# Define the hyperparameters grid for Lasso and Ridge
param_grid = {'alpha': [0.01, 0.1, 1.0, 10, 100]}

# Initialize the models
linear_regression = LinearRegression()
lasso = Lasso()
ridge = Ridge()

# Perform GridSearchCV for Lasso and Ridge
lasso_cv = GridSearchCV(lasso, param_grid, cv=5)
ridge_cv = GridSearchCV(ridge, param_grid, cv=5)

# Fit the models
linear_regression.fit(X_train, y_train)
lasso_cv.fit(X_train, y_train)
ridge_cv.fit(X_train, y_train)

# Predict and evaluate the models
models = {
    'Linear Regression': linear_regression,
    'Lasso Regression': lasso_cv.best_estimator_,
    'Ridge Regression': ridge_cv.best_estimator_
}

results = {}

for name, model in models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2': r2}

# Print results
for name, metrics in results.items():
    print(f"{name}:")
    print(f"  Mean Squared Error: {metrics['MSE']:.2f}")
    print(f"  R-squared: {metrics['R2']:.2f}")
    print()

# Determine the best model based on R-squared value
best_model_name = max(results, key=lambda k: results[k]['R2'])
best_model_metrics = results[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"  Mean Squared Error: {best_model_metrics['MSE']:.2f}")
print(f"  R-squared: {best_model_metrics['R2']:.2f}")

# Provide a clear business solution based on the best model
if best_model_name == 'Lasso Regression':
    business_solution = "Lasso Regression provides the best model. This model can handle feature selection by reducing the coefficients of less important features to zero. The business should focus on the features with significant coefficients to optimize car prices."
elif best_model_name == 'Ridge Regression':
    business_solution = "Ridge Regression provides the best model. This model is useful for handling multicollinearity among features. The business should ensure that all significant features are optimized collectively to improve car prices."
else:
    business_solution = "Linear Regression provides the best model. The business should focus on the linear relationship between features and car prices to optimize their pricing strategy."

print(business_solution)


   Car_ID    Brand    Model  Year  Kilometers_Driven Fuel_Type Transmission  \
0       1   Toyota  Corolla  2018              50000    Petrol       Manual   
1       2    Honda    Civic  2019              40000    Petrol    Automatic   
2       3     Ford  Mustang  2017              20000    Petrol    Automatic   
3       4   Maruti    Swift  2020              30000    Diesel       Manual   
4       5  Hyundai   Sonata  2016              60000    Diesel    Automatic   

  Owner_Type  Mileage  Engine  Power  Seats    Price  
0      First       15    1498    108      5   800000  
1     Second       17    1597    140      5  1000000  
2      First       10    4951    395      4  2500000  
3      Third       23    1248     74      5   600000  
4     Second       18    1999    194      5   850000  
           Car_ID        Year  Kilometers_Driven     Mileage       Engine  \
count  100.000000   100.00000         100.000000  100.000000   100.000000   
mean    50.500000  2018.39000       28150

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Linear Regression:
  Mean Squared Error: 31621990168.61
  R-squared: 0.89

Lasso Regression:
  Mean Squared Error: 54769420518.22
  R-squared: 0.81

Ridge Regression:
  Mean Squared Error: 39687915402.24
  R-squared: 0.86

Best Model: Linear Regression
  Mean Squared Error: 31621990168.61
  R-squared: 0.89
Linear Regression provides the best model. The business should focus on the linear relationship between features and car prices to optimize their pricing strategy.
