In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Tell the reader where the dataset lives and which file name to save it under.
url = "https://www.kaggle.com/datasets/hellbuoy/car-price-prediction"
print(
    "Please download the car price dataset from:",
    url,
)
print(
    "Expected filename: CarPrice_Assignment.csv",
)

In [None]:
# Load the car-price CSV from the working directory. When the file is absent,
# build a seeded synthetic frame carrying the column names the later cells use,
# so the notebook can still be run end to end for demonstration.
try:
    df = pd.read_csv('CarPrice_Assignment.csv')
except FileNotFoundError:
    # Only a missing file triggers the fallback. Interrupts and genuine read or
    # parse failures propagate instead of being silently replaced by fake data.
    print("Dataset not found. Creating sample data for demonstration...")
    np.random.seed(42)
    n_samples = 1000

    # Category pools the synthetic rows are sampled from.
    brands = ['toyota', 'honda', 'bmw', 'audi', 'mercedes-benz', 'volkswagen', 'ford', 'chevrolet', 'nissan', 'hyundai']
    fuel_types = ['gas', 'diesel']
    aspiration_types = ['std', 'turbo']
    door_numbers = ['two', 'four']
    body_styles = ['sedan', 'hatchback', 'wagon', 'hardtop', 'convertible']
    drive_wheels = ['rwd', 'fwd', '4wd']
    engine_locations = ['front', 'rear']
    engine_types = ['dohc', 'ohcv', 'ohc', 'l', 'rotor']
    cylinder_numbers = ['four', 'six', 'five', 'eight', 'two', 'three']
    fuel_systems = ['mpfi', '2bbl', 'mfi', '1bbl', 'spfi', '4bbl', 'idi', 'spdi']

    data = []
    for i in range(n_samples):
        # NOTE: the order of the random draws below is deliberate. Changing it
        # would change every generated value for the fixed seed.
        brand = np.random.choice(brands)

        symboling = np.random.randint(-2, 4)
        # Drawn but not stored in the record; kept so the seeded random stream
        # (and therefore the generated rows) stays the same.
        normalized_losses = np.random.uniform(65, 256)
        fuel_type = np.random.choice(fuel_types)
        aspiration = np.random.choice(aspiration_types)
        num_doors = np.random.choice(door_numbers)
        body_style = np.random.choice(body_styles)
        drive_wheel = np.random.choice(drive_wheels)
        engine_location = np.random.choice(engine_locations)
        wheel_base = np.random.uniform(86, 120)
        length = np.random.uniform(141, 208)
        width = np.random.uniform(60, 73)
        height = np.random.uniform(47, 60)
        curb_weight = np.random.uniform(1488, 4066)
        engine_type = np.random.choice(engine_types)
        num_cylinders = np.random.choice(cylinder_numbers)
        engine_size = np.random.uniform(61, 326)
        fuel_system = np.random.choice(fuel_systems)
        bore = np.random.uniform(2.54, 3.94)
        stroke = np.random.uniform(2.07, 4.17)
        compression_ratio = np.random.uniform(7, 23)
        horsepower = np.random.uniform(48, 288)
        peak_rpm = np.random.uniform(4150, 6600)
        city_mpg = np.random.uniform(13, 49)
        highway_mpg = np.random.uniform(16, 54)

        # Brand-dependent starting price.
        base_price = 5000
        if brand in ['bmw', 'audi', 'mercedes-benz']:
            base_price += 20000
        elif brand in ['toyota', 'honda']:
            base_price += 8000

        # Price is linear in horsepower, engine size and curb weight, plus
        # Gaussian noise, and is floored at 5000.
        price = base_price + horsepower * 50 + engine_size * 30 + curb_weight * 2
        price += np.random.normal(0, 3000)
        price = max(5000, price)

        data.append({
            'car_ID': i + 1,
            'symboling': symboling,
            'CarName': f"{brand} model{i}",
            'fueltype': fuel_type,
            'aspiration': aspiration,
            'doornumber': num_doors,
            'carbody': body_style,
            'drivewheel': drive_wheel,
            'enginelocation': engine_location,
            'wheelbase': wheel_base,
            'carlength': length,
            'carwidth': width,
            'carheight': height,
            'curbweight': curb_weight,
            'enginetype': engine_type,
            'cylindernumber': num_cylinders,
            'enginesize': engine_size,
            'fuelsystem': fuel_system,
            'boreratio': bore,
            'stroke': stroke,
            'compressionratio': compression_ratio,
            'horsepower': horsepower,
            'peakrpm': peak_rpm,
            'citympg': city_mpg,
            'highwaympg': highway_mpg,
            'price': price
        })

    # Build the frame once from the collected records.
    df = pd.DataFrame(data)

In [None]:
# Report the frame's dimensions and its column/dtype summary.
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# Preview the leading rows: heading first, then the frame on the next line.
print("\nFirst 5 rows:", df.head(5), sep="\n")

In [None]:
# Summary statistics as produced by DataFrame.describe().
print("\nDataset Description:", df.describe(), sep="\n")

In [None]:
# Count of missing entries per column.
print("\nMissing Values:", df.isna().sum(), sep="\n")

In [None]:
# Derive a lower-cased `brand` column from the first whitespace-separated
# token of CarName. Skipped when the frame has no CarName column.
if 'CarName' in df.columns:
    leading_token = df['CarName'].str.split().str.get(0)
    df['brand'] = leading_token.str.lower()

In [None]:
# Impute missing values: numeric columns receive their median, object (text)
# columns their most frequent value.
df = df.fillna(df.median(numeric_only=True))

for col in df.select_dtypes(include=['object']).columns:
    modes = df[col].mode()
    # mode() is empty when every value in the column is missing; there is
    # nothing sensible to fill with, so the column is left as it is.
    if modes.empty:
        continue
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # df[col]. That chained in-place call acts on an intermediate object and
    # does not update df under pandas Copy-on-Write.
    df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Headline figures for the target column: mean, median and min-max range.
price_col = df['price']
lowest_price, highest_price = price_col.min(), price_col.max()

print("\nPrice Statistics:")
print(f"Average Price: ${price_col.mean():.2f}")
print(f"Median Price: ${price_col.median():.2f}")
print(f"Price Range: ${lowest_price:.2f} - ${highest_price:.2f}")

In [None]:
# Exploratory figure: a 3x4 grid of panels relating price to other columns.
plt.figure(figsize=(20, 15))


def _scatter_vs_price(slot, column, heading, x_label, colour):
    """Scatter `column` against price in grid position `slot` (1-based)."""
    plt.subplot(3, 4, slot)
    plt.scatter(df[column], df['price'], alpha=0.6, color=colour)
    plt.title(heading)
    plt.xlabel(x_label)
    plt.ylabel('Price ($)')


def _box_by_category(slot, column, heading, tilt_labels=False):
    """Box-plot price per level of `column` in grid position `slot` (1-based).

    The panel is always created; it is left empty when `column` is absent.
    """
    plt.subplot(3, 4, slot)
    if column not in df.columns:
        return
    sns.boxplot(data=df, x=column, y='price')
    plt.title(heading)
    if tilt_labels:
        plt.xticks(rotation=45)


# Panel 1: histogram of the target.
plt.subplot(3, 4, 1)
plt.hist(df['price'], bins=30, alpha=0.7, color='blue')
plt.title('Price Distribution')
plt.xlabel('Price ($)')
plt.ylabel('Frequency')

# Panel 2: the ten brands with the highest mean price.
plt.subplot(3, 4, 2)
if 'brand' in df.columns:
    brand_prices = (
        df.groupby('brand')['price']
        .mean()
        .sort_values(ascending=False)
        .head(10)
    )
    bar_positions = range(len(brand_prices))
    plt.bar(bar_positions, brand_prices.values)
    plt.title('Average Price by Brand (Top 10)')
    plt.xlabel('Brand')
    plt.ylabel('Average Price ($)')
    plt.xticks(bar_positions, brand_prices.index, rotation=45)

# Panels 3-5: price against engine/weight measurements.
_scatter_vs_price(3, 'horsepower', 'Price vs Horsepower', 'Horsepower', 'red')
_scatter_vs_price(4, 'enginesize', 'Price vs Engine Size', 'Engine Size', 'green')
_scatter_vs_price(5, 'curbweight', 'Price vs Curb Weight', 'Curb Weight', 'purple')

# Panels 6-8: price split by categorical attributes.
_box_by_category(6, 'fueltype', 'Price by Fuel Type')
_box_by_category(7, 'carbody', 'Price by Car Body', tilt_labels=True)
_box_by_category(8, 'drivewheel', 'Price by Drive Wheel')

# Panels 9-10: price against fuel economy.
_scatter_vs_price(9, 'citympg', 'Price vs City MPG', 'City MPG', 'orange')
_scatter_vs_price(10, 'highwaympg', 'Price vs Highway MPG', 'Highway MPG', 'brown')

# Panel 11: correlation heatmap over whichever listed numeric columns exist.
plt.subplot(3, 4, 11)
numeric_cols = ['wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight',
                'enginesize', 'boreratio', 'stroke', 'compressionratio',
                'horsepower', 'peakrpm', 'citympg', 'highwaympg', 'price']
available_cols = [col for col in numeric_cols if col in df.columns]
if len(available_cols) > 1:
    correlation_matrix = df[available_cols].corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
    plt.title('Correlation Matrix')

# Panel 12: price by cylinder count.
# Side effect: a numeric `cylinder_numeric` column is added to df here. It is
# not drawn in this figure, but it stays on the frame for whatever uses df next.
if 'cylindernumber' in df.columns:
    cylinder_mapping = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12}
    df['cylinder_numeric'] = df['cylindernumber'].map(cylinder_mapping)
_box_by_category(12, 'cylindernumber', 'Price by Number of Cylinders', tilt_labels=True)

plt.tight_layout()
plt.show()

In [None]:
# Work on a copy so the label encoding below does not touch df.
df_model = df.copy()

categorical_columns = df_model.select_dtypes(include=['object']).columns
le_dict = {}

# Identifier-like columns are left unencoded.
not_encoded = ('car_ID', 'CarName')

for col in categorical_columns:
    if col in not_encoded:
        continue
    as_text = df_model[col].astype(str)
    # One encoder per column, kept in le_dict so the integer codes can be
    # mapped back to the original labels later.
    le = LabelEncoder().fit(as_text)
    df_model[col] = le.transform(as_text)
    le_dict[col] = le

In [None]:
# Features are every column except the target and the two identifier columns.
feature_columns = df_model.columns.drop(
    ['price', 'car_ID', 'CarName'], errors='ignore'
).tolist()
X = df_model[feature_columns]
y = df_model['price']

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The scaler learns its statistics from the training split only and is then
# applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Candidate regressors, keyed by display name.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

results = {}      # display name -> metric dict
predictions = {}  # display name -> test-set predictions

for name, model in models.items():
    # Only the linear model is fed the standardised features; the tree
    # ensembles are trained on the unscaled ones.
    wants_scaled = name == 'Linear Regression'
    fit_features = X_train_scaled if wants_scaled else X_train
    eval_features = X_test_scaled if wants_scaled else X_test

    model.fit(fit_features, y_train)
    y_pred = model.predict(eval_features)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    results[name] = dict(MSE=mse, R2=r2, MAE=mae, RMSE=rmse)
    predictions[name] = y_pred

    print(
        f"\n{name} Results:",
        f"Mean Squared Error: {mse:.2f}",
        f"Root Mean Squared Error: {rmse:.2f}",
        f"R² Score: {r2:.4f}",
        f"Mean Absolute Error: {mae:.2f}",
        sep="\n",
    )

In [None]:
# Pick the model whose test-set R² is highest (the first one wins on a tie).
best_model, best_metrics = max(results.items(), key=lambda entry: entry[1]['R2'])
print(f"\nBest Model: {best_model} with R² Score: {best_metrics['R2']:.4f}")

In [None]:
# Model-evaluation figure: a 2x3 grid comparing the fitted models.
plt.figure(figsize=(15, 10))

# Panel 1: R² per model, each bar annotated with its score.
plt.subplot(2, 3, 1)
model_names = list(results.keys())
r2_scores = [results[name]['R2'] for name in model_names]
bars = plt.bar(model_names, r2_scores, color=['skyblue', 'lightgreen', 'lightcoral'])
plt.title('Model Comparison - R² Scores')
plt.ylabel('R² Score')
plt.xlabel('Models')
for bar, score in zip(bars, r2_scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f'{score:.3f}', ha='center', va='bottom')

# Panel 2: RMSE per model.
plt.subplot(2, 3, 2)
rmse_scores = [results[name]['RMSE'] for name in model_names]
plt.bar(model_names, rmse_scores, color=['orange', 'pink', 'lightblue'])
plt.title('Model Comparison - RMSE')
plt.ylabel('RMSE')
plt.xlabel('Models')

# Panel 3: predicted vs actual price; the dashed diagonal marks a perfect fit.
plt.subplot(2, 3, 3)
for name in model_names:
    plt.scatter(y_test, predictions[name], alpha=0.6, label=name)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices')
plt.legend()

# Panel 4: ten largest Random Forest feature importances.
plt.subplot(2, 3, 4)
# Reuse the forest already fitted in the training loop: it was built with the
# same hyper-parameters and seed and fitted on the same training split, so
# refitting a second copy here would only repeat the work.
rf_model = models['Random Forest']
feature_importance = rf_model.feature_importances_
indices = np.argsort(feature_importance)[::-1][:10]  # descending, top 10
plt.bar(range(len(indices)), feature_importance[indices])
plt.title('Top 10 Feature Importance (Random Forest)')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.xticks(range(len(indices)), [feature_columns[i] for i in indices], rotation=45)

# Panel 5: Random Forest residuals (actual minus predicted) against predictions.
plt.subplot(2, 3, 5)
residuals = y_test - predictions['Random Forest']
plt.scatter(predictions['Random Forest'], residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Price')
plt.ylabel('Residuals')
plt.title('Residual Plot (Random Forest)')

# Panel 6: share of cars in each of five equal-width price bands.
plt.subplot(2, 3, 6)
price_ranges = pd.cut(df['price'], bins=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
price_distribution = price_ranges.value_counts()
plt.pie(price_distribution.values, labels=price_distribution.index, autopct='%1.1f%%')
plt.title('Price Range Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Tabulate the highest-ranked features next to their importance scores.
print("\nFeature Importance Analysis:")
leading = indices[:10]
feature_importance_df = pd.DataFrame(
    {
        'Feature': [feature_columns[position] for position in leading],
        'Importance': feature_importance[leading],
    }
)
print(feature_importance_df.round(decimals=4))

In [None]:
# Closing summary: a heading followed by one line per point.
print("\nKey Insights:")
print(
    "1. Car price prediction models trained and evaluated",
    "2. Random Forest and Gradient Boosting typically perform better than Linear Regression",
    "3. Important features for price prediction identified",
    "4. Model performance metrics calculated for comparison",
    "5. Comprehensive visualizations created for data exploration and model evaluation",
    sep="\n",
)