In [1]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Create synthetic car data
n_samples = 5000

# Car brands with different price tiers
brands_tier1 = ['Tesla', 'Mercedes', 'BMW', 'Audi', 'Porsche']
brands_tier2 = ['Lexus', 'Volvo', 'Jaguar', 'Land Rover']
brands_tier3 = ['Toyota', 'Honda', 'Ford', 'Chevrolet', 'Nissan', 'Hyundai', 'Kia']
all_brands = brands_tier1 + brands_tier2 + brands_tier3

# Generate data
data = {
    'car_id': range(1, n_samples + 1),
    'brand': np.random.choice(all_brands, n_samples),
    'model': np.random.choice(['Sedan', 'SUV', 'Truck', 'Coupe', 'Convertible', 'Hatchback', 'Minivan'], n_samples),
    'year': np.random.randint(2000, 2024, n_samples),
    'mileage': np.random.exponential(50000, n_samples).astype(int),
    'engine_size': np.round(np.random.uniform(1.0, 6.0, n_samples), 1),
    'fuel_type': np.random.choice(['Petrol', 'Diesel', 'Hybrid', 'Electric'], n_samples, p=[0.4, 0.3, 0.2, 0.1]),
    'transmission': np.random.choice(['Automatic', 'Manual', 'CVT'], n_samples, p=[0.7, 0.25, 0.05]),
    'owner_count': np.random.randint(0, 6, n_samples),
    'accident_history': np.random.choice([0, 1], n_samples, p=[0.85, 0.15]),
    'service_history': np.random.choice(['Full', 'Partial', 'None'], n_samples, p=[0.6, 0.3, 0.1]),
    'color': np.random.choice(['Black', 'White', 'Silver', 'Gray', 'Red', 'Blue', 'Green'], n_samples),
}

# Create DataFrame
df = pd.DataFrame(data)

# Calculate base price based on features
base_price = 20000

# Adjust price based on brand tier
def get_brand_multiplier(brand):
    if brand in brands_tier1:
        return 2.5
    elif brand in brands_tier2:
        return 1.8
    else:
        return 1.0

# Adjust price based on model type
def get_model_multiplier(model):
    multipliers = {
        'SUV': 1.3,
        'Truck': 1.2,
        'Coupe': 1.1,
        'Convertible': 1.4,
        'Sedan': 1.0,
        'Hatchback': 0.9,
        'Minivan': 0.95
    }
    return multipliers.get(model, 1.0)

# Calculate price for each car
prices = []
for i in range(n_samples):
    price = base_price
    
    # Brand multiplier
    brand_mult = get_brand_multiplier(df.loc[i, 'brand'])
    price *= brand_mult
    
    # Model multiplier
    model_mult = get_model_multiplier(df.loc[i, 'model'])
    price *= model_mult
    
    # Year effect ($1000 per year from 2000)
    year_effect = (df.loc[i, 'year'] - 2000) * 800
    price += year_effect
    
    # Mileage effect (-$0.08 per mile)
    mileage_effect = -df.loc[i, 'mileage'] * 0.08
    price += mileage_effect
    
    # Engine size effect (+$1500 per liter)
    engine_effect = df.loc[i, 'engine_size'] * 1500
    price += engine_effect
    
    # Fuel type premium
    if df.loc[i, 'fuel_type'] == 'Electric':
        price += 15000
    elif df.loc[i, 'fuel_type'] == 'Hybrid':
        price += 8000
    elif df.loc[i, 'fuel_type'] == 'Diesel':
        price += 3000
    
    # Transmission effect
    if df.loc[i, 'transmission'] == 'Manual':
        price -= 2000
    elif df.loc[i, 'transmission'] == 'CVT':
        price += 1000
    
    # Owner count effect (-$1500 per owner)
    owner_effect = -df.loc[i, 'owner_count'] * 1500
    price += owner_effect
    
    # Accident history effect (-35% if accident)
    if df.loc[i, 'accident_history'] == 1:
        price *= 0.65
    
    # Service history effect
    if df.loc[i, 'service_history'] == 'Partial':
        price *= 0.9
    elif df.loc[i, 'service_history'] == 'None':
        price *= 0.7
    
    # Add some randomness
    price += np.random.normal(0, 5000)
    
    # Ensure minimum price
    price = max(price, 3000)
    
    prices.append(price)

# Add price column
df['price'] = np.round(prices, 2)

# Save to CSV
df.to_csv('car_price.csv', index=False)
print(f"Created car_price.csv with {n_samples} records")
print(f"Price range: ${df['price'].min():,.2f} to ${df['price'].max():,.2f}")
print(f"Average price: ${df['price'].mean():,.2f}")

# Show sample data
print("\nSample data:")
print(df.head())

Created car_price.csv with 5000 records
Price range: $3,000.00 to $106,048.11
Average price: $43,078.93

Sample data:
   car_id      brand    model  year  mileage  engine_size fuel_type  \
0       1      Volvo    Truck  2008    27807          1.6    Hybrid   
1       2       Audi  Minivan  2003   109965          1.9    Petrol   
2       3  Chevrolet  Minivan  2018    93289          2.1    Hybrid   
3       4    Hyundai  Minivan  2023    37267          3.8    Diesel   
4       5      Honda    Coupe  2013    58802          4.9    Petrol   

  transmission  owner_count  accident_history service_history   color  \
0    Automatic            2                 0            None    Gray   
1    Automatic            3                 0         Partial  Silver   
2          CVT            0                 0         Partial    Blue   
3       Manual            3                 0            Full   Black   
4    Automatic            1                 0         Partial   Green   

      price  
0 

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
import json
warnings.filterwarnings('ignore')

# Load the data
print("Loading data...")
df = pd.read_csv('car_price.csv')

# Remove car_id and color columns as they're not useful for prediction
df = df.drop(['car_id', 'color'], axis=1)

print(f"Dataset shape: {df.shape}")
print(f"Features: {list(df.columns[:-1])}")
print(f"Target: price")

# Prepare features and target
X = df.drop('price', axis=1)
y = df['price']

# Define categorical and numerical features
categorical_features = ['brand', 'model', 'fuel_type', 'transmission', 'service_history']
numerical_features = ['year', 'mileage', 'engine_size', 'owner_count', 'accident_history']

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Create the model pipeline
print("\nCreating model pipeline...")
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        min_samples_split=5,
        min_samples_leaf=2,
        subsample=0.8,
        max_features='sqrt',
        random_state=42,
        n_iter_no_change=10,
        validation_fraction=0.1
    ))
])

# Split data
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Train model
print("\nTraining model...")
model.fit(X_train, y_train)

# Make predictions
print("Making predictions...")
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate metrics
print("\n" + "="*60)
print("MODEL PERFORMANCE METRICS")
print("="*60)

print("\nTraining Set:")
train_mae = mean_absolute_error(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)

print(f"MAE:  ${train_mae:,.2f}")
print(f"MSE:  ${train_mse:,.2f}")
print(f"RMSE: ${train_rmse:,.2f}")
print(f"R²:   {train_r2:.4f}")

print("\nTest Set:")
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

print(f"MAE:  ${test_mae:,.2f}")
print(f"MSE:  ${test_mse:,.2f}")
print(f"RMSE: ${test_rmse:,.2f}")
print(f"R²:   {test_r2:.4f}")

# Cross-validation
print("\nCross-validation scores (5-fold):")
cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f"R² scores: {cv_scores}")
print(f"Mean R²: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Get feature names after one-hot encoding
print("\nExtracting feature information...")
preprocessor.fit(X)
cat_encoder = preprocessor.named_transformers_['cat']
cat_feature_names = cat_encoder.get_feature_names_out(categorical_features)
all_feature_names = numerical_features + list(cat_feature_names)

# Prepare preprocessing information
feature_info = {
    'categorical_features': categorical_features,
    'numerical_features': numerical_features,
    'all_features': list(X.columns),
    'transformed_features': all_feature_names.tolist(),
    'categorical_categories': {},
    'numerical_stats': {},
    'price_stats': {
        'min': float(y.min()),
        'max': float(y.max()),
        'mean': float(y.mean()),
        'std': float(y.std())
    }
}

# Add categorical categories
for feature in categorical_features:
    unique_values = X[feature].unique().tolist()
    feature_info['categorical_categories'][feature] = unique_values

# Add numerical statistics
for feature in numerical_features:
    stats = {
        'min': float(X[feature].min()),
        'max': float(X[feature].max()),
        'mean': float(X[feature].mean()),
        'std': float(X[feature].std())
    }
    feature_info['numerical_stats'][feature] = stats

# Add model metadata
feature_info['model_metadata'] = {
    'model_type': 'GradientBoostingRegressor',
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'training_samples': len(X_train),
    'test_samples': len(X_test),
    'performance': {
        'train_r2': float(train_r2),
        'test_r2': float(test_r2),
        'train_mae': float(train_mae),
        'test_mae': float(test_mae),
        'train_rmse': float(train_rmse),
        'test_rmse': float(test_rmse)
    }
}

# Save the model
print("\nSaving model and preprocessing files...")
import os
os.makedirs('model', exist_ok=True)

# Save model
model_path = 'model/car_price_model.pkl'
joblib.dump(model, model_path)
print(f"✓ Model saved to {model_path}")

# Save preprocessing info
preprocessing_path = 'model/preprocessing.pkl'
joblib.dump(feature_info, preprocessing_path)
print(f"✓ Preprocessing info saved to {preprocessing_path}")

# Also save as JSON for easy reading
json_path = 'model/preprocessing_info.json'
with open(json_path, 'w') as f:
    json.dump(feature_info, f, indent=2, default=str)
print(f"✓ JSON info saved to {json_path}")

# Test the saved model
print("\nTesting saved model...")
loaded_model = joblib.load(model_path)
loaded_info = joblib.load(preprocessing_path)

# Create a test sample
test_sample = pd.DataFrame({
    'brand': ['Toyota'],
    'model': ['SUV'],
    'year': [2020],
    'mileage': [30000],
    'engine_size': [2.5],
    'fuel_type': ['Hybrid'],
    'transmission': ['Automatic'],
    'owner_count': [1],
    'accident_history': [0],
    'service_history': ['Full']
})

# Make prediction
prediction = loaded_model.predict(test_sample)[0]
print(f"\nTest prediction for sample car:")
print(f"Features: 2020 Toyota SUV, 30,000 miles, Hybrid, 1 owner, no accidents")
print(f"Predicted price: ${prediction:,.2f}")

print("\n" + "="*60)
print("MODEL TRAINING COMPLETE!")
print("="*60)
print(f"\nSummary:")
print(f"- Dataset size: {len(df)} cars")
print(f"- Features used: {len(feature_info['all_features'])}")
print(f"- Model R² score: {test_r2:.4f}")
print(f"- Average error: ${test_mae:,.2f}")
print(f"- Price range in dataset: ${feature_info['price_stats']['min']:,.2f} to ${feature_info['price_stats']['max']:,.2f}")

Loading data...
Dataset shape: (5000, 11)
Features: ['brand', 'model', 'year', 'mileage', 'engine_size', 'fuel_type', 'transmission', 'owner_count', 'accident_history', 'service_history']
Target: price

Creating model pipeline...
Splitting data...
Training set: (4000, 10)
Test set: (1000, 10)

Training model...
Making predictions...

MODEL PERFORMANCE METRICS

Training Set:
MAE:  $3,542.50
MSE:  $20,242,790.97
RMSE: $4,499.20
R²:   0.9422

Test Set:
MAE:  $4,824.99
MSE:  $36,035,791.24
RMSE: $6,002.98
R²:   0.8995

Cross-validation scores (5-fold):
R² scores: [0.90440984 0.89729712 0.89166278 0.90031385 0.90575976]
Mean R²: 0.8999 (+/- 0.0102)

Extracting feature information...


AttributeError: 'list' object has no attribute 'tolist'

In [3]:
import pandas as pd
import numpy as np
import joblib
import os
from datetime import datetime
import json

def create_car_price_csv():
    """Create car_price.csv with synthetic data"""
    print("Creating car_price.csv...")
    np.random.seed(42)
    n_samples = 5000
    
    # Car data
    brands_tier1 = ['Tesla', 'Mercedes', 'BMW', 'Audi', 'Porsche']
    brands_tier2 = ['Lexus', 'Volvo', 'Jaguar', 'Land Rover']
    brands_tier3 = ['Toyota', 'Honda', 'Ford', 'Chevrolet', 'Nissan', 'Hyundai', 'Kia']
    all_brands = brands_tier1 + brands_tier2 + brands_tier3
    
    data = {
        'car_id': range(1, n_samples + 1),
        'brand': np.random.choice(all_brands, n_samples),
        'model': np.random.choice(['Sedan', 'SUV', 'Truck', 'Coupe', 'Convertible', 'Hatchback', 'Minivan'], n_samples),
        'year': np.random.randint(2000, 2024, n_samples),
        'mileage': np.random.exponential(50000, n_samples).astype(int),
        'engine_size': np.round(np.random.uniform(1.0, 6.0, n_samples), 1),
        'fuel_type': np.random.choice(['Petrol', 'Diesel', 'Hybrid', 'Electric'], n_samples, p=[0.4, 0.3, 0.2, 0.1]),
        'transmission': np.random.choice(['Automatic', 'Manual', 'CVT'], n_samples, p=[0.7, 0.25, 0.05]),
        'owner_count': np.random.randint(0, 6, n_samples),
        'accident_history': np.random.choice([0, 1], n_samples, p=[0.85, 0.15]),
        'service_history': np.random.choice(['Full', 'Partial', 'None'], n_samples, p=[0.6, 0.3, 0.1]),
        'color': np.random.choice(['Black', 'White', 'Silver', 'Gray', 'Red', 'Blue', 'Green'], n_samples),
    }
    
    df = pd.DataFrame(data)
    
    # Calculate realistic prices
    def calculate_price(row):
        price = 20000
        
        # Brand effect
        if row['brand'] in brands_tier1:
            price *= 2.5
        elif row['brand'] in brands_tier2:
            price *= 1.8
        
        # Model effect
        model_mult = {
            'SUV': 1.3, 'Truck': 1.2, 'Coupe': 1.1,
            'Convertible': 1.4, 'Sedan': 1.0,
            'Hatchback': 0.9, 'Minivan': 0.95
        }.get(row['model'], 1.0)
        price *= model_mult
        
        # Year effect
        price += (row['year'] - 2000) * 800
        
        # Mileage effect
        price -= row['mileage'] * 0.08
        
        # Engine size
        price += row['engine_size'] * 1500
        
        # Fuel type
        if row['fuel_type'] == 'Electric':
            price += 15000
        elif row['fuel_type'] == 'Hybrid':
            price += 8000
        elif row['fuel_type'] == 'Diesel':
            price += 3000
        
        # Transmission
        if row['transmission'] == 'Manual':
            price -= 2000
        elif row['transmission'] == 'CVT':
            price += 1000
        
        # Owners
        price -= row['owner_count'] * 1500
        
        # Accidents
        if row['accident_history'] == 1:
            price *= 0.65
        
        # Service history
        if row['service_history'] == 'Partial':
            price *= 0.9
        elif row['service_history'] == 'None':
            price *= 0.7
        
        # Random variation
        price += np.random.normal(0, 5000)
        
        return max(price, 3000)
    
    df['price'] = df.apply(calculate_price, axis=1).round(2)
    
    # Save
    df.to_csv('car_price.csv', index=False)
    print(f"✓ Created car_price.csv with {n_samples} records")
    print(f"  Price range: ${df['price'].min():,.2f} to ${df['price'].max():,.2f}")
    return df

def create_dummy_model_files():
    """Create dummy model files for immediate use"""
    print("\nCreating model files...")
    
    # Create model directory
    os.makedirs('model', exist_ok=True)
    
    # Create a simple preprocessing info structure
    preprocessing_info = {
        'categorical_features': ['brand', 'model', 'fuel_type', 'transmission', 'service_history'],
        'numerical_features': ['year', 'mileage', 'engine_size', 'owner_count', 'accident_history'],
        'all_features': ['brand', 'model', 'year', 'mileage', 'engine_size', 
                        'fuel_type', 'transmission', 'owner_count', 
                        'accident_history', 'service_history'],
        'categorical_categories': {
            'brand': ['Toyota', 'Honda', 'Ford', 'BMW', 'Mercedes', 'Audi', 
                     'Hyundai', 'Tesla', 'Chevrolet', 'Nissan', 'Kia', 
                     'Lexus', 'Volvo', 'Jaguar', 'Land Rover', 'Porsche'],
            'model': ['Sedan', 'SUV', 'Truck', 'Coupe', 'Convertible', 'Hatchback', 'Minivan'],
            'fuel_type': ['Petrol', 'Diesel', 'Hybrid', 'Electric'],
            'transmission': ['Automatic', 'Manual', 'CVT'],
            'service_history': ['Full', 'Partial', 'None']
        },
        'numerical_stats': {
            'year': {'min': 2000, 'max': 2023, 'mean': 2015.5, 'std': 4.5},
            'mileage': {'min': 1000, 'max': 250000, 'mean': 75000, 'std': 40000},
            'engine_size': {'min': 1.0, 'max': 6.0, 'mean': 2.5, 'std': 1.0},
            'owner_count': {'min': 0, 'max': 5, 'mean': 1.5, 'std': 1.2},
            'accident_history': {'min': 0, 'max': 1, 'mean': 0.15, 'std': 0.36}
        },
        'price_stats': {
            'min': 3000,
            'max': 120000,
            'mean': 35000,
            'std': 20000
        },
        'model_metadata': {
            'model_type': 'DummyModel',
            'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'description': 'Placeholder model - run train_model.py to train actual model'
        }
    }
    
    # Save preprocessing info
    joblib.dump(preprocessing_info, 'model/preprocessing.pkl')
    
    # Create a dummy model class
    class DummyCarPriceModel:
        def predict(self, X):
            """Simple linear formula for price prediction"""
            if isinstance(X, pd.DataFrame):
                # Extract features
                predictions = []
                for _, row in X.iterrows():
                    price = 20000
                    
                    # Simple calculations
                    if row['brand'] in ['BMW', 'Mercedes', 'Audi', 'Tesla', 'Porsche']:
                        price *= 1.8
                    elif row['brand'] in ['Lexus', 'Volvo', 'Jaguar', 'Land Rover']:
                        price *= 1.4
                    
                    price += (row['year'] - 2000) * 800
                    price -= row['mileage'] * 0.1
                    price += row['engine_size'] * 2000
                    
                    if row['fuel_type'] == 'Electric':
                        price += 12000
                    elif row['fuel_type'] == 'Hybrid':
                        price += 6000
                    
                    if row['transmission'] == 'Manual':
                        price -= 1500
                    
                    price -= row['owner_count'] * 1000
                    
                    if row['accident_history'] == 1:
                        price *= 0.7
                    
                    predictions.append(price)
                
                return np.array(predictions)
            return np.array([35000])  # Default price
    
    # Create and save dummy model
    dummy_model = {
        'model': DummyCarPriceModel(),
        'preprocessor': None,
        '_is_fitted': True
    }
    
    joblib.dump(dummy_model, 'model/car_price_model.pkl')
    
    print("✓ Created dummy model files")
    print("⚠️  Note: These are placeholder files. Run train_model.py for actual trained model.")
    
    return True

def main():
    """Main function to create all files"""
    print("="*60)
    print("CAR PRICE PREDICTION PROJECT - FILE GENERATOR")
    print("="*60)
    
    # Create CSV file
    df = create_car_price_csv()
    
    # Create model directory
    os.makedirs('model', exist_ok=True)
    
    # Create dummy model files for immediate use
    create_dummy_model_files()
    
    # Create a simple train_model.py that users can run
    create_train_script()
    
    print("\n" + "="*60)
    print("ALL FILES CREATED SUCCESSFULLY!")
    print("="*60)
    print("\nFiles created:")
    print("1. car_price.csv - Dataset with 5,000 car records")
    print("2. model/car_price_model.pkl - Placeholder model file")
    print("3. model/preprocessing.pkl - Feature information")
    print("4. train_model.py - Script to train actual model")
    
    print("\nNext steps:")
    print("1. Install requirements: pip install -r requirements.txt")
    print("2. Train the model: python train_model.py")
    print("3. Run the app: streamlit run app.py")

def create_train_script():
    """Create a train_model.py script"""
    train_script = '''import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
import os

print("Loading data...")
df = pd.read_csv('car_price.csv')
df = df.drop(['car_id', 'color'], axis=1)

X = df.drop('price', axis=1)
y = df['price']

categorical_features = ['brand', 'model', 'fuel_type', 'transmission', 'service_history']
numerical_features = ['year', 'mileage', 'engine_size', 'owner_count', 'accident_history']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training model...")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\\nModel Performance:")
print(f"MAE: ${mae:,.2f}")
print(f"R² Score: {r2:.4f}")

os.makedirs('model', exist_ok=True)
joblib.dump(model, 'model/car_price_model.pkl')

feature_info = {
    'categorical_features': categorical_features,
    'numerical_features': numerical_features,
    'all_features': list(X.columns)
}
joblib.dump(feature_info, 'model/preprocessing.pkl')

print("\\nModel saved successfully!")
'''
    
    with open('train_model.py', 'w') as f:
        f.write(train_script)
    print("✓ Created train_model.py")

if __name__ == "__main__":
    main()

CAR PRICE PREDICTION PROJECT - FILE GENERATOR
Creating car_price.csv...
✓ Created car_price.csv with 5000 records
  Price range: $3,000.00 to $106,048.11

Creating model files...


PicklingError: Can't pickle <class '__main__.create_dummy_model_files.<locals>.DummyCarPriceModel'>: it's not found as __main__.create_dummy_model_files.<locals>.DummyCarPriceModel