In [350]:
# Step 1: Import required libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Suppress every warning for the rest of the session.
# NOTE(review): with no category or module given, this filter also hides
# deprecation notices raised by the libraries used below — consider narrowing it.
warnings.filterwarnings("ignore")



In [None]:
# Load the used-car dataset.
# The location can be overridden by setting the USED_CAR_DATASET environment
# variable, so the notebook is runnable on machines other than the author's;
# when the variable is unset the original local path is used unchanged.
DATA_PATH = os.environ.get(
    'USED_CAR_DATASET',
    r'C:\Users\ayush\OneDrive\Desktop\new\used_car_dataset_large.csv',
)
data = pd.read_csv(DATA_PATH)
print(data.head())


In [None]:

# Structural overview (dtypes, non-null counts, memory usage).
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Number of missing values in each column.
print(data.isnull().sum())

# Correlation is only computed over numeric columns.
numeric_data = data.select_dtypes(include=[np.number])

# Annotated heatmap of the pairwise correlations between numeric columns.
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_data.corr(), annot=True, cmap='pink', fmt='.2f')
plt.title('Correlation Between Numeric Features')
plt.show()


In [None]:

# Fill gaps in every numeric column with that column's own mean.
numeric_columns = data.select_dtypes(include=[np.number]).columns
column_means = data[numeric_columns].mean()
data[numeric_columns] = data[numeric_columns].fillna(column_means)

# Re-check the per-column missing-value counts after imputation.
remaining_nulls = data.isnull().sum()
print(remaining_nulls)


In [None]:

# Remove fully duplicated rows from the working frame (modifies `data` itself).
data.drop_duplicates(inplace=True)

# Report the (rows, columns) shape that is left.
remaining_shape = data.shape
print(remaining_shape)


In [355]:
# Derive the age of each car from its manufacture year.
# The reference year matches the one used where car_age is recomputed from the
# 'Year' column later in the notebook, so the feature means the same thing
# wherever it is calculated.
current_year = 2024

# Accept either spelling of the manufacture-year column; the rest of the
# notebook reads it as 'Year'. If neither is present, car_age is left undefined.
year_column = next(
    (name for name in ('year_of_manufacture', 'Year') if name in data.columns),
    None,
)
if year_column is not None:
    data['car_age'] = current_year - data[year_column]


In [None]:

# Histogram of the Price column with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Price'], kde=True, color='blue', ax=ax)
ax.set_title('Price Distribution')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Age of each car relative to a fixed reference year
current_year = 2024  # or use pd.Timestamp.now().year
manufacture_year = data['Year']
data['car_age'] = current_year - manufacture_year

# Scatter plot of price against car age
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=data['car_age'], y=data['Price'], ax=ax)
ax.set_title('Price vs Car Age')
ax.set_xlabel('Car Age')
ax.set_ylabel('Price')
plt.show()


In [358]:

# Working copy without the 'Fuel' and 'Transmission' columns; errors='ignore'
# keeps this from failing when either column is absent.
excluded_columns = ['Fuel', 'Transmission']
data_for_modeling = data.drop(columns=excluded_columns, errors='ignore')

# Target is Price; every remaining column is a feature.
target_column = 'Price'
y = data_for_modeling[target_column]
X = data_for_modeling.drop(columns=[target_column])

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Shapes of the four resulting pieces.
print(*(part.shape for part in split_parts))


In [None]:

from sklearn.model_selection import train_test_split

# One-hot encode the categorical columns of the full frame; drop_first=True
# omits the first level of each encoded column.
data_encoded = pd.get_dummies(data, drop_first=True)

# Target and feature matrix taken from the encoded frame (these assignments
# replace any X and y defined earlier).
y = data_encoded['Price']
X = data_encoded.drop(columns=['Price'])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Check the shapes of the data
for_display = (X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(*for_display)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Fit an ordinary least-squares model on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred = model.predict(X_test)

# Error metrics on the test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is taken as the square root of the MSE rather than via
# mean_squared_error(..., squared=False): that argument is deprecated in
# scikit-learn 1.4 and removed in 1.6, where the call raises a TypeError.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')


In [None]:
# Pair each feature name with its fitted coefficient, then order the table by
# absolute coefficient size (largest first).
coefficients = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values(by='Abs_Coefficient', ascending=False)
)

# Show only the name and signed coefficient; the helper column stays in the frame.
print(coefficients[['Feature', 'Coefficient']])


In [None]:

# Single hypothetical car to price with the fitted model.
new_data = {
    'Year': [2008],
    'Mileage': [2000],
    'Make': ['Toyota'],
    'Model': ['Camry'],
    'Transmission': ['manual']
}

new_data_df = pd.DataFrame(new_data)

# The training features include car_age derived from Year. Without this step
# the reindex below would fill car_age with 0, giving the model a 2008 car
# that is simultaneously "0 years old".
if 'car_age' in X.columns:
    new_data_df['car_age'] = current_year - new_data_df['Year']

new_data_encoded = pd.get_dummies(new_data_df)

# Dummy columns with no counterpart in the training features are discarded by
# the reindex below. That is expected for a baseline category (training used
# drop_first=True) but also happens for a misspelled or unseen value, so the
# dropped names are reported instead of vanishing silently.
unmatched_columns = [col for col in new_data_encoded.columns if col not in X.columns]
if unmatched_columns:
    print(f"Note: columns not among the training features (baseline or unseen category): {unmatched_columns}")

# Align to the exact training column set and order; training features that
# the new row does not supply are set to 0.
new_data_encoded = new_data_encoded.reindex(columns=X.columns, fill_value=0)

new_price_prediction = model.predict(new_data_encoded)
print(f"Predicted price for new car data: {new_price_prediction[0]} rupees")
