# In [None]:
#                                  Car Data Set Project


import pandas as pd
import numpy as n
import matplotlib.pyplot as plt


# 1. Collect and preprocess the data: load the mileage and sale-price data for
# used cars, then clean it by removing duplicates, converting 'Mileage' to a
# numeric column, and dropping rows whose mileage is missing.

df = pd.read_csv(r"Downloads\CarDataSet.csv")   # path is relative to the working directory

# Display the first few rows of the dataset
print(df.head())

# Check basic statistics and data types.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None".
df.info()

# Cleaning the data

# Drop exact duplicate rows
df = df.drop_duplicates()

# Convert 'Mileage' to float.
# When the column holds text, strip everything except digits and the decimal
# point (units, thousands separators) first. pd.to_numeric(errors='coerce')
# turns values that end up empty or malformed into NaN instead of raising, and
# also accepts a column that pandas already parsed as numeric.
mileage_raw = df['Mileage']
if not pd.api.types.is_numeric_dtype(mileage_raw):
    mileage_raw = mileage_raw.astype(str).str.replace(r'[^0-9.]', '', regex=True)
df['Mileage'] = pd.to_numeric(mileage_raw, errors='coerce')

# Handle missing values: drop rows whose mileage is absent or could not be parsed
df = df.dropna(subset=['Mileage'])


# Data transformation
# Standardise mileage with scikit-learn's StandardScaler and keep the result in
# its own column, leaving the raw 'Mileage' values untouched.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
mileage_scaled = scaler.fit_transform(df[['Mileage']])  # 2-D input: one feature column
df['Mileage_scaled'] = mileage_scaled


# 2. Conduct exploratory data analysis (EDA):
# Look for patterns and relationships between mileage and sale price for each car brand.

# Group the data by unique car brands
brand_groups = df.groupby('Brand')

# Draw one Mileage-vs-Price scatter plot per brand
for brand, group in brand_groups:
    # Create a new figure for each brand
    plt.figure(figsize=(3, 3))

    # Scatter plot of Mileage vs. Price for the current brand
    plt.scatter(group['Mileage'], group['Price'], label=brand, alpha=0.5)
    plt.title(f'{brand} - Mileage vs. Price')
    plt.xlabel('Mileage')
    plt.ylabel('Price')
    plt.legend()

    # Display the plot for the current brand. This must stay inside the loop:
    # otherwise every brand's figure stays open and they are all shown
    # together only after the loop finishes.
    plt.show()



# 3. Split the data: split the data into training and testing sets. The training
# set is used to build the regression model and the testing set to evaluate it.

from sklearn.model_selection import train_test_split

# Remove rows with null values in the columns the model actually uses.
# Restricting the check to these columns avoids throwing away usable rows just
# because some unrelated column is empty.
model_columns = ['Mileage', 'Year', 'Price']
df.dropna(subset=model_columns, inplace=True)

# NOTE: 'Mileage' is already a float column from the preprocessing step, so it
# is not converted to str and re-extracted with a regex again. That round trip
# was redundant, relied on a non-raw pattern string with invalid escape
# sequences, and could introduce new NaN values after the null rows had
# already been dropped.

# Define the features (X) and the target variable (y)
X = df[['Mileage', 'Year']]  # Add other relevant features
y = df['Price']

# Split the data into training and testing sets (80% train, 20% test);
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 4. Choose a regression model: pick a model that suits the data and the research
# question. Common choices for this problem are linear, polynomial and multiple
# regression; an ordinary linear regression on the selected features is used here.

from sklearn.linear_model import LinearRegression

# Build the model and train it on the training split in one step
# (fit() returns the fitted estimator itself).
linear_reg_model = LinearRegression().fit(X_train, y_train)

# Predicted car prices for the held-out test split
y_pred_linear = linear_reg_model.predict(X_test)



# 5. Evaluate the regression model: measure its performance on the test set with
# MAE, mean squared error (MSE), root mean squared error (RMSE) and R-squared.

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_linear)
mse = mean_squared_error(y_test, y_pred_linear)
# RMSE is the square root of MSE. It is derived here instead of calling
# mean_squared_error(..., squared=False), because the `squared` keyword is
# deprecated and no longer accepted by newer scikit-learn releases.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred_linear)

# Print the evaluation metrics
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R-squared (R2): {r2}")




# 6. Interpret the coefficients: inspect the regression coefficients to judge whether
# mileage is a good predictor of sale price for each brand of car, and whether the
# slopes and intercepts differ between brands.


# Get the coefficients and intercept of the overall model
intercept = linear_reg_model.intercept_
coefficient = linear_reg_model.coef_[0]  # 'Mileage' is the first feature column

# Display the overall coefficients: one per feature the model was trained on,
# so the 'Year' coefficient is reported alongside 'Mileage'.
print(f"Intercept: {intercept}")
for feature_name, feature_coef in zip(X_train.columns, linear_reg_model.coef_):
    print(f"Coefficient ({feature_name}): {feature_coef}")

# Group the data by unique car brands. The 'Brand' column is used so that this
# matches the grouping used for the EDA plots.
brand_groups = df.groupby('Brand')

# Loop through each brand to analyze coefficients
for brand, group in brand_groups:
    X_brand = group[['Mileage']]  # Features for this brand
    y_brand = group['Price']      # Target variable for this brand

    # Create and fit a Linear Regression model for this brand
    model_brand = LinearRegression()
    model_brand.fit(X_brand, y_brand)

    # Get the coefficients and intercept for this brand
    intercept_brand = model_brand.intercept_
    coefficient_brand = model_brand.coef_[0]

    # Display brand-specific coefficients; the slope is printed as well,
    # since it is what gets compared between brands.
    print(f"\nBrand: {brand}")
    print(f"Intercept: {intercept_brand}")
    print(f"Coefficient (Mileage): {coefficient_brand}")
    
    
    
    
# 7. Make predictions: with the model trained and evaluated, use it to predict on
# new data, e.g. to estimate sale prices of luxury cars from their mileage.


# Step 1: input features for the cars to be priced.
# NOTE(review): the training features are used as a stand-in here; replace
# X_train with the real mileage values of the cars of interest.
luxury_cars_mileage = pd.DataFrame(X_train)

# Step 2: run the trained regression model on those features
luxury_cars_predictions = linear_reg_model.predict(luxury_cars_mileage)

# Pair each mileage value with its predicted price (same row index as the input)
luxury_cars_predictions_df = luxury_cars_mileage[['Mileage']].assign(
    Predicted_Price=luxury_cars_predictions
)

# Print the predicted prices for luxury cars
print(luxury_cars_predictions_df)


# 8. Provide recommendations: based on the analysis, advise the used car dealership
# on how to price its cars to achieve its business objectives.


# Slope magnitude (price change per unit of mileage) below which mileage is
# treated as having no meaningful effect on price.
# NOTE(review): this cut-off depends on the units of 'Mileage' and 'Price' - confirm.
SLOPE_THRESHOLD = 0.05

# Loop through each brand to analyze coefficients
for brand, group in brand_groups:
    X_brand = group[['Mileage']]  # Features for this brand
    y_brand = group['Price']      # Target variable for this brand

    # Create and fit a Linear Regression model for this brand
    model_brand = LinearRegression()
    model_brand.fit(X_brand, y_brand)

    # Get the coefficients and intercept for this brand
    intercept_brand = model_brand.intercept_
    coefficient_brand = model_brand.coef_[0]

    # Display brand-specific coefficients and recommendations
    print(f"\nBrand: {brand}")
    print(f"Intercept: {intercept_brand}")
    print(f"Coefficient (Mileage): {coefficient_brand}")

    # Provide recommendations based on coefficient analysis.
    # The three ranges cover the whole number line without gaps: a slope of
    # exactly -SLOPE_THRESHOLD counts as a negative effect instead of falling
    # through to the generic branch.
    if coefficient_brand <= -SLOPE_THRESHOLD:
        print("Recommend pricing older cars with higher mileage lower.")
    elif coefficient_brand < SLOPE_THRESHOLD:
        print("Mileage may not be a significant factor in pricing for this brand.")
    else:
        print("Consider pricing based on mileage, but also consider market demand and competition.")







