# Main Objectives of this Project:
- Predict car prices using regression methods such as Linear Regression.

- Classify cars into their Category using classification methods.

# Dataset Description:
Here's a brief description for each of the attributes in this dataset:

1- ID: An identifier for each entry in the dataset. It's typically a unique value assigned to each record.

2- Price (Target Column): The price of the car, which is the target variable you want to predict in a machine learning context.

3- Levy: The amount of tax or fee imposed on the car. It could be associated with the registration or ownership of the vehicle.

4- Manufacturer: The company or brand that produced the car.

5- Model: The specific model name or designation of the car produced by the manufacturer.

6- Category: The category or type of the car, which could include designations like sedan, SUV, coupe, etc.

7- Leather interior: A binary indicator (yes/no) specifying whether the car has a leather interior.

8- Fuel type: The type of fuel the car uses, such as gasoline, diesel, hybrid, electric, etc.

9- Engine volume: The size of the car's engine, usually measured in liters or cubic centimeters.

10- Mileage: The total distance the car has traveled, often measured in kilometers or miles.

11- Cylinders: The number of cylinders in the car's engine.

12- Gear box type: The type of transmission the car has, such as automatic, manual, or semi-automatic.

13- Drive wheels: The configuration of wheels responsible for powering the car (e.g., front-wheel drive, rear-wheel drive, all-wheel drive).

14- Doors: The number of doors on the car.

15- Wheel: The side of the car on which the steering wheel is mounted, i.e. left-hand drive (LHD) or right-hand drive (RHD).

16- Color: The exterior color of the car.

17- Airbags: The number of airbags in the car, which are safety features designed to protect occupants in the event of a collision.

18- Prod. year: The year in which the car was manufactured or produced.

# Import libraries:

In [None]:
from datetime import datetime

import cufflinks as cf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import seaborn as sn
import seaborn as sns  # the plotting cells refer to seaborn as `sns`
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Read Dataset:
Load the data and store it in dataframe df

In [None]:
# Read the Kaggle car-price CSV into the working DataFrame `df`.
csv_path = "/kaggle/input/car-price-prediction-challenge/car_price_prediction.csv"
df = pd.read_csv(csv_path)

# Explore data:

In [None]:
# Preview the first 20 rows of the raw data.
df.head(20)

In [None]:
# Report the (rows, columns) dimensions of the raw data.
print("Shape:", df.shape)

In [None]:
# Print the column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the raw data.
df.describe()

In [None]:
# Number of distinct values in each column.
df.nunique()

# create a copy of a DataFrame to perform modifications without affecting the original DataFrame.

In [None]:
# All cleaning below is applied to this copy, so `df` keeps the data as loaded.
df1 = df.copy()

# Preprocessing:
Remove unnecessary Columns and Drop Duplicates

In [None]:
# Discard the ID and Model columns from the working copy.
unneeded_columns = ["ID", "Model"]
df1 = df1.drop(columns=unneeded_columns)

In [None]:
# Check the remaining columns on the first rows.
df1.head()

# Drop duplicate rows

In [None]:
# Count the rows that repeat an earlier row in every column.
df1.duplicated().sum()

In [None]:
# Remove the repeated rows from df1 itself (in place), then report the new size.
df1.drop_duplicates(inplace=True)
print("Shape:", df1.shape)

# count the number of missing values (NaN) in each column of the DataFrame df1

In [None]:
# Missing-value (NaN) count per column.
df1.isnull().sum()

# Show the unique values in Levy

In [None]:
# First 20 distinct Levy values, to see how the column is encoded.
df1['Levy'].unique()[:20]

In [None]:
# A Levy of "-" is a placeholder; store it as 0.
levy_placeholders = {"-": 0}
df1['Levy'] = df1['Levy'].replace(levy_placeholders)

# Show the unique values in Doors

In [None]:
# Distinct Doors values, to see how the column is encoded.
df1['Doors'].unique()

In [None]:
# The ">5" label carries no exact door count, so mark it as missing.
door_placeholders = {">5": np.nan}
df1['Doors'] = df1['Doors'].replace(door_placeholders)

In [None]:
def _parse_door_label(label):
    """Return the integer in front of the first "-" of a string label.

    Float entries are returned as NaN; anything else is passed through as is.
    """
    if isinstance(label, str):
        return int(label.split("-")[0])
    if isinstance(label, float):
        return np.nan
    return label


df1['Doors'] = df1['Doors'].apply(_parse_door_label)
unique_values = df1['Doors'].unique()
print(unique_values)

In [None]:
# Binary flag: 1 when the car has exactly 4 doors, 0 for everything else
# (missing values included).
has_four_doors = df1['Doors'] == 4
df1['Doors'] = has_four_doors.astype("int64")

# Show the unique values in Engine volume

In [None]:
# First 20 distinct Engine volume values, to see how the column is encoded.
df1['Engine volume'].unique()[:20]

In [None]:
# Keep only the text before the first space of each Engine volume entry.
def _first_token(text):
    return text.partition(" ")[0]


df1['Engine volume'] = df1['Engine volume'].apply(_first_token)
df1['Engine volume'].unique()[:20]

# Show the unique values in Mileage

In [None]:
# First 20 distinct Mileage values, to see how the column is encoded.
df1['Mileage'].unique()[:20]

In [None]:
# Remove the "km" unit so Mileage holds only the numeric text.
# regex=False forces a literal replacement regardless of the pandas version
# (the default of `regex` changed in pandas 2.0); .str.strip() then removes any
# whitespace left around the number.
df1['Mileage'] = df1['Mileage'].str.replace("km", "", regex=False).str.strip()
print("Unique Mileage value:", df1['Mileage'][:5])

# Show the unique values in Wheel

In [None]:
# Distinct Wheel values, to see how the column is encoded.
df1['Wheel'].unique()

In [None]:
# Binary flag: 1 for "Left wheel", 0 for any other value.
is_left_wheel = df1['Wheel'].eq("Left wheel")
df1['Wheel'] = is_left_wheel.astype("int64")
df1['Wheel'].unique()

# Show the unique values in Drive Wheels

In [None]:
# Distinct Drive wheels values, to see how the column is encoded.
df1['Drive wheels'].unique()

In [None]:
# Relabel "4x4" as "All".
# regex=False makes this a literal substring replacement on every pandas
# version instead of depending on the version-specific default of `regex`.
df1['Drive wheels'] = df1['Drive wheels'].str.replace("4x4", "All", regex=False)
df1['Drive wheels'].unique()

# Show the unique values in  Leather interior

In [None]:
# Distinct Leather interior values, to see how the column is encoded.
df1['Leather interior'].unique()

In [None]:
# Binary flag: 1 for "Yes", 0 for any other value.
has_leather = df1['Leather interior'].eq("Yes")
df1['Leather interior'] = has_leather.astype("int64")
df1['Leather interior'].unique()

In [None]:
# Inspect the first 10 rows after the column clean-up above.
df1.head(10)

# Convert the data types

In [None]:
# Convert the cleaned columns to numbers, element by element:
# Levy, Mileage and Cylinders become int, Engine volume becomes float.
numeric_casts = {
    'Levy': int,
    'Engine volume': float,
    'Mileage': int,
    'Cylinders': int,
}
for cast_column, caster in numeric_casts.items():
    df1[cast_column] = df1[cast_column].apply(caster)

In [None]:
# Verify the resulting dtype of every column.
df1.dtypes

# Group infrequent Manufacturers into OTHERS

In [None]:
 manufacturer_cars = df1['Manufacturer'].value_counts()

In [None]:
# Manufacturers with fewer than 200 listings are merged into one "OTHERS" label.
# NOTE: the variable name says 100, but the cut-off actually applied is 200.
manufacturer_cars_less_than_100 = manufacturer_cars[manufacturer_cars.lt(200)]
print("Total Manufacturer:", len(manufacturer_cars_less_than_100))
# The manufacturer names are the index of the counts Series.
is_rare_manufacturer = df1['Manufacturer'].isin(manufacturer_cars_less_than_100.index)
df1['Manufacturer'] = df1['Manufacturer'].mask(is_rare_manufacturer, "OTHERS")
df1['Manufacturer'].value_counts()

#  Create a new car age column

In [None]:
# Age = year in which the notebook is executed minus the production year.
current_year = datetime.now().year
print("Current Year:", current_year)

production_year = df1['Prod. year']
df1['Age'] = production_year.rsub(current_year)
df1.head()

In [None]:
# NOTE(review): this removes the 'Age' column right after it was created, so
# the age feature never reaches the later steps while 'Prod. year' stays. If
# the intent was to replace the production year by the age, the column to drop
# here is probably 'Prod. year' — confirm before changing.
df1 = df1.drop('Age', axis="columns")
df1.head()

# View outliers using boxplot

In [None]:
# Draw one box per numerical column of df1 on a single 12x6 figure.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df1, ax=ax)
ax.set_title('Box Plots for Each Column')
plt.show()


In [None]:
# Create box plots for all columns
# Enable cufflinks' offline mode before the first .iplot() call so the chart is
# rendered locally in the notebook rather than through Plotly's online service.
cf.go_offline()
box_plot = df1.iplot(kind='box', title='Box Plots for Each Column', xTitle='Columns', yTitle='Values')
box_plot


In [None]:
# Interactive box plot of the Price column of df1.
price_column = df1['Price']
box_plot = price_column.iplot(
    kind='box',
    title='Box Plots for Price',
    xTitle='Price',
    yTitle='Values',
)
box_plot

# Remove Extreme price value

In [None]:
# Delete from df1 (in place) every row priced above 400000.
price_condition = df1['Price'].gt(400000)
result_indices_price = df1[price_condition].index
df1.drop(index=result_indices_price, inplace=True)

In [None]:
# Remove price outliers
# df2 keeps the rows whose price lies strictly between the 0.15 and 0.974
# quantiles of df1's Price column.
price_bounds = df1['Price'].quantile([0.15, 0.974])
price_min_threshold, price_max_threshold = price_bounds
above_price_floor = df1['Price'].gt(price_min_threshold)
below_price_ceiling = df1['Price'].lt(price_max_threshold)
df2 = df1[above_price_floor & below_price_ceiling]
display(df2.sort_values(by="Price", ascending=True))
print("df1 Rows:", len(df1))
print("Total Rows Removed:", len(df1) - len(df2))

In [None]:
# Box plot of df2's Price column, built as a figure object and then displayed.
box_plot = df2['Price'].iplot(kind='box', title='', yTitle='', asFigure=True)
box_plot.show()

In [None]:
# Box plots for all of df2, built as a figure object and then displayed.
box_plot = df2.iplot(kind='box', title='', yTitle='', asFigure=True)
box_plot.show()

In [None]:
# Box plot of df1's Engine volume column, built as a figure object and then displayed.
box_plot = df1['Engine volume'].iplot(kind='box', title='', yTitle='', asFigure=True)
box_plot.show()

In [None]:
# Remove Extreme Engine Volume Value
# Rows whose engine volume exceeds 10.0 are deleted from df1 in place.
engine_volume_condition = df1['Engine volume'].gt(10.0)
index_number = df1[engine_volume_condition].index
df1.drop(index=index_number, inplace=True)

In [None]:
# Remove Engine volume outliers
# The thresholds are the 0.007 and 0.99 quantiles of df1's Engine volume.
ev_min_threshold, ev_max_threshold = df1['Engine volume'].quantile([0.007, 0.99])
# Filter the existing df2 rather than df1: rebuilding df2 from df1 here would
# silently throw away the outlier rows already removed from df2 earlier.
df2 = df2[(df2['Engine volume'] > ev_min_threshold) & (df2['Engine volume'] < ev_max_threshold)]
display(df2.sort_values(by="Engine volume", ascending=True))
print("df1 Rows:", df1.shape[0])
# Cumulative count: every row of df1 that is no longer present in df2.
print("Total Rows Removed:", df1.shape[0] - df2.shape[0])

In [None]:
# Box plot of df2's Engine volume column, built as a figure object and then displayed.
box_plot = df2['Engine volume'].iplot(kind='box', title='', yTitle='', asFigure=True)
box_plot.show()

In [None]:
# Box plot of df1's Mileage column, built as a figure object and then displayed.
box_plot = df1['Mileage'].iplot(kind='box', title='', yTitle='', asFigure=True)
box_plot.show()

In [None]:
# Remove Extreme Mileage Value
# Rows whose mileage exceeds 1000000 are deleted from df1 in place.
mileage_condition = df1['Mileage'].gt(1000000)
index_number = df1[mileage_condition].index
df1.drop(index=index_number, inplace=True)

In [None]:
# Remove Mileage outliers
# The threshold is the 0.975 quantile of df1's Mileage.
Mileage_max_threshold = df1['Mileage'].quantile(0.975)
# Filter the existing df2 rather than df1: rebuilding df2 from df1 here would
# silently throw away the outlier rows already removed from df2 earlier.
df2 = df2[(df2['Mileage'] < Mileage_max_threshold)]
display(df2.sort_values(by="Mileage", ascending=False).head())
print("df1 Rows:", df1.shape[0])
# Cumulative count: every row of df1 that is no longer present in df2.
print("Total Rows Removed:", df1.shape[0] - df2.shape[0])

In [None]:
# Box plot of df2's Mileage column, built as a figure object and then displayed.
box_plot = df2['Mileage'].iplot(kind='box', title='', yTitle='', asFigure=True)
box_plot.show()

In [None]:
# First rows of the filtered frame df2.
df2.head()


In [None]:
# Listings per manufacturer in df2, largest count first.
manufacturer = df2.groupby('Manufacturer')
counts_per_manufacturer = manufacturer['Manufacturer'].value_counts()
counts_per_manufacturer.sort_values(ascending=False)

In [None]:
# Summary statistics of the filtered frame df2.
df2.describe()


In [None]:
# Pairwise correlations between the non-object columns of df2; the last line
# shows each column's correlation with Price.
non_object_columns = df2.select_dtypes(exclude="object")
correlation = non_object_columns.corr()
correlation['Price']

In [None]:
# Annotated heatmap of the correlation matrix (values shown with two decimals).
heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'fmt': ".2f",
    'linewidths': 0.5,
}
sns.heatmap(correlation, **heatmap_options)
plt.show()

In [None]:
# Remove the Cylinders, Airbags and Levy columns from df2.
dropped_features = ['Cylinders', 'Airbags', 'Levy']
df2 = df2.drop(columns=dropped_features)
df2.head()

# Visualization

In [None]:
# Bar chart: number of df2 rows per manufacturer.
manufacturer_counts = df2['Manufacturer'].value_counts()
count_plot = manufacturer_counts.iplot(
    kind='bar',
    title='Count of Each Manufacturer',
    xTitle='Manufacturer',
    yTitle='Count',
)
count_plot


In [None]:
# Bar chart: number of df2 rows per category.
category_counts = df2['Category'].value_counts()
count_plot = category_counts.iplot(
    kind='bar',
    title='Count of Each Category',
    xTitle='Category',
    yTitle='Count',
)
count_plot


In [None]:
# One bar chart of value counts for each of the listed columns of df2.
columns = ['Color', 'Drive wheels', 'Gear box type', 'Fuel type', 'Leather interior']
for column in columns:
    column_counts = df2[column].value_counts()
    count_plot = column_counts.iplot(
        kind='bar',
        title=f'Count of {column}',
        xTitle=column,
        yTitle='Count',
    )


In [None]:
# Pie chart of each manufacturer's share of the rows in df2.
manufacturer_counts = df2['Manufacturer'].value_counts()
plt.figure(figsize=(12, 12))
sns.set(style="dark")

pie_axes = manufacturer_counts.plot.pie(autopct='%1.1f%%', wedgeprops={'edgecolor': 'black'})
pie_axes.set_title('Market Share by Manufacturer.')
pie_axes.set_ylabel('')  # suppress the default y-axis label
plt.show()

In [None]:
# One marker-only scatter plot per int64/float64 column of df2.
for column in df2.columns:
    if df2[column].dtype not in ['int64', 'float64']:
        continue
    scatter_plot = df2[column].iplot(
        kind='scatter',
        mode='markers',
        title=f'{column} Scatter Plot',
        xTitle='Index',
        yTitle=column,
        asFigure=True,
    )
    scatter_plot.show()


In [None]:
# One 30-bin histogram per int64/float64 column of df2.
for column in df2.columns:
    if df2[column].dtype not in ['int64', 'float64']:
        continue
    histogram = df2[column].iplot(
        kind='hist',
        bins=30,
        title=f'{column} Histogram',
        xTitle=column,
        yTitle='Frequency',
        asFigure=True,
    )
    histogram.show()
