****
# **`Exploratory Data Analysis`**

In [None]:
import pandas as pd
# Load the CSV into the working DataFrame used by the rest of the notebook.
# NOTE(review): path is relative to the notebook's working directory.
dataset = pd.read_csv('Cleaned_data.csv')

In [None]:
# Per-column count of missing values.
dataset.isna().sum()

In [None]:
# Summary statistics of the dataset (pandas default `describe()` output).
dataset.describe()

****
**`Column : Variant Name, Model & Color`**
- These columns have too many unique values (1900+) to encode usefully, so I am going to drop them.

In [None]:
# Drop the identifier-like columns by rebinding to a new frame instead of
# mutating in place; errors='ignore' makes re-running this cell a no-op
# rather than a KeyError once the columns are already gone.
dataset = dataset.drop(columns=['Variant Name', 'Color', 'Model'], errors='ignore')

In [None]:
# Partition the column names by dtype: object-dtype columns are treated as
# categorical, everything else as numerical.
is_object_dtype = dataset.dtypes == "object"
categorical_columns = dataset.columns[is_object_dtype].tolist()
numerical_columns = dataset.columns[~is_object_dtype].tolist()

print("categorical columns:\n", categorical_columns, "\n\n Numerical columns:\n", numerical_columns)

#  **`Initializing the list that marks the columns for one-hot encoding`**.

In [None]:
# Names of the columns that later sections mark for one-hot encoding.
columns_to_onehot = list()

****
**`Column : Build Type`**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Count plot of 'Build Type' with the count written above every bar.
sns.set_style("whitegrid")
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=dataset, x='Build Type', palette='Set2')

# Label each bar at its horizontal centre, nudged 5 points above the top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f"{bar_top:.0f}", (bar_centre, bar_top),
                xytext=(0, 5), textcoords='offset points',
                ha='center', va='center')

ax.set(title='Distribution of Build Type', xlabel='Build Type', ylabel='Count')
plt.xticks(rotation=45)  # slanted labels keep long category names legible
plt.show()

In [None]:
# Build types removed from the dataset before re-plotting the distribution.
buildtype_to_drop = ['Coupe', 'Wagon', 'Hybrids', 'Convertibles', 'Pickup Trucks', 'Minivans']

# Keep only the rows whose 'Build Type' is NOT in the drop list.
keep_row = ~dataset['Build Type'].isin(buildtype_to_drop)
dataset = dataset[keep_row]

# Count plot of the remaining build types, each bar labelled with its count.
sns.set_style("whitegrid")
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=dataset, x='Build Type', palette='Set2')

for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f"{bar_top:.0f}", (bar_centre, bar_top),
                xytext=(0, 5), textcoords='offset points',
                ha='center', va='center')

ax.set(title='Distribution of Build Type', xlabel='Build Type', ylabel='Count')
plt.xticks(rotation=45)  # slanted labels keep long category names legible
plt.show()

In [None]:
# Average 'price' within each 'Build Type' category.
mean_price_by_build_type = dataset['price'].groupby(dataset['Build Type']).mean()

print(mean_price_by_build_type)

- I considered encoding Build Type based on the average price of the car.
- However, the average prices of some build types are close to each other.
- So I'm going to mark Build Type for one-hot encoding instead.

In [None]:
# Mark 'Build Type' for one-hot encoding. The membership guard keeps the
# list free of duplicates when this cell is executed more than once.
if 'Build Type' not in columns_to_onehot:
    columns_to_onehot.append('Build Type')

****
**`Column : OEM`**

In [None]:
# Count plot of 'OEM' (wide figure: many categories) with per-bar counts.
sns.set_style("whitegrid")
plt.figure(figsize=(20, 6))
ax = sns.countplot(data=dataset, x='OEM', palette='Set2')

# Label each bar at its horizontal centre, nudged 5 points above the top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f"{bar_top:.0f}", (bar_centre, bar_top),
                xytext=(0, 5), textcoords='offset points',
                ha='center', va='center')

ax.set(title='Distribution of OEM', xlabel='OEM', ylabel='Count')
plt.xticks(rotation=45)  # slanted labels keep long category names legible
plt.show()

- I am going to keep only the OEMs with a count of more than 50, to reduce the dimensionality.

In [None]:
# An OEM is kept only if it has MORE than this many listings.
MIN_OEM_COUNT = 50

# Calculate the count of each OEM
oem_counts = dataset['OEM'].value_counts()

# Keep the OEMs whose count exceeds MIN_OEM_COUNT
oems_to_consider = oem_counts[oem_counts > MIN_OEM_COUNT].index.tolist()

# Filter the DataFrame to include only the selected OEMs
OEM_filtered_dataset = dataset[dataset['OEM'].isin(oems_to_consider)]

# Plot count plot for 'OEM' column (filtered data)
plt.figure(figsize=(20, 6))
ax = sns.countplot(data=OEM_filtered_dataset, x='OEM', palette='Set2')

# Annotate each bar with its count, 5 points above the bar's top centre
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.0f'),
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha = 'center', va = 'center',
                xytext = (0, 5),
                textcoords = 'offset points')

plt.title('Distribution of OEM')
plt.xlabel('OEM')
plt.ylabel('Count')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Mark 'OEM' for one-hot encoding. The membership guard keeps the list free
# of duplicates when this cell is executed more than once.
if 'OEM' not in columns_to_onehot:
    columns_to_onehot.append('OEM')

****
**`Column : Insurance Validity`**

In [None]:
# Set Seaborn style
sns.set_style("whitegrid")

# Plot count plot for 'Insurance Validity' column (OEM-filtered data)
plt.figure(figsize=(20, 6))
ax = sns.countplot(data=OEM_filtered_dataset, x='Insurance Validity', palette='Set2')

# Annotate each bar with its count, 5 points above the bar's top centre
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.0f'),
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha = 'center', va = 'center',
                xytext = (0, 5),
                textcoords = 'offset points')

plt.title('Distribution of Insurance Validity')
plt.xlabel('Insurance Validity')
plt.ylabel('Count')
# The original argument-less plt.xticks() call changed nothing (it only
# reads the current ticks), so it is dropped; labels stay horizontal.
plt.show()

In [None]:
# Mark 'Insurance Validity' for one-hot encoding. The membership guard keeps
# the list free of duplicates when this cell is executed more than once.
if 'Insurance Validity' not in columns_to_onehot:
    columns_to_onehot.append('Insurance Validity')

****
**`Column : Fuel Type`**

In [None]:
# Set Seaborn style
sns.set_style("whitegrid")

# Plot count plot for 'Fuel Type' column (OEM-filtered data)
plt.figure(figsize=(20, 6))
ax = sns.countplot(data=OEM_filtered_dataset, x='Fuel Type', palette='Set2')

# Annotate each bar with its count, 5 points above the bar's top centre
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.0f'),
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha = 'center', va = 'center',
                xytext = (0, 5),
                textcoords = 'offset points')

plt.title('Distribution of Fuel Type')
plt.xlabel('Fuel Type')
plt.ylabel('Count')
# The original argument-less plt.xticks() call changed nothing (it only
# reads the current ticks), so it is dropped; labels stay horizontal.
plt.show()

In [None]:
# Keep only Petrol and Diesel cars. `.copy()` gives an independent frame so
# later column assignments do not raise SettingWithCopyWarning.
Fuel_Type_filtered_dataset = OEM_filtered_dataset[OEM_filtered_dataset['Fuel Type'].isin(['Petrol', 'Diesel'])].copy()

In [None]:
sns.set_style("whitegrid")

# Plot count plot for 'Fuel Type' column after the Petrol/Diesel filter
plt.figure(figsize=(20, 6))
ax = sns.countplot(data=Fuel_Type_filtered_dataset, x='Fuel Type', palette='Set2')

# Annotate each bar with its count, 5 points above the bar's top centre
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.0f'),
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha = 'center', va = 'center',
                xytext = (0, 5),
                textcoords = 'offset points')

plt.title('Distribution of Fuel Type')
plt.xlabel('Fuel Type')
plt.ylabel('Count')
# The original argument-less plt.xticks() call changed nothing (it only
# reads the current ticks), so it is dropped; labels stay horizontal.
plt.show()

In [None]:
# Binary encode the 'Fuel Type' column (Petrol -> 0, Diesel -> 1).
# `assign` returns a new frame, so nothing is written into a slice of the
# parent DataFrame (avoids SettingWithCopyWarning).
# NOTE: re-running this cell on already-encoded data maps 0/1 to NaN.
fuel_type_codes = Fuel_Type_filtered_dataset['Fuel Type'].map({'Petrol': 0, 'Diesel': 1})
Fuel_Type_filtered_dataset = Fuel_Type_filtered_dataset.assign(**{'Fuel Type': fuel_type_codes})

****
**`Column : Transmission`**

In [None]:
sns.set_style("whitegrid")

# Plot count plot for 'Transmission' column
plt.figure(figsize=(20, 6))
ax = sns.countplot(data=Fuel_Type_filtered_dataset, x='Transmission', palette='Set2')

# Annotate each bar with its count, 5 points above the bar's top centre
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.0f'),
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha = 'center', va = 'center',
                xytext = (0, 5),
                textcoords = 'offset points')

plt.title('Distribution of Transmission')
plt.xlabel('Transmission')
plt.ylabel('Count')
# The original argument-less plt.xticks() call changed nothing (it only
# reads the current ticks), so it is dropped; labels stay horizontal.
plt.show()

In [None]:
# Binary encode the 'Transmission' column (Manual -> 0, Automatic -> 1).
# `assign` returns a new frame, so nothing is written into a slice of the
# parent DataFrame (avoids SettingWithCopyWarning).
# NOTE: re-running this cell on already-encoded data maps 0/1 to NaN.
transmission_codes = Fuel_Type_filtered_dataset['Transmission'].map({'Manual': 0, 'Automatic': 1})
Fuel_Type_filtered_dataset = Fuel_Type_filtered_dataset.assign(Transmission=transmission_codes)

****
# **`Numerical Columns`**

- **`Kilo Meter`**

In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'Kilo Meter' before outlier removal.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(Fuel_Type_filtered_dataset['Kilo Meter'], color='skyblue', fill=True)
plt.title('Distribution of Kilo Meter')
plt.xlabel('Kilo Meter')
plt.ylabel('Density')
plt.grid(True)
plt.show()

In [None]:
# Outlier filter on 'Kilo Meter' using the 1.5 x IQR rule: rows are kept
# when the value lies in [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds inclusive.
km_values = Fuel_Type_filtered_dataset['Kilo Meter']

Q1, Q3 = km_values.quantile(0.25), km_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

Km_filtered_dataset = Fuel_Type_filtered_dataset[km_values.between(lower_bound, upper_bound)]


In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'Kilo Meter' after the IQR outlier filter.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(Km_filtered_dataset['Kilo Meter'], color='skyblue', fill=True)
plt.title('Distribution of Kilo Meter')
plt.xlabel('Kilo Meter')
plt.ylabel('Density')
plt.grid(True)
plt.show()


In [None]:
# Keep only rows with a strictly positive 'Kilo Meter'. The Box-Cox
# transform applied next requires positive input, so `> 0` (rather than
# `!= 0`) also excludes any negative or missing readings. `.copy()` makes the
# result an independent frame that is safe to modify.
Km_filtered_dataset = Km_filtered_dataset[Km_filtered_dataset['Kilo Meter'] > 0].copy()

- **Transforming to Normal Distribution.**

In [None]:
import numpy as np
from scipy import stats

# Apply Box-Cox transformation; with no lambda supplied, scipy also returns
# the fitted lambda as the second value. Input must be strictly positive.
transformed_data, Kilo_Meter_lamda = stats.boxcox(Km_filtered_dataset['Kilo Meter'])

# Replace the column via `assign` (new frame) instead of writing into what
# may be a slice of an earlier DataFrame (avoids SettingWithCopyWarning).
Km_filtered_dataset = Km_filtered_dataset.assign(**{'Kilo Meter': transformed_data})

# Open the file in APPEND mode so other saved parameters are preserved.
# NOTE: re-running this cell appends another 'Kilo_Meter_lamda' line.
with open('parameters.txt', 'a') as file:
    # Write the variable name and its corresponding lambda value
    file.write("Kilo_Meter_lamda = {}\n".format(Kilo_Meter_lamda))

In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'Kilo Meter' after the Box-Cox transformation.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(Km_filtered_dataset['Kilo Meter'], color='skyblue', fill=True)
plt.title('Distribution of Kilo Meter')
plt.xlabel('Kilo Meter')
plt.ylabel('Density')
plt.grid(True)
plt.show()

- The Box-Cox transformation is a widely used method to stabilize the variance and make the data more closely approximate a normal distribution. It is defined as:

$$
X^{(\lambda)} =
\begin{cases}
\dfrac{X^{\lambda} - 1}{\lambda}, & \lambda \neq 0 \\
\log X, & \lambda = 0
\end{cases}
$$

****
**`ownerNo`**

In [None]:
# How many cars fall under each 'ownerNo' value.
owner_counts = Km_filtered_dataset['ownerNo'].value_counts()

# Bar chart of those frequencies.
plt.figure(figsize=(8, 6))
owner_counts.plot.bar(color='skyblue')
plt.xticks(rotation=0)  # keep the tick labels horizontal
plt.grid(axis='y')
plt.title('Frequency Distribution of Ownership Types')
plt.xlabel('Owner Type')
plt.ylabel('Frequency')
plt.show()

****
**`modelYear`**

In [None]:
# Bar plot of how many cars exist per model year (before outlier removal).
model_year_counts = Km_filtered_dataset['modelYear'].value_counts()

plt.figure(figsize=(10, 6))
sns.barplot(x=model_year_counts.index, y=model_year_counts.values, color='skyblue')
plt.title('Distribution of Model Year')
plt.xlabel('Model Year')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # slanted labels for readability
plt.tight_layout()
plt.show()


In [None]:
# Outlier filter on 'modelYear' using the 1.5 x IQR rule: rows are kept
# when the value lies in [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds inclusive.
model_years = Km_filtered_dataset['modelYear']

Q1, Q3 = model_years.quantile(0.25), model_years.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

Km_filtered_dataset = Km_filtered_dataset[model_years.between(lower_bound, upper_bound)]


In [None]:
# Bar plot of how many cars exist per model year (after outlier removal).
model_year_counts = Km_filtered_dataset['modelYear'].value_counts()

plt.figure(figsize=(10, 6))
sns.barplot(x=model_year_counts.index, y=model_year_counts.values, color='skyblue')
plt.title('Distribution of Model Year')
plt.xlabel('Model Year')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # slanted labels for readability
plt.tight_layout()
plt.show()


****
**`Column : CentralVariantID`**

In [None]:
# Number of distinct 'centralVariantId' values (missing counted as a value).
Km_filtered_dataset['centralVariantId'].nunique(dropna=False)

In [None]:
# Frequency of each 'centralVariantId' value.
Km_filtered_dataset['centralVariantId'].value_counts()

- The variant ID may be different for the same model, so I am going to drop this column.

In [None]:
# Drop 'centralVariantId' by rebinding to a new frame: an in-place drop on a
# filtered slice can raise SettingWithCopyWarning, and errors='ignore' keeps
# a re-run of this cell from failing once the column is gone.
Km_filtered_dataset = Km_filtered_dataset.drop(columns=['centralVariantId'], errors='ignore')

****
**`Column : price`**
- This column is my target feature. I am going to find its correlation with the other features.

In [None]:
# Frequency of each distinct 'price' value.
Km_filtered_dataset['price'].value_counts()

In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'price' before outlier removal.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(Km_filtered_dataset['price'], color='skyblue', fill=True)
plt.title('Distribution of price')
plt.xlabel('price')
plt.ylabel('Density')
plt.grid(True)
plt.show()

In [None]:
# Summary statistics of 'price' via pandas `describe()`; the bare name on
# the last line makes the notebook display the result.
price_summary = Km_filtered_dataset['price'].describe()
price_summary

In [None]:
# Pairwise correlation between all numeric columns.
numeric_columns = Km_filtered_dataset.select_dtypes(include='number')
correlation_matrix = numeric_columns.corr()

# Annotated heatmap of the correlation matrix (two decimals per cell).
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    linewidths=0.5,
)
plt.title('Correlation Matrix')
plt.show()

- **The correlation between No Of Doors and price is 0.00.**
- So I am going to drop that feature.

In [None]:
# Drop 'No Of Doors' by rebinding to a new frame: an in-place drop on a
# filtered slice can raise SettingWithCopyWarning, and errors='ignore' keeps
# a re-run of this cell from failing once the column is gone.
Km_filtered_dataset = Km_filtered_dataset.drop(columns=['No Of Doors'], errors='ignore')

- **Removing Outliers**

In [None]:
# Outlier filter on 'price' using the 1.5 x IQR rule: rows are kept when
# the value lies in [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds inclusive.
prices = Km_filtered_dataset['price']

Q1, Q3 = prices.quantile(0.25), prices.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

Km_filtered_dataset = Km_filtered_dataset[prices.between(lower_bound, upper_bound)]


In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'price' after the IQR outlier filter.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(Km_filtered_dataset['price'], color='blue', fill=True)
plt.title('Distribution of price')
plt.xlabel('price')
plt.ylabel('Density')
plt.grid(True)
plt.show()

In [None]:
# Apply Box-Cox transformation; with no lambda supplied, scipy also returns
# the fitted lambda as the second value. Input must be strictly positive.
transformed_data, price_lamda = stats.boxcox(Km_filtered_dataset['price'])

# Replace the column via `assign` (new frame) instead of writing into what
# may be a slice of an earlier DataFrame (avoids SettingWithCopyWarning).
Km_filtered_dataset = Km_filtered_dataset.assign(price=transformed_data)

# Open the file in APPEND mode so other saved parameters are preserved.
# NOTE: re-running this cell appends another 'price_lamda' line.
with open('parameters.txt', 'a') as file:
    # Write the variable name and its corresponding lambda value
    file.write("price_lamda = {}\n".format(price_lamda))

In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'price' after the Box-Cox transformation.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(Km_filtered_dataset['price'], color='red', fill=True)
plt.title('Distribution of price')
plt.xlabel('price')
plt.ylabel('Density')
plt.grid(True)
plt.show()

****
**`Column : Registration Year`**

In [None]:
# Bar plot of how many cars exist per registration year.
registration_year_counts = Km_filtered_dataset['Registration Year'].value_counts()

plt.figure(figsize=(10, 6))
sns.barplot(x=registration_year_counts.index,
            y=registration_year_counts.values,
            color='skyblue')
plt.title('Distribution of Registration Year')
plt.xlabel('Registration Year')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # slanted labels for readability
plt.tight_layout()
plt.show()


****
**`Column : Seats`**

In [None]:
# Frequency of each 'Seats' value.
Km_filtered_dataset['Seats'].value_counts()

- **There is only one car with 10 seats.** I am going to remove it.

In [None]:
# Remove every row whose 'Seats' value equals 10.
not_ten_seater = Km_filtered_dataset['Seats'].ne(10)
Km_filtered_dataset = Km_filtered_dataset[not_ten_seater]

In [None]:
# Seat-count frequencies, ordered by seat number so bars and labels line up.
seat_value_counts = Km_filtered_dataset['Seats'].value_counts().sort_index()
max_freq = seat_value_counts.max()

plt.figure(figsize=(15, 6))
ax = sns.countplot(x=Km_filtered_dataset['Seats'], color='red', order=seat_value_counts.index)
ax.set(title='Distribution of Seats', xlabel='Seats', ylabel='Frequency')
plt.xticks(rotation=45)  # slanted labels for readability

# Print each frequency slightly (1% of the tallest bar) above its bar.
label_gap = max_freq * 0.01
for position, frequency in enumerate(seat_value_counts.values):
    ax.text(position, frequency + label_gap, str(frequency), ha='center', va='bottom')

plt.tight_layout()
plt.show()


****
**`Column : Engine Displacement`**

In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'Engine Displacement' before outlier removal.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(Km_filtered_dataset['Engine Displacement'], color='skyblue', fill=True)
plt.title('Distribution of Engine Displacement')
plt.xlabel('Engine Displacement')
plt.ylabel('Density')
plt.grid(True)
plt.show()

- **Removing Outliers**

In [None]:
# Outlier filter on 'Engine Displacement' using the 1.5 x IQR rule: rows
# are kept when the value lies in [Q1 - 1.5*IQR, Q3 + 1.5*IQR], inclusive.
displacement = Km_filtered_dataset['Engine Displacement']

Q1, Q3 = displacement.quantile(0.25), displacement.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

Km_filtered_dataset = Km_filtered_dataset[displacement.between(lower_bound, upper_bound)]


In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'Engine Displacement' after the IQR outlier filter.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(Km_filtered_dataset['Engine Displacement'], color='red', fill=True)
plt.title('Distribution of Engine Displacement')
plt.xlabel('Engine Displacement')
plt.ylabel('Density')
plt.grid(True)
plt.show()

****
**`Column : Year of manufacture`**

In [None]:
# Frequencies of 'Year of Manufacture', ordered by year so bars and labels
# line up. (Named for what it holds; the old name was copied from the Seats
# cell.)
manufacture_year_counts = Km_filtered_dataset['Year of Manufacture'].value_counts().sort_index()

# Calculate the maximum frequency value (used to space the labels)
max_freq = manufacture_year_counts.max()

plt.figure(figsize=(15, 6))
ax = sns.countplot(x=Km_filtered_dataset['Year of Manufacture'], color='Green', order=manufacture_year_counts.index)
plt.title('Distribution of Year of Manufacture')
plt.xlabel('Year of Manufacture')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability

# Add frequency values on top of the bars (offset: 1% of the tallest bar)
for i, v in enumerate(manufacture_year_counts.values):
    ax.text(i, v + max_freq * 0.01, str(v), ha='center', va='bottom')

plt.tight_layout()
plt.show()


In [None]:
# Two views of 'price' against 'Year of Manufacture', each on its own
# figure: a line plot first, then a box plot per year.
price_by_year_views = (
    (sns.lineplot, 'Line Plot of price vs Year of Manufacture'),
    (sns.boxplot, 'Box Plot of price by Year of Manufacture'),
)
for draw_plot, plot_title in price_by_year_views:
    plt.figure(figsize=(10, 6))
    draw_plot(data=Km_filtered_dataset, x='Year of Manufacture', y='price')
    plt.title(plot_title)
    plt.xlabel('Year of Manufacture')
    plt.ylabel('price')
    plt.show()

In [None]:
# Group the data by 'Year of Manufacture'
grouped_year = Km_filtered_dataset.groupby('Year of Manufacture')

# Function to remove outliers from each group
def remove_outliers(group):
    """Return the rows of `group` whose 'price' passes the 1.5 x IQR rule.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows sharing one 'Year of Manufacture'; must contain a 'price' column.

    Returns
    -------
    pandas.DataFrame
        Rows with 'price' inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], inclusive.
    """
    Q1 = group['price'].quantile(0.25)
    Q3 = group['price'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return group[(group['price'] >= lower_bound) & (group['price'] <= upper_bound)]

# Filter each year's sub-frame and concatenate the survivors. Iterating the
# groups (instead of `groupby.apply`) always hands the full sub-frame,
# including the 'Year of Manufacture' column, to the function; `apply`
# operating on the grouping column is deprecated in recent pandas, which
# would otherwise lose that column. ignore_index=True yields a fresh 0..n-1
# index.
kept_groups = [remove_outliers(year_rows) for _, year_rows in grouped_year]
if kept_groups:
    cleaned_data = pd.concat(kept_groups, ignore_index=True)
else:
    # No groups at all (empty input): keep the schema, return zero rows.
    cleaned_data = Km_filtered_dataset.iloc[0:0].copy()


In [None]:

# Box plot of 'price' per 'Year of Manufacture' on the per-year cleaned data.
plt.figure(figsize=(10, 6))
sns.boxplot(x='Year of Manufacture', y='price', data=cleaned_data)
plt.xlabel('Year of Manufacture')
plt.ylabel('price')
plt.title('Box Plot of price by Year of Manufacture')
plt.show()

****
**`Column : Mileage`**

In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'Mileage' before outlier removal.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(cleaned_data['Mileage'], color='skyblue', fill=True)
plt.title('Distribution of Mileage')
plt.xlabel('Mileage')
plt.ylabel('Density')
plt.grid(True)
plt.show()

In [None]:
# Outlier filter on 'Mileage' using the 1.5 x IQR rule: rows are kept when
# the value lies in [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds inclusive.
mileage = cleaned_data['Mileage']

Q1, Q3 = mileage.quantile(0.25), mileage.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

cleaned_data = cleaned_data[mileage.between(lower_bound, upper_bound)]


In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'Mileage' after the IQR outlier filter.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(cleaned_data['Mileage'], color='red', fill=True)
plt.title('Distribution of Mileage')
plt.xlabel('Mileage')
plt.ylabel('Density')
plt.grid(True)
plt.show()

****
**`Column : Displacement`**
- Value of Column Displacement is already covered by Engine Displacement.

In [None]:
# Drop 'Displacement' by rebinding to a new frame: an in-place drop on a
# filtered slice can raise SettingWithCopyWarning, and errors='ignore' keeps
# a re-run of this cell from failing once the column is gone.
cleaned_data = cleaned_data.drop(columns=['Displacement'], errors='ignore')

****
**`Column : Engine`**

In [None]:
# Frequencies of 'Engine', ordered by value so bars and labels line up.
# (Named for what it holds; the old name was copied from the Seats cell.)
engine_value_counts = cleaned_data['Engine'].value_counts().sort_index()

# Calculate the maximum frequency value (used to space the labels)
max_freq = engine_value_counts.max()

plt.figure(figsize=(15, 6))
ax = sns.countplot(x=cleaned_data['Engine'], color='Green', order=engine_value_counts.index)
plt.title('Distribution of Engine')
plt.xlabel('Engine')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability

# Add frequency values on top of the bars (offset: 1% of the tallest bar)
for i, v in enumerate(engine_value_counts.values):
    ax.text(i, v + max_freq * 0.01, str(v), ha='center', va='bottom')

plt.tight_layout()
plt.show()


In [None]:
# Outlier filter on 'Engine' using the 1.5 x IQR rule: rows are kept when
# the value lies in [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds inclusive.
engine_values = cleaned_data['Engine']

Q1, Q3 = engine_values.quantile(0.25), engine_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

cleaned_data = cleaned_data[engine_values.between(lower_bound, upper_bound)]


In [None]:
# Frequencies of 'Engine' after the IQR filter, ordered by value so bars and
# labels line up. (Named for what it holds; the old name was copied from the
# Seats cell.)
engine_value_counts = cleaned_data['Engine'].value_counts().sort_index()

# Calculate the maximum frequency value (used to space the labels)
max_freq = engine_value_counts.max()

plt.figure(figsize=(15, 6))
ax = sns.countplot(x=cleaned_data['Engine'], color='Green', order=engine_value_counts.index)
plt.title('Distribution of Engine')
plt.xlabel('Engine')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability

# Add frequency values on top of the bars (offset: 1% of the tallest bar)
for i, v in enumerate(engine_value_counts.values):
    ax.text(i, v + max_freq * 0.01, str(v), ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# How often each 'Engine' value occurs.
engine_counts = cleaned_data['Engine'].value_counts()

# Keep only rows whose 'Engine' value occurs at least 5 times.
frequent_engines = engine_counts.index[engine_counts >= 5]
has_frequent_engine = cleaned_data['Engine'].isin(frequent_engines)
Engine_filtered_data = cleaned_data[has_frequent_engine]


In [None]:
# Frequencies of 'Engine' after rare values were removed, ordered by value so
# bars and labels line up. (Named for what it holds; the old name was copied
# from the Seats cell.)
engine_value_counts = Engine_filtered_data['Engine'].value_counts().sort_index()

# Calculate the maximum frequency value (used to space the labels)
max_freq = engine_value_counts.max()

plt.figure(figsize=(15, 6))
ax = sns.countplot(x=Engine_filtered_data['Engine'], color='orange', order=engine_value_counts.index)
plt.title('Distribution of Engine')
plt.xlabel('Engine')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability

# Add frequency values on top of the bars (offset: 1% of the tallest bar)
for i, v in enumerate(engine_value_counts.values):
    ax.text(i, v + max_freq * 0.01, str(v), ha='center', va='bottom')

plt.tight_layout()
plt.show()

****
**`Column : No of Cylinder`**

In [None]:
# Frequency of each 'No of Cylinder' value.
Engine_filtered_data['No of Cylinder'].value_counts()

In [None]:
# Frequencies of 'No of Cylinder', ordered by value so bars and labels line
# up. (Named for what it holds; the old name was copied from the Seats cell.)
cylinder_value_counts = Engine_filtered_data['No of Cylinder'].value_counts().sort_index()

# Calculate the maximum frequency value (used to space the labels)
max_freq = cylinder_value_counts.max()

plt.figure(figsize=(15, 6))
ax = sns.countplot(x=Engine_filtered_data['No of Cylinder'], color='red', order=cylinder_value_counts.index)
# Title fixed: it previously read 'Distribution of Engine'.
plt.title('Distribution of No of Cylinder')
plt.xlabel('No of Cylinder')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability

# Add frequency values on top of the bars (offset: 1% of the tallest bar)
for i, v in enumerate(cylinder_value_counts.values):
    ax.text(i, v + max_freq * 0.01, str(v), ha='center', va='bottom')

plt.tight_layout()
plt.show()

****
**`Column : Valves per cylinder`**

In [None]:
# Fix the column-name typo ('Values' -> 'Valves'). Rebinding to the renamed
# frame avoids an in-place mutation of a filtered slice, which can raise
# SettingWithCopyWarning. Re-running is harmless: rename ignores missing keys.
Engine_filtered_data = Engine_filtered_data.rename(columns={'Values per Cylinder': 'Valves per Cylinder'})

In [None]:
# Frequencies of 'Valves per Cylinder', ordered by value so bars and labels
# line up. (Named for what it holds; the old name was copied from the Seats
# cell.)
valves_value_counts = Engine_filtered_data['Valves per Cylinder'].value_counts().sort_index()

# Calculate the maximum frequency value (used to space the labels)
max_freq = valves_value_counts.max()

plt.figure(figsize=(15, 6))
ax = sns.countplot(x=Engine_filtered_data['Valves per Cylinder'], color='Green', order=valves_value_counts.index)
# Title fixed: it previously read 'Distribution of Engine'.
plt.title('Distribution of Valves per Cylinder')
plt.xlabel('Valves per Cylinder')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability

# Add frequency values on top of the bars (offset: 1% of the tallest bar)
for i, v in enumerate(valves_value_counts.values):
    ax.text(i, v + max_freq * 0.01, str(v), ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'Max Power(bhp)'.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(Engine_filtered_data['Max Power(bhp)'], color='skyblue', fill=True)
plt.title('Distribution of Max Power(bhp)')
plt.xlabel('Max Power(bhp)')
plt.ylabel('Density')
plt.grid(True)
plt.show()

In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'Max Torque(Nm)' before outlier removal.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(Engine_filtered_data['Max Torque(Nm)'], color='skyblue', fill=True)
plt.title('Distribution of Max Torque(Nm)')
plt.xlabel('Max Torque(Nm)')
plt.ylabel('Density')
plt.grid(True)
plt.show()

- **Outlier Removal**

In [None]:
# Outlier filter on 'Max Torque(Nm)' using the 1.5 x IQR rule: rows are
# kept when the value lies in [Q1 - 1.5*IQR, Q3 + 1.5*IQR], inclusive.
torque = Engine_filtered_data['Max Torque(Nm)']

Q1, Q3 = torque.quantile(0.25), torque.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

Torque_filtered_data = Engine_filtered_data[torque.between(lower_bound, upper_bound)]


In [None]:
# Set the style of the plot
sns.set_style("whitegrid")

# KDE plot of 'Max Torque(Nm)' after the IQR outlier filter.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(8, 6))
sns.kdeplot(Torque_filtered_data['Max Torque(Nm)'], color='red', fill=True)
plt.title('Distribution of Max Torque(Nm)')
plt.xlabel('Max Torque(Nm)')
plt.ylabel('Density')
plt.grid(True)
plt.show()

In [None]:
# Write the final filtered frame to 'data_for_model.csv' without the index
# column. NOTE(review): presumably consumed by a separate modelling
# notebook — confirm.
Torque_filtered_data.to_csv('data_for_model.csv',index = False)

In [None]:
# Show the column names of the exported frame.
Torque_filtered_data.columns