In [None]:
import pandas as pd
# Load the car price data set and strip stray whitespace from the column
# headers in the same step, so later cells can use the exact column names.
df = pd.read_csv('car_price_dataset.csv').rename(columns=str.strip)

# First look at the data set.
df.head()

In [None]:
# Column dtypes and non-null counts. `info` is a method and has to be called:
# a bare `df.info` that is not the last line of the cell produces no output.
df.info()

# Number of missing values per column, to see whether any nulls need handling.
df.isnull().sum()

In [None]:
# How many distinct brands there are and how many rows each one has.
brand_counts = df["Brand"].value_counts()

# Turn the counts into a two-column table: brand name and number of rows.
brand_counts_df = brand_counts.rename_axis("Brand").reset_index(name="Amount")
print(brand_counts_df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Histogram of Price with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df["Price"], kde=True, color="yellow", ax=ax)
ax.set_title("Car price distribution")
ax.set_xlabel("Price")
ax.set_ylabel("Frequency")
plt.show()

# Boxplot of the same column, drawn on a fresh default-sized figure.
fig, ax = plt.subplots()
sns.boxplot(x=df["Price"], color="red", ax=ax)
ax.set_title("Distribuição do preço com outliers")
plt.show()

In [None]:
from datetime import datetime

# Add an "Age" column: the current calendar year minus the car's model year.
# NOTE(review): the result depends on the year in which the notebook is run.
current_year = datetime.now().year
df['Age'] = df['Year'].rsub(current_year)

df.head()

In [None]:
# One box per brand, to compare how prices are spread within each brand.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x="Brand", y="Price", data=df, ax=ax)

# Vertical tick labels so the category names do not overlap.
ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("Brand")
ax.set_ylabel("Price")
ax.set_title("Price Distribution by Brand")

# Render explicitly, like the other plotting cells in this notebook, so the
# cell output is the figure rather than the `Text(...)` repr of the last call.
plt.show()

**Even with some values outside the typical range, it is clear that most prices remain within the same range across brands. Let's investigate what drives the differences.**

In [None]:
import seaborn as sns

# One box per age value, to see how the price distribution shifts with age.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x="Age", y="Price", data=df, ax=ax)

# Vertical tick labels so the category names do not overlap.
ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("Age")
ax.set_ylabel("Price")
ax.set_title("Price distribution by age")

# Render explicitly, like the other plotting cells in this notebook, so the
# cell output is the figure rather than the `Text(...)` repr of the last call.
plt.show()

**Even with a few exceptions, it is clear that the older a car is, the cheaper it gets.**

In [None]:
# Bucket edges for Mileage. The last edge is infinite so that the "180k+"
# bucket really is open-ended; with a finite top edge, any larger mileage
# would fall outside every bin and become NaN.
bins = [0, 30000, 60000, 90000, 120000, 150000, 180000, float("inf")]
labels = ["0-30k", "30k-60k", "60k-90k", "90k-120k", "120k-150k", "150k-180k", "180k+"]

# pd.cut builds right-closed intervals, so without include_lowest=True a
# mileage of exactly 0 would not belong to the first bucket and would be NaN.
df["Miles range"] = pd.cut(df["Mileage"], bins=bins, labels=labels, include_lowest=True)

# Mean price per bucket. `observed=False` is passed explicitly so every bucket
# appears in the result (empty ones as NaN) regardless of the pandas default
# for categorical group keys.
df_grouped = df.groupby("Miles range", observed=False)["Price"].mean().reset_index()

plt.figure(figsize=(10,5))
# One distinct colour per bucket (7 buckets -> 7 colours), so no colour is
# reused for two different bars.
custom_palette = ["#FF5733", "#FFC300", "#3498DB", "#2ECC71", "#9B59B6", "#E67E22", "#1ABC9C"]
sns.barplot(x = "Miles range", y = "Price", data=df_grouped, palette=custom_palette)

plt.xlabel("Mileage range")
plt.ylabel("Average price ($)")
plt.title("Average price of cars by mileage range")
plt.show()

**Looking at mileage now, it is also noticeable that higher mileage goes together with lower car prices.**

In [None]:
from scipy import stats

# One-way ANOVA: does the mean Price differ between mileage buckets?
miles_range = df["Miles range"]

# Build one Price sample per bucket that actually has rows. groupby skips rows
# whose bucket is NaN, and observed=True skips buckets with no rows; an empty
# sample (which an equality filter on a NaN key always produces) would make
# f_oneway return NaN.
miles_range_groups = [
    prices for _, prices in df.groupby("Miles range", observed=True)["Price"]
]

f_stat, p_value = stats.f_oneway(*miles_range_groups)
print(f"F-Statistic: {f_stat}, P-Value: {p_value}")

**The F-statistic and p-value from this one-way ANOVA reinforce what the chart above shows: how closely mileage and price are related.**

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Give each brand an integer code so it can appear in the correlation matrix.
# NOTE(review): LabelEncoder numbers the classes in sorted order, so for a
# nominal column like Brand the codes carry no meaningful ordering.
df['Brand Number'] = label_encoder.fit_transform(df['Brand'])

# Give each mileage bucket an integer code that follows the bucket order.
# "Miles range" is the categorical produced by pd.cut, so its category codes
# run 0, 1, 2, ... from the lowest bucket to the highest. LabelEncoder would
# sort the labels as strings ("0-30k", "120k-150k", ..., "90k-120k") and
# scramble that order, making a correlation with Price meaningless.
miles_range_codes = df['Miles range'].cat.codes
# Rows without a bucket get code -1; turn those into NaN so they are ignored
# instead of being treated as a bucket below the first one.
df['Miles Range number'] = miles_range_codes.where(miles_range_codes >= 0)

In [None]:
# These columns are left out so that they do not affect the correlation matrix.
excluded_columns = [
    "Brand",
    "Model",
    "Fuel_Type",
    "Transmission",
    "Doors",
    "Owner_Count",
    "Miles range",
]
df_filtered = df.drop(columns=excluded_columns)
correlation_matrix = df_filtered.corr()

# Annotated heatmap of the pairwise correlations between the remaining columns.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Matriz de correlação")
plt.show()

**Once again we see the connection that age and mileage have with the price of the car.**

In [None]:
from scipy import stats

# Two-sample t-test on Price: cars with exactly one owner versus cars with
# more than one owner.
single_owner_mask = df["Owner_Count"] == 1
multi_owner_mask = df["Owner_Count"] > 1

df_1_owner = df.loc[single_owner_mask, "Price"]
df_2m_owners = df.loc[multi_owner_mask, "Price"]

t_stat, p_value = stats.ttest_ind(df_1_owner, df_2m_owners)
print(f"T-Statistic: {t_stat}, P-Value: {p_value}")

In [None]:
import seaborn as sns

# One box per owner count, to compare price distributions between them.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x="Owner_Count", y="Price", data=df, ax=ax)

# Vertical tick labels, as in the other boxplot cells.
ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("Owner_Count")
ax.set_ylabel("Price")
ax.set_title("Price distribution by Owner_Count")

# Render explicitly, like the other plotting cells in this notebook, so the
# cell output is the figure rather than the `Text(...)` repr of the last call.
plt.show()

**Both the t-test and the boxplot make it clear that the number of previous owners has almost no effect on the price of the cars.**

In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np

# Single-feature linear regression: Price as a function of Age.
x = df[["Age"]].values  # 2-D (n, 1) array, the shape scikit-learn expects
y = df["Price"]

model = LinearRegression()
model.fit(x, y)

# In-sample predictions, one per row of df.
y_pred = model.predict(x)

# Band of +/- 1.96 residual standard deviations around the fitted line.
residuals = y - y_pred
std_error = np.std(residuals)

upper_bound = y_pred + 1.96 * std_error
lower_bound = y_pred - 1.96 * std_error

# Draw the line and the band in ascending Age order. fill_between joins the
# points in the order given; with x in row order the band's outline folds back
# over itself and only part of the Age range ends up shaded.
order = np.argsort(x.flatten())
x_sorted = x.flatten()[order]

plt.figure(figsize=(12,6))
plt.scatter(x, y, color = 'blue', label = 'Real data')
plt.plot(x_sorted, y_pred[order], color='red', label='Prediction (regression line)')
plt.fill_between(x_sorted, lower_bound[order], upper_bound[order], color='gray', alpha=0.2, label='Confidence Interval')
plt.title('Price prediction based on car age')
plt.xlabel("Car age")
plt.ylabel("Price")
plt.legend()
plt.show()

**Even with points far from the regression line (red), the gray band — drawn at ±1.96 residual standard deviations around the prediction and representing the confidence interval — shows that the age of the car is a very important variable when setting the selling price of a car.**