# 🏍️ Bike Dataset EDA

This notebook answers various analysis questions on a bike dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the bike dataset into a DataFrame.
# Point the path below at your own copy of the CSV (or swap in upload code).
df = pd.read_csv(filepath_or_buffer='bikes.csv')
# Preview the first rows to sanity-check what was parsed.
df.head()

### 1️⃣ What is the range of selling prices in the dataset?

In [None]:
# Express the range as a (lowest, highest) pair of selling prices.
prices = df['selling_price']
selling_price_range = (prices.min(), prices.max())
selling_price_range

### 2️⃣ What is the median selling price for bikes in the dataset?

In [None]:
# Median of the selling_price column across every row of the dataset.
median_selling_price = df.loc[:, 'selling_price'].median()
median_selling_price

### 3️⃣ What is the most common seller type?

In [None]:
# mode() can return several values when there is a tie; keep the first one.
seller_type_modes = df['seller_type'].mode()
most_common_seller_type = seller_type_modes[0]
most_common_seller_type

### 4️⃣ How many bikes have driven more than 50,000 kilometers?

In [None]:
# Mask of bikes strictly above 50,000 km; counting the selected rows gives the answer.
over_50k_km = df['km_driven'] > 50000
count_high_km = len(df[over_50k_km])
count_high_km

### 5️⃣ Average km_driven for each ownership type

In [None]:
# Group the km_driven readings by the 'owner' column, then average within each group.
km_by_owner = df.groupby('owner')['km_driven']
avg_km_by_ownership = km_by_owner.mean()
avg_km_by_ownership

### 6️⃣ Proportion of bikes from 2015 or older

In [None]:
# Share of rows whose model year is 2015 or earlier (2015 itself is included).
is_2015_or_older = df['year'] <= 2015
proportion_2015_older = len(df[is_2015_or_older]) / len(df)
proportion_2015_older

### 7️⃣ Missing values per column (sorted, most missing first)

In [None]:
# Count nulls per column, then list the columns with the most gaps first.
null_counts = df.isnull().sum()
missing_values = null_counts.sort_values(ascending=False)
missing_values

### 8️⃣ Highest ex_showroom_price and corresponding bike

In [None]:
# Locate the index label of the row with the largest ex_showroom_price,
# then pull just the bike name and that price from it.
top_ex_price_row = df['ex_showroom_price'].idxmax()
highest_ex_price = df.loc[top_ex_price_row, ['name', 'ex_showroom_price']]
highest_ex_price

### 9️⃣ Total number of bikes listed by each seller type

In [None]:
# Number of listings for each distinct seller_type value.
seller_types = df['seller_type']
bikes_by_seller = seller_types.value_counts()
bikes_by_seller

### 🔟 Relationship between selling_price and km_driven for first-owner bikes

In [None]:
# NOTE(review): the filter is an exact string match on 'First Owner'; if the
# dataset labels this category differently the plot will be empty — confirm
# against df['owner'].unique().
first_owner_bikes = df[df['owner'] == 'First Owner']

# scatterplot returns the Axes it drew on, so the title can be set on it directly.
ax = sns.scatterplot(data=first_owner_bikes, x='km_driven', y='selling_price')
ax.set_title('Selling Price vs. KM Driven for First Owner Bikes')
plt.show()

### 1️⃣1️⃣ Remove outliers in km_driven using IQR

In [None]:
# Quartiles of km_driven and the interquartile range between them.
km = df['km_driven']
Q1 = km.quantile(0.25)
Q3 = km.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 * IQR beyond each quartile; rows outside them are treated as outliers.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep rows whose km_driven lies within the fences (both ends inclusive).
within_fences = (km >= lower_fence) & (km <= upper_fence)
filtered_df = df[within_fences]
filtered_df.shape

### 1️⃣2️⃣ Bivariate analysis: Year vs. Selling Price

In [None]:
# One box per model year, drawn from filtered_df (the frame with km_driven outliers removed).
ax = sns.boxplot(data=filtered_df, x='year', y='selling_price')
ax.set_title('Year vs. Selling Price')
# Tilt the year labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45)
plt.show()

### 1️⃣3️⃣ Average depreciation in selling price based on bike age

In [None]:
from datetime import datetime

# Age is measured from the calendar year in which this cell runs, so the
# values shift by one each time the year rolls over.
current_year = datetime.now().year
df['age'] = current_year - df['year']

# Mean selling price at each age. This is a price-by-age profile rather than a
# computed depreciation figure; compare adjacent ages to read off the decline.
avg_depreciation = df['selling_price'].groupby(df['age']).mean()
avg_depreciation

### 1️⃣4️⃣ Bikes priced significantly above average for their year

In [None]:
# Attach to every row the mean selling price of all bikes from the same year.
yearly_mean_price = df['selling_price'].groupby(df['year']).transform('mean')
df['avg_price_year'] = yearly_mean_price

# "Significantly above average" here means more than 1.5x that yearly mean.
is_well_above_avg = df['selling_price'] > 1.5 * df['avg_price_year']
above_avg_bikes = df[is_well_above_avg]
above_avg_bikes.loc[:, ['name', 'year', 'selling_price', 'avg_price_year']]

### 1️⃣5️⃣ Correlation matrix and heatmap

In [None]:
# Correlation is only defined for numeric columns. This frame also holds
# non-numeric ones (e.g. 'owner', which is compared to a string label earlier
# in the notebook), and a bare df.corr() raises ValueError on pandas >= 2.0
# instead of silently dropping them. Selecting the numeric columns explicitly
# works on every pandas version.
# Note: any derived columns added to df by earlier cells ('age',
# 'avg_price_year') will appear in the matrix too if those cells were run.
numeric_df = df.select_dtypes(include='number')
corr = numeric_df.corr()

sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()