# Double check the file path

In [None]:
import os

# Report whether the CSV is present at this absolute Windows path.
local_csv_path = r'C:\Users\AAYUSHI GUPTA\Documents\Airbnb.csv'
print(os.path.exists(local_csv_path))

# Step 1: Upload the CSV File to Google Colab

In [None]:
# Colab-only helper module; this import fails outside Google Colab.
from google.colab import files
# Call the Colab upload helper and keep its return value in `uploaded`.
# NOTE(review): per the Colab docs this is presumably a dict keyed by the
# uploaded file name -- confirm before relying on its shape.
uploaded = files.upload()

# Step 2: Load the CSV into a DataFrame

In [None]:
import pandas as pd

# Use the exact filename you uploaded
csv_filename = 'Airbnb.csv'

# Read the listings into the DataFrame used by every later step.
df = pd.read_csv(csv_filename)

# Preview the first 5 rows (kept as the final expression so the notebook
# renders it as the cell output).
df.head()

# Step 3: Import Other Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
# Install a blanket "ignore" filter: every warning category is silenced for
# the rest of the session, including deprecation notices from libraries.
warnings.filterwarnings(action="ignore")

# Apply Seaborn's "whitegrid" style to all subsequent plots.
sns.set(style="whitegrid")

# Step 4: Data Cleaning & Preprocessing


In [None]:
# Check structure
df.info()

# Check for missing values
print(df.isnull().sum())

# Fill missing review values.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on the `df['reviews_per_month']` selection: that
# chained in-place form is deprecated in pandas 2.x and leaves `df` unchanged
# once Copy-on-Write is enabled, so the NaNs would silently survive.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

# Drop columns that are not needed for the analysis.
df.drop(['name', 'host_name'], axis=1, inplace=True)

# Drop remaining rows with missing values.
# NOTE(review): this removes a row if ANY remaining column is missing; confirm
# that no sparsely populated column causes many listings to be discarded.
df.dropna(inplace=True)

# Remove duplicates
df.drop_duplicates(inplace=True)

# Step 5: Exploratory Data Analysis (EDA)

📊 Room Type Distribution

In [None]:
# Count of listings for each room type, drawn as a bar chart.
room_type_ax = sns.countplot(data=df, x='room_type')
room_type_ax.set_title("Room Type Distribution")
plt.show()

💰 Price Distribution

In [None]:
# Histogram (100 bins) of listing prices with a KDE overlay.
plt.figure(figsize=(10, 5))
price_ax = sns.histplot(data=df, x='price', bins=100, kde=True)

# Focus on typical price range
price_ax.set_xlim(0, 500)

price_ax.set_title("Price Distribution")
price_ax.set_xlabel("Price ($)")
price_ax.set_ylabel("Number of Listings")
plt.show()

📍 Geographical Distribution (Map)

In [None]:
# Plot a random subset of listings on a map, coloured and sized by price.
# The sample size is capped at the number of available rows: requesting 1000
# rows from a smaller frame (e.g. after cleaning) makes DataFrame.sample raise
# ValueError because it samples without replacement.
map_sample = df.sample(n=min(1000, len(df)))

# Kept as the final expression so the notebook renders the returned figure.
px.scatter_mapbox(map_sample,
                  lat="latitude", lon="longitude",
                  color="price", size="price",
                  hover_name="neighbourhood",
                  mapbox_style="carto-positron",
                  zoom=10,
                  title="Listing Locations with Price")

📅 Availability Analysis

In [None]:
# Histogram (30 bins) of the availability_365 column with a KDE overlay.
availability_ax = sns.histplot(data=df, x='availability_365', bins=30, kde=True)
availability_ax.set_title("Availability Throughout the Year")
availability_ax.set_xlabel("Days Available")
availability_ax.set_ylabel("Listings Count")
plt.show()

# Step 6: Correlation & Price Analysis

In [None]:
# Correlation only makes sense for numeric data, so keep just those columns.
numeric_df = df.select_dtypes(include='number')

# Pairwise correlations between the numeric columns.
correlation_matrix = numeric_df.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

# Step 7: Neighborhood & Host Analysis

🏘️ Average Price by Neighbourhood Group

In [None]:
# Mean price for each neighbourhood group, highest first.
price_by_group = df.groupby('neighbourhood_group')['price']
avg_price = price_by_group.mean().sort_values(ascending=False)

# Bar chart of the ranked averages.
avg_price_ax = avg_price.plot.bar(color='skyblue', title='Avg Price per Neighbourhood Group')
avg_price_ax.set_ylabel('Average Price ($)')
plt.show()

👤 Top Hosts by Number of Listings

In [None]:
# Number of listings per host_id, most frequent first.
listings_per_host = df['host_id'].value_counts()

# Keep the ten hosts with the most listings and print them.
top_hosts = listings_per_host.iloc[:10]
print("Top 10 Hosts by Listings:\n", top_hosts)

# Step 8: Save Cleaned Data

In [None]:
# Write the cleaned DataFrame to disk, omitting the index column.
cleaned_csv_path = 'cleaned_airbnb_data.csv'
df.to_csv(cleaned_csv_path, index=False)