In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

In [None]:
# Load the Airbnb listings CSV. low_memory=False asks pandas to parse the file in
# one pass instead of chunk-by-chunk, which avoids mixed-type column inference.
DATA_PATH = '/kaggle/input/airbnbopendata/Airbnb_Open_Data.csv'
df = pd.read_csv(DATA_PATH, low_memory=False)

# Normalise the headers right after loading (lower-case, spaces -> underscores).
# The cleaning cells below look columns up by their normalised names
# ('service_fee', 'last_review', 'host_name', ...); doing this only at the end of
# cleaning would make those lookups silently skip any raw header that contains
# spaces or capitals. The later "Format column names" cell stays valid because
# the transformation is idempotent.
df.columns = df.columns.str.lower().str.replace(' ', '_')


In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

In [None]:
# DataFrame.info() prints its report itself and returns None, so call it
# directly; wrapping it in print() adds a stray "None" line to the output.
df.info()

In [None]:
# Missing-value handling, step 1:
# remove the 'license' column entirely (noted by the author as having too many
# missing values) rather than imputing it.
df = df.drop('license', axis=1)

In [None]:
# Missing-value handling, step 2:
# every float64/int64 column gets its gaps filled with that column's median.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
column_medians = df[numerical_cols].median()
df[numerical_cols] = df[numerical_cols].fillna(column_medians)

In [None]:
# Fill missing values in categorical (object-dtype) columns with the column mode.
categorical_cols = df.select_dtypes(include=['object']).columns

# Price, service fee and last review are still stored as text at this point (they
# are converted to numbers / dates in later cells). Filling their gaps with the
# most frequent string would invent prices and review dates that then feed the
# price statistics, so they are left as missing instead. Names are compared in
# normalised form so the check works for both 'service fee' and 'service_fee'.
value_like_names = {'price', 'service_fee', 'last_review'}
mode_fill_cols = [
    col for col in categorical_cols
    if str(col).lower().replace(' ', '_') not in value_like_names
]

if mode_fill_cols:
    column_modes = df[mode_fill_cols].mode()
    # mode() returns an empty frame for an empty DataFrame; .iloc[0] would raise.
    if not column_modes.empty:
        df[mode_fill_cols] = df[mode_fill_cols].fillna(column_modes.iloc[0])

# Let pandas re-infer better dtypes for object columns now that gaps are filled.
df = df.infer_objects()

In [None]:
# Convert price and service fee columns to numeric after removing currency symbols.
# The pattern is a raw string: in a normal string literal '\$' is an invalid
# escape sequence, which Python reports with a warning. Both columns go through
# the same steps, so they share one loop.
for money_col in ('price', 'service_fee'):
    if money_col in df.columns:
        # Drop '$' and thousands separators, then parse the remainder as float.
        df[money_col] = df[money_col].replace(r'[\$,]', '', regex=True).astype(float)
    else:
        print(f"Column '{money_col}' does not exist.")

In [None]:
# Turn 'last_review' into datetimes when the column is present;
# errors='coerce' maps anything unparseable to NaT instead of raising.
if 'last_review' in df:
    parsed_reviews = pd.to_datetime(df['last_review'], errors='coerce')
    df['last_review'] = parsed_reviews

In [None]:
# Cast the known label-like columns to the pandas 'category' dtype.
# Only names that actually exist in the frame are kept, so a missing column
# is skipped rather than raising a KeyError.
candidate_categoricals = (
    'name', 'host_identity_verified', 'host_name', 'neighbourhood_group', 'neighbourhood',
    'country', 'country_code', 'instant_bookable', 'cancellation_policy', 'room_type', 'house_rules',
)
categorical_columns = [col for col in candidate_categoricals if col in df.columns]
for col in categorical_columns:
    df[col] = df[col].astype('category')

In [None]:
# Keep only the first occurrence of each fully identical row.
df = df[~df.duplicated()]

In [None]:
# Normalise headers: lower-case everything and swap spaces for underscores.
df.columns = [header.lower().replace(' ', '_') for header in df.columns]

In [None]:
# Strip whitespace from string columns.
# The columns in `categorical_columns` were cast to the 'category' dtype in an
# earlier cell, so a plain `dtype == 'object'` test never matches them and the
# strip would never run. Both object and category columns are handled here.
def _strip_if_text(value):
    """Return `value` without surrounding whitespace if it is a str, else unchanged."""
    return value.strip() if isinstance(value, str) else value


for col in categorical_columns:
    is_category = isinstance(df[col].dtype, pd.CategoricalDtype)
    if not (is_category or df[col].dtype == 'object'):
        continue
    # Work on plain Python objects so non-string values (booleans, NaN) pass
    # through untouched instead of breaking the .str accessor.
    cleaned = df[col].astype(object).map(_strip_if_text)
    # Re-casting rebuilds the categories, merging labels that only differed by
    # surrounding whitespace.
    df[col] = cleaned.astype('category') if is_category else cleaned

In [None]:
# Summary of the cleaned frame.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.describe())
print(df.head())

In [None]:
# Derive a monthly period ('year_month') from the last-review date, then compute
# the mean price per month as a flat two-column frame.
review_dates = pd.to_datetime(df['last_review'])
df['year_month'] = review_dates.dt.to_period('M')
monthly_mean_price = df.groupby('year_month')['price'].mean()
price_trends = monthly_mean_price.reset_index()


In [None]:
import numpy as np

# Map +inf / -inf to NaN so they are treated like any other missing value.
infinite_values = [np.inf, -np.inf]
df = df.replace(infinite_values, np.nan)


In [None]:
# Count the missing entries per column and print the tally.
nan_values = df.isnull().sum()
print("NaN values in each column:\n", nan_values)


# Analysis and Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence FutureWarning messages for the rest of the notebook.
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Apply seaborn's "whitegrid" theme to all following figures.
sns.set_theme(style="whitegrid")

# Distribution of Listing Prices by Neighbourhood
This section examines how listing prices vary across the different neighbourhood groups.

In [None]:
# Box plot of price for each neighbourhood group, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=df, x='price', y='neighbourhood_group', palette='viridis', ax=ax)
ax.set_title('Price Distribution by Neighbourhood Group')
ax.set_xlabel('Price')
ax.set_ylabel('Neighbourhood Group')
ax.set_xscale('log')  # log-scaled price axis, applied after the boxes are drawn
plt.show()


# Count of Listings by Room Type
Visualizing the count of listings by room type shows how the different room types are distributed within the dataset.

In [None]:
# Horizontal bar chart: number of listings for each room type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, y='room_type', palette='viridis', ax=ax)
ax.set(title='Count of Listings by Room Type', xlabel='Count', ylabel='Room Type')
plt.show()

# Average price by neighbourhood group

In [None]:
# Bar chart of the mean listing price within each neighbourhood group.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=df,
    x='price',
    y='neighbourhood_group',
    estimator='mean',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Average Price by Neighbourhood Group')
ax.set_xlabel('Average Price')
ax.set_ylabel('Neighbourhood Group')
plt.show()

# Room Type Analysis
 Explore the distribution of different room types.

In [None]:
# Listing counts per room type, using the 'Set2' palette.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, y='room_type', palette='Set2', ax=ax)
ax.set(title='Distribution of Room Types', xlabel='Count', ylabel='Room Type')
plt.show()


# Average Price by Room Type
Objective: To see how the price of listings varies by room type.

In [None]:
# Vertical bar chart of the mean listing price for each room type.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df, x='room_type', y='price', estimator='mean', palette='coolwarm', ax=ax)
ax.set_title('Average Price by Room Type')
ax.set_xlabel('Room Type')
ax.set_ylabel('Average Price')
plt.show()


# Number of Reviews by Room Type
 Analyze the distribution of reviews across different room types.

In [None]:
# Box plot showing how review counts are spread within each room type.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='room_type', y='number_of_reviews', palette='pastel', ax=ax)
ax.set(title='Number of Reviews by Room Type', xlabel='Room Type', ylabel='Number of Reviews')
plt.show()


# Number of reviews vs. price
This analysis can help identify pricing strategies, market trends, and quality indicators, revealing anomalies and customer preferences.

In [None]:
# Scatter of review count against price; alpha=0.5 keeps overlapping points visible.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='number_of_reviews', y='price', alpha=0.5, ax=ax)
ax.set_title('Number of Reviews vs. Price')
ax.set_xlabel('Number of Reviews')
ax.set_ylabel('Price')
plt.show()

## Conclusion

In this analysis, we explored various aspects of Airbnb listings, including the distribution of listing prices by neighborhood, the count of listings by room type, and the relationship between the number of reviews and listing prices.

### Key Findings:
- **Distribution of Listing Prices:** Prices vary significantly by neighborhood, with higher average prices in central areas compared to more suburban locations.
- **Room Type Analysis:** The majority of listings are entire homes or apartments, followed by private rooms. Shared rooms constitute a small percentage of the total.
- **Price vs. Number of Reviews:** Listings with more reviews generally have higher prices, which may indicate higher demand or better quality.

### Conclusion:
The analysis provides valuable insights into how listing prices and review counts vary by neighborhood and room type. These insights can inform potential renters and property owners about market trends and pricing strategies.

### Limitations:
- The dataset may have missing values and inconsistencies that could affect the accuracy of the results.
- The analysis is limited to the available data and may not account for all factors influencing pricing and reviews.

### Future Work:
- Additional analysis could explore the impact of seasonal variations on listing prices.
- Integrating external data, such as local events or economic conditions, could provide a more comprehensive understanding of pricing dynamics.

## References
- Airbnb Open Data (Kaggle dataset), loaded from `/kaggle/input/airbnbopendata/Airbnb_Open_Data.csv`
- Data visualization tools used: Seaborn, Matplotlib

## Acknowledgments
- Thanks to the Kaggle community for providing valuable resources and support.
