# Hostel Price Prediction - Exploratory Data Analysis

This notebook explores the hostel dataset to understand the relationships between features and the target variable (price_per_night).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Set plotting style.
# The 'seaborn-v0_8' style sheet only exists under that name in matplotlib >= 3.6;
# earlier releases ship it as 'seaborn'. Pick whichever name is available so the
# cell does not raise OSError on an older environment.
style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(style_name)
sns.set_palette("husl")

# Load data (the path is resolved relative to the kernel's working directory)
df = pd.read_csv('../data/hostel_data.csv')
print(f"Dataset shape: {df.shape}")
# Keep this as the last expression so the notebook renders the preview table
df.head()

In [None]:
# Basic information about the dataset: prints a per-column summary
# (dtype and non-null count) so type problems and gaps are visible early.
df.info()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max).
# With the default arguments pandas restricts this to the numeric columns
# of a mixed-type frame.
df.describe()

In [None]:
# Count NaNs per column, then display only the columns that actually have gaps
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]

## Distribution of Target Variable (Price)

In [None]:
# Histogram of the target variable: 20 bins, bars outlined in black
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['price_per_night'], bins=20, edgecolor='black')
ax.set(
    xlabel='Price per Night (€)',
    ylabel='Frequency',
    title='Distribution of Hostel Bed Prices',
)
ax.grid(True, alpha=0.3)
plt.show()

## Correlation Analysis

In [None]:
# Build a numeric view of the data for the correlation heatmap.
categorical_cols = ['city', 'room_type', 'season', 'bathroom_type']

# Integer-coded copies and their fitted encoders are still produced under the
# original names (df_encoded, le_dict) in case other cells rely on them.
# They are deliberately NOT fed into the correlation below: LabelEncoder numbers
# the categories in sorted order, so a Pearson coefficient on those codes would
# only measure agreement with that arbitrary ordering, not a real relationship.
df_encoded = df.copy()
le_dict = {}

for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col + '_encoded'] = le.fit_transform(df_encoded[col])
    le_dict[col] = le

# Correlate the genuinely numeric columns together with one 0/1 indicator per
# category level. Each indicator has a meaningful Pearson correlation with the
# numeric features; indicators of the same category are negatively correlated
# with each other by construction, which is expected.
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
category_indicators = pd.get_dummies(
    df[categorical_cols], columns=categorical_cols, dtype=float
)
correlation_matrix = pd.concat(
    [df[numerical_cols], category_indicators], axis=1
).corr()

# Plot correlation heatmap. The canvas never shrinks below the original 15x12
# but grows with the number of variables so the annotations stay legible once
# the categories are expanded into indicators.
n_vars = len(correlation_matrix)
plt.figure(figsize=(max(15, 0.6 * n_vars), max(12, 0.5 * n_vars)))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Correlation Matrix of Hostel Features')
plt.tight_layout()
plt.show()

## Relationship Between Key Features and Price

In [None]:
# Scatter of nightly price against distance from the centre, coloured by city
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='distance_to_center_km',
    y='price_per_night',
    hue='city',
    ax=ax,
)
ax.set_xlabel('Distance to City Center (km)')
ax.set_ylabel('Price per Night (€)')
ax.set_title('Price vs Distance to City Center')
# Anchor the legend just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Box plot of nightly price for each room type
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='room_type', y='price_per_night', ax=ax)
ax.set(
    xlabel='Room Type',
    ylabel='Price per Night (€)',
    title='Price Distribution by Room Type',
)
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Scatter of nightly price against hostel rating, coloured by city
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='rating', y='price_per_night', hue='city', ax=ax)
ax.set_xlabel('Hostel Rating')
ax.set_ylabel('Price per Night (€)')
ax.set_title('Price vs Hostel Rating')
# Anchor the legend just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Box plot of nightly price for each season
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='season', y='price_per_night', ax=ax)
ax.set(
    xlabel='Season',
    ylabel='Price per Night (€)',
    title='Price Distribution by Season',
)
ax.grid(True, alpha=0.3)
plt.show()

## Amenities Analysis

In [None]:
# Amenity flag columns; this list is reused by the average-price cell below
amenity_cols = [
    'breakfast_included',
    'wifi',
    'laundry',
    'kitchen_access',
    'air_conditioning',
    'locker_available',
    'security_24h',
    'female_only_dorm_available',
]

# Column-wise sum of the flags = number of rows offering each amenity
amenity_counts = df[amenity_cols].sum()

fig, ax = plt.subplots(figsize=(12, 6))
amenity_counts.plot.bar(ax=ax)
ax.set_xlabel('Amenities')
ax.set_ylabel('Count of Hostels')
ax.set_title('Availability of Amenities Across Hostels')
# Tilt the long column names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Mean nightly price over the rows where each amenity flag equals 1
avg_price_by_amenities = {
    amenity: df.loc[df[amenity] == 1, 'price_per_night'].mean()
    for amenity in amenity_cols
}

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(list(avg_price_by_amenities.keys()), list(avg_price_by_amenities.values()))
ax.set_xlabel('Amenities')
ax.set_ylabel('Average Price per Night (€)')
ax.set_title('Average Price by Amenities')
# Tilt the long amenity names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## City-wise Analysis

In [None]:
# Mean nightly price per city, most expensive city first
city_avg_price = (
    df.groupby('city')['price_per_night']
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 6))
city_avg_price.plot.bar(ax=ax)
ax.set_xlabel('City')
ax.set_ylabel('Average Price per Night (€)')
ax.set_title('Average Price by City')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Demand Factors Analysis

In [None]:
# Scatter of nightly price against occupancy rate
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='occupancy_rate', y='price_per_night', ax=ax)
ax.set(
    xlabel='Occupancy Rate',
    ylabel='Price per Night (€)',
    title='Price vs Occupancy Rate',
)
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Scatter of nightly price against the number of reviews
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='num_reviews', y='price_per_night', ax=ax)
ax.set(
    xlabel='Number of Reviews',
    ylabel='Price per Night (€)',
    title='Price vs Number of Reviews',
)
ax.grid(True, alpha=0.3)
plt.show()

## Summary Insights

Based on the exploratory data analysis, we can observe:

1. **Price Distribution**: The histogram of `price_per_night` shows how hostel bed prices are spread across the observed range, including any skew or outliers.

2. **Key Correlations**: 
   - Distance to city center may impact pricing
   - Ratings and number of reviews may correlate with price
   - Seasonal factors may affect pricing

3. **Amenities Impact**: Certain amenities like breakfast, air conditioning, and 24h security may increase pricing.

4. **City Differences**: Different cities show varying average prices, indicating location as a significant factor.

These insights will guide our feature engineering and model selection process.