In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA

import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below; comment it out while debugging.
warnings.simplefilter('ignore')

In [4]:
# NOTE(review): absolute, user-specific Windows path -- this cell only runs on
# the original author's machine. The name `df` is reassigned when the Airbnb
# CSV is loaded further down the notebook.
glass_csv_path = r"C:\Users\cheru\Desktop\glass.csv"
df = pd.read_csv(glass_csv_path)

In [None]:
# Import all necessary libraries at the beginning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib  # For saving the model

# Set visualization style
sns.set(style='whitegrid')

# =========================
# Step 1: Load Data
# =========================
# The path is relative to the notebook's working directory.
df = pd.read_csv('airbnb_nyc_listings.csv')  # Replace with your dataset path

# Basic info
print("Data Info:")
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()
print("\nData Description:")
print(df.describe(include='all'))

# =========================
# Step 2: Exploratory Data Analysis (EDA)
# =========================

# Price distribution: 50-bin histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['price'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Airbnb Prices')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Frequency')
plt.show()

# Price boxplot, used to eyeball outliers
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df['price'], ax=ax)
ax.set_title('Boxplot of Airbnb Prices')
ax.set_xlabel('Price ($)')
plt.show()

# Price against bedroom count
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='bedrooms', y='price', ax=ax)
ax.set_title('Price vs. Number of Bedrooms')
ax.set_xlabel('Bedrooms')
ax.set_ylabel('Price ($)')
plt.show()

# Correlation heatmap
# Only numeric columns are correlated. The frame also carries text columns
# (e.g. 'neighbourhood', 'room_type', 'property_type'), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0. select_dtypes() is used instead
# of corr(numeric_only=True) so the cell also runs on older pandas releases.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(12,8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# =========================
# Step 3: Data Preprocessing
# =========================

# Check missing data
print("Missing values per column:\n", df.isnull().sum())

# Fill missing numerical data with the column median.
# The result is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# operates on an intermediate Series, which is deprecated chained assignment
# and leaves `df` unchanged under pandas copy-on-write.
num_features = ['price', 'bedrooms', 'bathrooms', 'number_of_reviews', 'review_scores_rating']
for col in num_features:
    df[col] = df[col].fillna(df[col].median())

# Fill missing categorical data with the sentinel label 'Missing'
cat_features = ['neighbourhood', 'room_type', 'property_type']
for col in cat_features:
    df[col] = df[col].fillna('Missing')

# One-hot encode the categorical features; drop_first removes one level per
# feature so the dummy columns are not perfectly collinear.
df_encoded = pd.get_dummies(df, columns=cat_features, drop_first=True)

# Standardise the numerical features (zero mean, unit variance).
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed later, and 'price' is scaled here although the modelling
# step drops it from the features and takes the target from the unscaled `df`.
scaler = StandardScaler()
df_encoded[num_features] = scaler.fit_transform(df_encoded[num_features])

# =========================
# Step 4: Feature Engineering
# =========================

# Define price categories for analysis.
# Bins are right-closed: (0, 100], (100, 200], ... (500, inf).
price_bins = [0, 100, 200, 300, 400, 500, np.inf]
price_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High', 'Luxury']
# include_lowest=True closes the first bin on the left as well, so a listing
# priced at exactly 0 lands in 'Very Low' instead of becoming NaN and silently
# dropping out of the per-category plot and analysis.
df['price_category'] = pd.cut(
    df['price'], bins=price_bins, labels=price_labels, include_lowest=True
)

# Visualize distribution of price categories
plt.figure(figsize=(8,5))
sns.countplot(x='price_category', data=df, order=price_labels)
plt.title('Number of Listings per Price Category')
plt.xlabel('Price Category')
plt.ylabel('Count')
plt.show()

# =========================
# Step 5: Feature Range Analysis per Price Category
# =========================

# Columns summarised for every price band
range_columns = ['review_scores_rating', 'number_of_reviews', 'bedrooms', 'bathrooms']
distribution_columns = ['neighbourhood', 'room_type', 'property_type']

for label in price_labels:
    # Listings whose price falls into the current band
    listings = df.loc[df['price_category'] == label]
    print(f"\n--- {label} Listings ---")
    print(f"Count: {len(listings)}")

    # Min/max of each numerical column within the band
    print("\nNumerical feature ranges:")
    for column in range_columns:
        lowest, highest = listings[column].min(), listings[column].max()
        print(f"{column}: {lowest} - {highest}")

    # Frequency table of each categorical column within the band
    print("\nCategorical feature distributions:")
    for column in distribution_columns:
        print(f"\n{column} distribution:")
        print(listings[column].value_counts())

# =========================
# Step 6: Prepare Data for Modeling
# =========================

# Columns that must not be used as predictors: the target itself plus
# identifier-like columns. errors='ignore' tolerates any that are absent.
non_feature_columns = ['price', 'id', 'name', 'price_category']
X = df_encoded.drop(columns=non_feature_columns, errors='ignore')

# Target is taken from `df`, i.e. the price on its original (unscaled) scale
y = df['price']

# Hold out 20% of the rows for testing; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# =========================
# Step 7: Model Training and Tuning
# =========================

# Base estimator: random forest with a fixed seed
rf = RandomForestRegressor(random_state=42)

# Hyper-parameter grid explored exhaustively (2 * 3 * 2 * 2 = 24 candidates)
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# 5-fold cross-validated search scored by (negated) mean absolute error,
# using all available cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Estimator refitted on the full training set with the best parameters
best_rf = grid_search.best_estimator_

# =========================
# Step 8: Model Evaluation
# =========================

# Score the tuned model on the held-out rows
y_pred = best_rf.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nModel Performance:")
for metric_name, metric_value in (("MAE", mae), ("RMSE", rmse), ("R^2", r2)):
    print(f"{metric_name}: {metric_value:.2f}")

# Predicted vs. actual prices, with the y = x reference line in dashed red
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=ax)
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs. Predicted Prices')
price_low, price_high = y_test.min(), y_test.max()
ax.plot([price_low, price_high], [price_low, price_high], 'r--')
plt.show()

# =========================
# Step 9: Save the Best Model
# =========================

# Serialise the tuned estimator into the current working directory
model_filename = 'bnb_price_prediction_model.pkl'
joblib.dump(best_rf, model_filename)
print("Model saved as 'bnb_price_prediction_model.pkl'.")
