In [31]:
### 1. The Recursive Feature Elimination (RFE) using a Lasso model

In [15]:

# imports for data handling
import pandas as pd

#lasso
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFE


data = pd.read_csv('listings.csv')

data_limited = data[["id", "name", "host_id", "host_name", 
                     "neighbourhood_cleansed", "latitude", "longitude", 
                     "room_type", "price", "minimum_nights", 
                     "number_of_reviews", "last_review", "review_scores_rating", 
                     "review_scores_accuracy", "review_scores_cleanliness", 
                     "review_scores_checkin", "review_scores_communication", 
                     "review_scores_location", "review_scores_value", 
                     "reviews_per_month", "calculated_host_listings_count", 
                     "availability_365"]]


# Removing rows with no reviews
data_filtered = data_limited.loc[data_limited['number_of_reviews'] != 0]

# Remove NaN values
data_filtered = data_filtered.dropna()

# Correct neighborhood names
data_filtered["neighbourhood_cleansed"] = data_filtered["neighbourhood_cleansed"].replace({
    "Nrrebro": "Nørrebro", "sterbro": "Østerbro", "Vanlse": "Vanløse", "Brnshj-Husum": "Brønshøj-Husum"})

data_filtered['price'] = data_filtered['price'].replace('[\\$,]', '', regex=True).astype(float)


# Dropping irrelevant columns 
columns_to_remove = ['id', 'host_id', 'calculated_host_listings_count']
filtered_data = data_filtered.drop(columns=columns_to_remove)

# Select the features (excluding non-numeric and target variables)
X = data_filtered.drop(columns=['price', 'name', 'host_name', 'neighbourhood_cleansed', 'last_review', 'room_type'])
y = data_filtered['price']

# Initialize Lasso model
lasso = Lasso(alpha=0.0001)

# Perform Recursive Feature Elimination (RFE)
rfe = RFE(estimator=lasso, n_features_to_select=5)  # Selecting top 5 features
rfe.fit(X, y)

# Get the ranking of features
ranking = pd.DataFrame({
    'Feature': X.columns,
    'Ranking': rfe.ranking_
}).sort_values(by='Ranking')

print(ranking.head())



                      Feature  Ranking
3                   longitude        1
2                    latitude        1
11     review_scores_location        1
8   review_scores_cleanliness        1
12        review_scores_value        1


In [2]:
# imports for data handling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# visualization
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz
from IPython.display import Image, display

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge, ElasticNet, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


data = pd.read_csv('listings.csv')

data_limited = data[["id", "name", "host_id", "host_name", 
                     "neighbourhood_cleansed", "latitude", "longitude", 
                     "room_type", "price", "minimum_nights", 
                     "number_of_reviews", "last_review", "review_scores_rating", 
                     "review_scores_accuracy", "review_scores_cleanliness", 
                     "review_scores_checkin", "review_scores_communication", 
                     "review_scores_location", "review_scores_value", 
                     "reviews_per_month", "calculated_host_listings_count", 
                     "availability_365"]]


# Removing rows with no reviews
data_filtered = data_limited.loc[data_limited['number_of_reviews'] != 0]
# Remove NaN values
data_filtered = data_filtered.dropna()

# Correct neighborhood names
data_filtered["neighbourhood_cleansed"] = data_filtered["neighbourhood_cleansed"].replace({
    "Nrrebro": "Nørrebro", "sterbro": "Østerbro", "Vanlse": "Vanløse", "Brnshj-Husum": "Brønshøj-Husum"})

# Remove dollar signs and commas and convert to float
data_filtered['price'] = data_filtered['price'].replace('[\\$,]', '', regex=True).astype(float)


# Defining features and target
features = ['latitude', 'longitude', 'review_scores_cleanliness', 'review_scores_location', 'review_scores_value']
target = 'price'

# Giving X and y the vars
X = data_filtered[features]
y = data_filtered[target]

# Train-test split, test 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Initialize the models
lasso = Lasso(alpha=0.01)
ridge = Ridge(alpha=0.01)
ols = LinearRegression()

# Fit the models
lasso.fit(X_train, y_train)
ridge.fit(X_train, y_train)
ols.fit(X_train, y_train)

# Make predictions on the test set
y_pred_lasso = lasso.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
y_pred_ols = ols.predict(X_test)

# Calculate performance metrics for each model
metrics = {
    'Model': ['Lasso', 'Ridge', 'OLS'],
    'MSE': [
        mean_squared_error(y_test, y_pred_lasso),
        mean_squared_error(y_test, y_pred_ridge),
        mean_squared_error(y_test, y_pred_ols)
    ],
    'R^2': [
        r2_score(y_test, y_pred_lasso),
        r2_score(y_test, y_pred_ridge),
        r2_score(y_test, y_pred_ols)
    ]
}

results_df  = pd.DataFrame(metrics)
print(results_df )

# Train predictions
y_train_pred_lasso = lasso.predict(X_train)
y_train_pred_ridge = ridge.predict(X_train)
y_train_pred_ols = ols.predict(X_train)

# Compute metrics for training set
train_metrics = {
    'Model': ['Lasso', 'Ridge', 'OLS'],
    'MSE (Train)': [
        mean_squared_error(y_train, y_train_pred_lasso),
        mean_squared_error(y_train, y_train_pred_ridge),
        mean_squared_error(y_train, y_train_pred_ols)
    ],
    'R^2 (Train)': [
        r2_score(y_train, y_train_pred_lasso),
        r2_score(y_train, y_train_pred_ridge),
        r2_score(y_train, y_train_pred_ols)
    ],
    'MSE (Test)': [
        mean_squared_error(y_test, y_pred_lasso),
        mean_squared_error(y_test, y_pred_ridge),
        mean_squared_error(y_test, y_pred_ols)
    ],
    'R^2 (Test)': [
        r2_score(y_test, y_pred_lasso),
        r2_score(y_test, y_pred_ridge),
        r2_score(y_test, y_pred_ols)
    ]
    
}

comparison_df = pd.DataFrame(train_metrics)
print(comparison_df)




   Model           MSE       R^2
0  Lasso  1.276689e+06  0.012196
1  Ridge  1.276709e+06  0.012180
2    OLS  1.276705e+06  0.012183
   Model   MSE (Train)  R^2 (Train)    MSE (Test)  R^2 (Test)
0  Lasso  1.569109e+06     0.012143  1.276689e+06    0.012196
1  Ridge  1.569109e+06     0.012143  1.276709e+06    0.012180
2    OLS  1.569109e+06     0.012143  1.276705e+06    0.012183


In [None]:
### Baseline

In [3]:
baseline_pred_train = np.full_like(y_train, y_train.mean())
baseline_pred_test = np.full_like(y_test, y_train.mean())  # Use training mean for test predictions too

# Calculate MSE and R^2 for baseline model
baseline_metrics = {
    'Model': ['Baseline'],
    'MSE (Train)': [mean_squared_error(y_train, baseline_pred_train)],
    'R^2 (Train)': [r2_score(y_train, baseline_pred_train)],
    'MSE (Test)': [mean_squared_error(y_test, baseline_pred_test)],
    'R^2 (Test)': [r2_score(y_test, baseline_pred_test)]
}

# Convert the baseline results into a DataFrame for comparison
baseline_df = pd.DataFrame(baseline_metrics)

# Append the baseline metrics to the comparison DataFrame
full_comparison_df = pd.concat([comparison_df, baseline_df], ignore_index=True)

# Display 
full_comparison_df

Unnamed: 0,Model,MSE (Train),R^2 (Train),MSE (Test),R^2 (Test)
0,Lasso,1569109.0,0.012143,1276689.0,0.012196
1,Ridge,1569109.0,0.012143,1276709.0,0.01218
2,OLS,1569109.0,0.012143,1276705.0,0.012183
3,Baseline,1588396.0,0.0,1292632.0,-0.00014
