In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# 1. Load dataset
df = pd.read_csv('listings.csv')

# 2. Clean data
# Convert price to float FIRST. When the column holds strings such as
# "$1,200.00", comparing it with 0 raises a TypeError, so the numeric filter
# below must run after this conversion. The dtype test covers both the legacy
# object dtype and pandas' dedicated string dtypes.
if not pd.api.types.is_numeric_dtype(df['price']):
    df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Remove rows where price is missing or not strictly positive
df = df[df['price'].notnull() & (df['price'] > 0)]

# Drop rows with missing values in the columns later used as model inputs
df = df.dropna(subset=['room_type', 'latitude', 'longitude', 'minimum_nights', 'number_of_reviews'])

In [None]:
# Number of listings per neighbourhood.
# value_counts() returns one entry per unique neighbourhood, largest first.
neighborhood_counts = df.loc[:, 'neighbourhood'].value_counts()

# Show the ten neighbourhoods with the most listings
print(neighborhood_counts.head(10))


In [None]:
# Subset of listings whose neighbourhood is exactly "el Raval"
neighbourhood_df = df.loc[df['neighbourhood'].eq("el Raval")]
# Uncomment to preview the first rows:
#print(neighbourhood_df[:5])

In [None]:
# Number of listings for each room type across the whole dataset.
# One entry per unique room_type value.
room_type_count = df.loc[:, 'room_type'].value_counts()

print(room_type_count)

In [None]:
# Subset of listings whose room type is 'Entire home/apt'
room_type_df = df.loc[df['room_type'].eq('Entire home/apt')]

# Preview the first five rows
print(room_type_df.head(5))

In [None]:
# Room-type breakdown inside the selected neighbourhood subset
room_nei_counts_ = neighbourhood_df.loc[:, 'room_type'].value_counts()

# Show (at most) the ten most common room types
print(room_nei_counts_.head(10))

In [None]:
# Narrow the neighbourhood subset further to 'Entire home/apt' listings
neighbourhood_roomtype_df = neighbourhood_df.loc[neighbourhood_df['room_type'].eq("Entire home/apt")]

# Preview the first five rows
print(neighbourhood_roomtype_df.head(5))

In [None]:
# 3. One-hot encode 'room_type'
#print(df.columns)

# One indicator column per room type, named 'room_<type>'
room_dummies = pd.get_dummies(df['room_type'], prefix='room')

# Append the indicators and remove the original categorical column
df = pd.concat([df, room_dummies], axis=1).drop(columns='room_type')




In [None]:
# Preview the first ten rows of the indicator columns
print(room_dummies.head(10))

In [None]:
# 4. Define features and target
# Location and activity columns plus every room-type indicator column
features = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', *room_dummies.columns]
X = df.loc[:, features]
y = df.loc[:, 'price']

# 5. Split data
# 80/20 train/test split; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# 6. Train model
# fit() returns the estimator itself, so construction and fitting can be chained
model = LinearRegression().fit(X_train, y_train)

# 7. Make predictions
y_pred = model.predict(X_test)

# 8. Evaluate model
# Both metrics are computed on the held-out test rows
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

In [None]:
# Actual vs. predicted prices, drawn with Matplotlib's object-oriented API
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.7, color='teal')
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual vs Predicted Airbnb Prices")
ax.grid(True)
plt.show()



In [None]:
# 9. Analyze errors
# Start from a copy of the test features and attach actual / predicted prices
error_df = X_test.assign(**{'Actual Price': y_test, 'Predicted Price': y_pred})
# Absolute prediction error per listing
error_df['Error'] = (error_df['Actual Price'] - error_df['Predicted Price']).abs()

# The five listings with the largest absolute error
top_5_errors = error_df.sort_values(by='Error', ascending=False).iloc[:5]
print("\nTop 5 Wrong Predictions:")
print(top_5_errors.loc[:, ['Actual Price', 'Predicted Price', 'Error']])

In [None]:

# This next part repeats the same analysis, but filtered by neighborhood

In [None]:
# One-hot encode room_type for the neighbourhood subset.
# get_dummies only creates a column for room types that occur in this subset.
room_dummies = pd.get_dummies(neighbourhood_df['room_type'], prefix='room')

# Concatenate the dummies with the neighbourhood DataFrame. Indicator columns
# left over from an earlier run of this cell are dropped first, so re-running
# the cell does not produce duplicate 'room_*' columns.
neighbourhood_df = pd.concat(
    [neighbourhood_df.drop(columns=room_dummies.columns, errors='ignore'), room_dummies],
    axis=1,
)


In [None]:
# Define features and target
# Room-type indicator columns only exist for room types that occur in this
# neighbourhood, so keep just the ones that are present; selecting a missing
# column name would raise KeyError.
room_indicator_cols = [
    col
    for col in ('room_Entire home/apt', 'room_Private room', 'room_Shared room', 'room_Hotel room')
    if col in neighbourhood_df.columns
]
feature_cols = ['number_of_reviews', 'minimum_nights', 'availability_365'] + room_indicator_cols

X = neighbourhood_df[feature_cols]
y = neighbourhood_df['price']

# 5. Split data
# 80/20 train/test split; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train model
model = LinearRegression()
model.fit(X_train, y_train)

# 7. Make predictions
y_pred = model.predict(X_test)

In [None]:
# 8. Evaluate model
# Both metrics compare the test-set prices with the model's predictions
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

In [None]:
# Actual vs. predicted prices, drawn with Matplotlib's object-oriented API
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.7, color='teal')
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
# Optional axis limits, currently disabled:
#ax.set_xlim(0, 400)
#ax.set_ylim(0, 300)
ax.set_title("Actual vs Predicted Airbnb Prices")
ax.grid(True)
plt.show()

In [None]:
# 9. Analyze errors
# Copy of the test features with actual / predicted prices attached
error_df = X_test.assign(**{'Actual Price': y_test, 'Predicted Price': y_pred})
# Absolute prediction error per listing
error_df['Error'] = (error_df['Actual Price'] - error_df['Predicted Price']).abs()

# The five listings with the largest absolute error
top_5_errors = error_df.sort_values(by='Error', ascending=False).iloc[:5]
print("\nTop 5 Wrong Predictions:")
print(top_5_errors.loc[:, ['Actual Price', 'Predicted Price', 'Error']])

In [None]:
# This section repeats the analysis for a single room type (the listings in room_type_df)

In [None]:
# 4. Define features and target
# Room type is constant within room_type_df, so only numeric columns are used
features = ['number_of_reviews', 'minimum_nights', 'availability_365']
X = room_type_df.loc[:, features]
y = room_type_df.loc[:, 'price']

# 5. Split data
# 80/20 train/test split; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6. Train model
# fit() returns the estimator itself, so construction and fitting can be chained
model = LinearRegression().fit(X_train, y_train)

# 7. Make predictions
y_pred = model.predict(X_test)

In [None]:
# 8. Evaluate model
# Both metrics compare the test-set prices with the model's predictions
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

In [None]:
# Actual vs. predicted prices, drawn with Matplotlib's object-oriented API
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.7, color='teal')
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
# Optional axis limits, currently disabled:
#ax.set_xlim(0, 400)
#ax.set_ylim(0, 300)
ax.set_title("Actual vs Predicted Airbnb Prices")
ax.grid(True)
plt.show()

In [None]:
# 9. Analyze errors
# Copy of the test features with actual / predicted prices attached
error_df = X_test.assign(**{'Actual Price': y_test, 'Predicted Price': y_pred})
# Absolute prediction error per listing
error_df['Error'] = (error_df['Actual Price'] - error_df['Predicted Price']).abs()

# The five listings with the largest absolute error
top_5_errors = error_df.sort_values(by='Error', ascending=False).iloc[:5]
print("\nTop 5 Wrong Predictions:")
print(top_5_errors.loc[:, ['Actual Price', 'Predicted Price', 'Error']])

In [None]:
#This time it is filtered by neighborhood and room type


In [None]:
# Define features and target
# The subset is already restricted to one neighbourhood and one room type,
# so only numeric columns are used as inputs
feature_cols = ['number_of_reviews', 'minimum_nights', 'availability_365']

X = neighbourhood_roomtype_df.loc[:, feature_cols]
y = neighbourhood_roomtype_df.loc[:, 'price']

# 5. Split data
# 80/20 train/test split; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6. Train model
# fit() returns the estimator itself, so construction and fitting can be chained
model = LinearRegression().fit(X_train, y_train)

# 7. Make predictions
y_pred = model.predict(X_test)

In [None]:
# 8. Evaluate model
# Both metrics compare the test-set prices with the model's predictions
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

In [None]:
# Actual vs. predicted prices, drawn with Matplotlib's object-oriented API
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.7, color='teal')
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
# Optional axis limits, currently disabled:
#ax.set_xlim(0, 400)
#ax.set_ylim(0, 300)
ax.set_title("Actual vs Predicted Airbnb Prices")
ax.grid(True)
plt.show()

In [None]:
# 9. Analyze errors
# Copy of the test features with actual / predicted prices attached
error_df = X_test.copy()
error_df['Actual Price'] = y_test
error_df['Predicted Price'] = y_pred
# Absolute prediction error per listing
error_df['Error'] = abs(error_df['Actual Price'] - error_df['Predicted Price'])
# Keep exactly five rows so the output matches the variable name and the
# "Top 5" heading printed below (this previously took 50 rows).
top_5_errors = error_df.sort_values(by='Error', ascending=False).head(5)
print("\nTop 5 Wrong Predictions:")
print(top_5_errors[['Actual Price', 'Predicted Price', 'Error']])