<a href="https://colab.research.google.com/github/BognarAndras/Girls_day_2023/blob/main/Airbnb_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install streamlit -q

In [2]:

%%writefile app.py
import pandas as pd
import streamlit as st
from datetime import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from itertools import compress
from datetime import datetime


# Seed the per-session feature selection on the first run only; on later runs
# the value already stored in st.session_state is kept.
if 'features' not in st.session_state:

  def get_features():
    """Return the default feature columns (all seven selectable ones)."""
    return [
      "accommodates",
      "host_is_superhost",
      "minimum_nights",
      "room_type",
      "host_identity_verified",
      "bedrooms",
      "property_type",
    ]

  st.session_state.features = get_features()

@st.cache_data()
def get_data(url=None):
  """Download an Inside Airbnb listings file and return it as a DataFrame.

  The function is wrapped in ``st.cache_data`` so repeated calls during
  script reruns reuse the downloaded frame instead of fetching it again.

  Args:
    url: Location of the listings CSV to read. When omitted, the Rio de
      Janeiro snapshot from 2023-03-28 is used.

  Returns:
    pandas.DataFrame with the raw, unfiltered listings.
  """
  if url is None:
    url = "http://data.insideairbnb.com/brazil/rj/rio-de-janeiro/2023-03-28/data/listings.csv.gz"
  return pd.read_csv(url)

# Raw listings used by the models below.
df = get_data()

# Model column -> checkbox label, in the order the checkboxes are displayed.
FEATURE_LABELS = {
  "accommodates": "Number of guests",
  "host_is_superhost": "Superhost",
  "minimum_nights": "Number of minimum nights",
  "room_type": "House or room",
  "host_identity_verified": "Verified host",
  "bedrooms": "Number of bedrooms",
  "property_type": "Apartment or shared unit",
}

with st.sidebar.form("my_form"):

  # Created through `st` (not `st.sidebar`) so the widget belongs to this
  # form; calls made through `st.sidebar` ignore the active `with` container.
  # TODO: the selection is not used yet - get_data() is called without
  # arguments, so the same data set is loaded whatever is chosen here.
  selected_city = st.radio("Which country Airbnb data to use?", options=["Rio de Janeiro", "Barcelona", "Budapest"])

  st.header('select features')
  checked = [st.checkbox(label, value=True) for label in FEATURE_LABELS.values()]

  submitted = st.form_submit_button("Update features")

  if submitted:
    chosen = list(compress(FEATURE_LABELS, checked))
    if chosen:
      st.session_state.features = chosen
    else:
      # A model cannot be fitted on zero columns, so keep the old selection.
      st.warning("Select at least one feature; keeping the previous selection.")

st.header('Create your model')

@st.cache_data()
def clean_data(df):
  """Filter the raw listings and encode them for the price regression.

  Keeps only the four supported property types, listings for 2-6 guests with
  fewer than 6 bedrooms, and prices in the range 50 < price <= 1100. Rows
  whose price is missing or cannot be parsed are dropped instead of raising.

  Args:
    df: Raw listings with at least the columns ``property_type``,
      ``accommodates``, ``bedrooms``, ``price``, ``room_type``,
      ``host_identity_verified``, ``host_is_superhost`` and
      ``minimum_nights``. The frame is not modified.

  Returns:
    A new DataFrame with ``price_clean`` (price rounded to an integer) and
    the seven feature columns. Non-numeric columns are replaced by integer
    codes assigned in order of first appearance; pandas' ``factorize`` marks
    missing values with -1.
  """
  property_types = ["Entire rental unit", "Entire serviced apartment",
                    "Private room in rental unit", "Private room in serviced apartment"]
  model_columns = ["price_clean", "accommodates", "bedrooms", "property_type", "room_type",
                   "host_identity_verified", "host_is_superhost", "minimum_nights"]
  numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

  feature_df = df[df['property_type'].isin(property_types)]
  feature_df = feature_df[(feature_df['accommodates'] > 1) & (feature_df['accommodates'] < 7)]
  # Copy so the column assignments below work on an independent frame rather
  # than on a slice of the caller's data.
  feature_df = feature_df[feature_df['bedrooms'] < 6].copy()

  # "$1,234.00" -> 1234.0; anything missing or unparsable becomes NaN and is
  # removed by the range filter, because NaN fails both comparisons.
  price_text = feature_df['price'].astype(str).str.strip('$').str.replace(',', '', regex=False)
  feature_df['price_clean'] = pd.to_numeric(price_text, errors='coerce').round()
  in_range = (feature_df['price_clean'] > 50) & (feature_df['price_clean'] <= 1100)
  feature_df = feature_df.loc[in_range, model_columns].copy()
  feature_df['price_clean'] = feature_df['price_clean'].astype('int64')

  # Hosts with no verification flag are treated as not verified.
  feature_df['host_identity_verified'] = feature_df['host_identity_verified'].fillna('f')

  for col in feature_df.select_dtypes(exclude=numerics).columns:
    feature_df[col] = feature_df[col].factorize()[0]
  return feature_df
# Replace the raw frame with its filtered, integer-encoded version; the Lasso
# section further down fetches the raw listings again through get_data().
df = clean_data(df)


def predict_prices(df):
  """Fit a linear regression on the selected features and score it.

  The feature columns are read from ``st.session_state.features``; the
  target is ``price_clean``. 80% of the rows are used for training and the
  remaining 20% for evaluation, with a fixed random seed.

  Args:
    df: Cleaned listings containing ``price_clean`` and every column named
      in ``st.session_state.features``.

  Returns:
    Tuple ``(y_test, ypred, rmse)``: the held-out target as a single-column
    DataFrame, the model's predictions for those rows, and the root mean
    squared error rounded to one decimal.
  """
  x = df.loc[:, st.session_state.features]
  y = df.loc[:, ['price_clean']]
  X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=416, train_size=.8)

  ols = linear_model.LinearRegression()
  ols.fit(X_train, y_train)
  ypred = ols.predict(X_test)

  # RMSE is computed as sqrt(MSE) because the `squared=False` keyword was
  # deprecated in scikit-learn 1.4 and removed in 1.6.
  rmse = round(float(np.sqrt(mean_squared_error(y_test, ypred))), 1)
  return y_test, ypred, rmse
# Train and evaluate the linear model, then show its hold-out error.
y_test, ypred, rmse = predict_prices(df)

ols_error_message = "Average prediction error is {} dollars.".format(rmse)
st.subheader(ols_error_message)


def get_plot(y_test, ypred):
  """Plot predicted against actual prices with a y = x reference line.

  Args:
    y_test: Actual prices of the held-out listings.
    ypred: Predicted prices for the same listings, in the same order.

  Returns:
    The matplotlib Figure. Points are drawn at (predicted, actual); the red
    line plots the actual prices against themselves, so a perfect prediction
    would fall on it.
  """
  fig, ax = plt.subplots(1, 1, figsize=(10, 4))
  ax.scatter(ypred, y_test, s=5, color="blue", label="Predicted prices")
  ax.plot(y_test, y_test, lw=0.8, color="red", label="Actual prices")
  ax.legend()
  return fig
fig = get_plot(y_test, ypred)

st.pyplot(fig)
# Streamlit reruns this script on every interaction; close the figure once it
# has been rendered so pyplot does not keep one more open figure per rerun.
plt.close(fig)

@st.cache_data()
def get_lasso_data(data):
  """Filter the raw listings and encode the wider Lasso feature set.

  Applies the same row filters as the basic model (four property types, 2-6
  guests, fewer than 6 bedrooms, 50 < price <= 1100) and adds the
  neighbourhood, the host's tenure in days and the instant-booking flag.
  Rows whose price is missing or cannot be parsed are dropped instead of
  raising.

  Args:
    data: Raw listings with at least the columns ``property_type``,
      ``accommodates``, ``bedrooms``, ``price``, ``calendar_last_scraped``,
      ``host_since``, ``host_identity_verified``, ``room_type``,
      ``host_is_superhost``, ``minimum_nights``, ``neighbourhood_cleansed``
      and ``instant_bookable``. Dates are expected as ``YYYY-MM-DD``. The
      frame is not modified.

  Returns:
    A new DataFrame with ``price_clean`` and ten feature columns.
    ``host_since`` holds the number of days between the host's start date
    and the scrape date. Non-numeric columns are replaced by integer codes
    assigned in order of first appearance; pandas' ``factorize`` marks
    missing values with -1.
  """
  property_types = ["Entire rental unit", "Entire serviced apartment",
                    "Private room in rental unit", "Private room in serviced apartment"]
  model_columns = ["price_clean", "accommodates", "bedrooms", "property_type", "room_type",
                   "host_identity_verified", "host_is_superhost", "minimum_nights",
                   "neighbourhood_cleansed", "host_since", "instant_bookable"]
  numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

  lasso_df = data[data['property_type'].isin(property_types)]
  lasso_df = lasso_df[(lasso_df['accommodates'] > 1) & (lasso_df['accommodates'] < 7)]
  # Copy so the column assignments below work on an independent frame rather
  # than on a slice of the caller's data.
  lasso_df = lasso_df[lasso_df['bedrooms'] < 6].copy()

  # "$1,234.00" -> 1234.0; anything missing or unparsable becomes NaN and is
  # removed by the range filter, because NaN fails both comparisons.
  price_text = lasso_df['price'].astype(str).str.strip('$').str.replace(',', '', regex=False)
  lasso_df['price_clean'] = pd.to_numeric(price_text, errors='coerce').round()
  in_range = (lasso_df['price_clean'] > 50) & (lasso_df['price_clean'] <= 1100)
  lasso_df = lasso_df[in_range].copy()
  lasso_df['price_clean'] = lasso_df['price_clean'].astype('int64')

  # Host tenure in days at scrape time. Hosts without a start date are
  # assigned 2010-03-26; a missing scrape date yields NaN rather than an error.
  scraped_on = pd.to_datetime(lasso_df['calendar_last_scraped'], format='%Y-%m-%d')
  host_start = pd.to_datetime(lasso_df['host_since'].fillna('2010-03-26'), format='%Y-%m-%d')
  lasso_df['host_since'] = (scraped_on - host_start).dt.days

  # Hosts with no verification flag are treated as not verified.
  lasso_df['host_identity_verified'] = lasso_df['host_identity_verified'].fillna('f')
  lasso_df = lasso_df[model_columns].copy()

  # NOTE(review): codes follow the order of first appearance in the data, so
  # any integer codes hard-coded elsewhere presumably depend on this exact
  # data snapshot - verify before changing the source file or the encoding.
  for col in lasso_df.select_dtypes(exclude=numerics).columns:
    lasso_df[col] = lasso_df[col].factorize()[0]
  return lasso_df

st.header('Learn from the machine')

# Build the Lasso data set from the raw listings (get_data() is cached) rather
# than from the cleaned `df`, which no longer has the extra columns it needs.
lasso_df = get_lasso_data(get_data())

@st.cache_data()
def get_lasso_plot(lasso_df):
  """Fit a Lasso regression on every feature and chart its coefficients.

  Uses ``price_clean`` as the target and all remaining columns as features,
  with an 80/20 train/test split and a fixed random seed.

  Args:
    lasso_df: Encoded listings containing ``price_clean`` plus numeric
      feature columns.

  Returns:
    Tuple ``(text, fig, rmse, reg, columns)``: a sentence counting the
    non-zero and zero coefficients, a horizontal bar chart of the sorted
    coefficients, the hold-out RMSE rounded to one decimal, the fitted
    model, and the feature column names in the order used for fitting.
  """
  # drop() keeps the frame's own column order, so the feature order is the
  # same on every run (a set difference has no defined order).
  x = lasso_df.drop(columns=['price_clean'])
  y = lasso_df["price_clean"]
  X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=416, train_size=.8)

  reg = linear_model.Lasso()
  reg.fit(X_train, y_train)

  coef = pd.Series(reg.coef_, index=X_train.columns)

  text = "Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variable"

  # Draw on the axes created here instead of pyplot's implicit current axes;
  # the size is passed explicitly, so no global rcParams are changed.
  fig, ax = plt.subplots(1, 1, figsize=(10, 4))
  coef.sort_values().plot(kind="barh", ax=ax)
  ax.set_title("Feature importance using Lasso Model")

  ypred = reg.predict(X_test)
  # RMSE is computed as sqrt(MSE) because the `squared=False` keyword was
  # deprecated in scikit-learn 1.4 and removed in 1.6.
  rmse = round(float(np.sqrt(mean_squared_error(y_test, ypred))), 1)
  return text, fig, rmse, reg, list(X_train.columns)

text, fig2, rmse2, reg, fit_cols = get_lasso_plot(lasso_df)

st.subheader(text)
st.subheader("Average prediction error is {} dollars.".format(rmse2))
st.pyplot(fig2)
# Streamlit reruns this script on every interaction; close the figure once it
# has been rendered so pyplot does not keep one more open figure per rerun.
plt.close(fig2)


st.header('Evaluate results')

with st.form("prediction_form"):
  # NOTE(review): the integer codes below are hard-coded and presumably mirror
  # the codes the training data received when its text columns were
  # factorized - verify them against the encoded data.
  yes_no_codes = {"Yes": 1, "No": 0}
  property_type_options = ["Entire rental unit", "Entire serviced apartment",
                           "Private room in rental unit", "Private room in serviced apartment"]

  # Lower bounds keep negative counts out of the model input.
  accommodates = st.number_input('Number of guests', min_value=1, value=1)
  host_is_superhost = yes_no_codes[st.selectbox("Is the host a superhost?", ["Yes", "No"])]
  minimum_nights = st.number_input('Minimum nights to stay', min_value=1, value=1)
  room_type_choice = st.selectbox("Is it a room or an entire home?", ["Home/Apartment", "Room"])
  room_type = 1 if room_type_choice == "Home/Apartment" else 0
  host_identity_verified = yes_no_codes[st.selectbox("Was host identity verified?", ["Yes", "No"])]
  bedrooms = st.number_input('Number of bedrooms', min_value=0, value=1)
  # The code of a property type is its position in the option list (0-3).
  property_type = property_type_options.index(
    st.selectbox("What is the property type?", property_type_options))

  predicted = st.form_submit_button("Add parameters")

  if predicted:
    # Features the form does not ask about are set to fixed values.
    data = pd.DataFrame({
      'accommodates': accommodates,
      'bedrooms': bedrooms,
      'property_type': property_type,
      'room_type': room_type,
      'host_identity_verified': host_identity_verified,
      'host_is_superhost': host_is_superhost,
      'minimum_nights': minimum_nights,
      'neighbourhood_cleansed': 14,
      'host_since': 2833,
      'instant_bookable': 0,
    }, index=[0])

    # Select columns in the order the model was fitted on, then round the
    # single prediction to a whole number for display.
    predicted_price = round(reg.predict(data[fit_cols])[0])
    st.subheader("Predicted price is {} dollars.".format(predicted_price))



# reg.predict(pd.DataFrame({'instant_bookable':0,'host_identity_verified':1,'accommodates':4,'host_is_superhost':0,'room_type':0,'property_type':0,'minimum_nights':2,'neighbourhood_cleansed':14,'host_since':2833,'bedrooms':1.0}, index=[0]))

Overwriting app.py


In [3]:
!streamlit run app.py & npx localtunnel --port 8501

[##................] - fetchMetadata: sill resolveWithNewModule ms@2.1.2 checki[0m[K
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://104.199.208.52:8501[0m
[0m
[K[?25hnpx: installed 22 in 5.74s
your url is: https://shaggy-lamps-search-104-199-208-52.loca.lt
[34m  Stopping...[0m
^C
