In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gzip

In [3]:
csv_file_path = './airbnb_dataset/data.csv'
# Read the CSV into the DataFrame that later cells operate on.
df = pd.read_csv(filepath_or_buffer=csv_file_path)
# Preview the first five rows as this cell's output.
df.head(n=5)

Unnamed: 0,id,log_price,accommodates,bathrooms,description,host_response_rate,name,bedrooms,beds,property_type_Apartment,...,city_Boston,city_Chicago,city_DC,city_LA,city_NYC,city_SF,cleaning_fee_False,cleaning_fee_True,host_has_profile_pic_f,host_has_profile_pic_t
0,6901257,5.010635,3.0,1.0,"Beautiful, sunlit brownstone 1-bedroom in the ...",100.0,Beautiful brownstone 1-bedroom,1.0,1.0,1,...,0,0,0,0,1,0,0,1,0,1
1,6304928,5.129899,7.0,1.0,Enjoy travelling during your stay in Manhattan...,100.0,Superb 3BR Apt Located Near Times Square,3.0,3.0,1,...,0,0,0,0,1,0,0,1,0,1
2,7919400,4.976734,5.0,1.0,The Oasis comes complete with a full backyard ...,100.0,The Garden Oasis,1.0,3.0,1,...,0,0,0,0,1,0,0,1,0,1
3,13418779,6.620073,4.0,1.0,This light-filled home-away-from-home is super...,100.0,Beautiful Flat in the Heart of SF!,2.0,2.0,0,...,0,0,0,0,0,1,0,1,0,1
4,3808709,4.744932,2.0,1.0,"Cool, cozy, and comfortable studio located in ...",100.0,Great studio in midtown DC,0.0,1.0,1,...,0,0,1,0,0,0,0,1,0,1


In [5]:
# Load sentiment_dict (word -> numeric weight) from a previously created file.
# Every non-blank line must hold exactly two whitespace-separated fields:
# the word and its weight. Malformed lines still raise ValueError so that a
# corrupted lexicon is noticed instead of being silently ignored.
sentiment_dict = {}
with open('airbnb_dataset/sentiment_dict.txt', 'r') as f:
    for line in f:
        fields = line.split()
        # Skip blank / whitespace-only lines (e.g. an empty line at the end of
        # the file); unpacking an empty list below would otherwise crash.
        if not fields:
            continue
        word, theta = fields
        sentiment_dict[word] = float(theta)

In [6]:
import string
punctuation = set(string.punctuation)


def sentiment(d):
    """Return the summed sentiment weight of the words in text ``d``.

    The text is lower-cased, every character found in ``punctuation`` is
    removed (not replaced by a space), and the remainder is split on
    whitespace. Each resulting word contributes its weight from
    ``sentiment_dict``; words missing from the dictionary contribute 0.
    """
    cleaned = ''.join(ch for ch in d.lower() if ch not in punctuation)
    return sum(sentiment_dict.get(token, 0) for token in cleaned.split())

In [7]:
# Calculate each description's sentiment score
def _description_score(text):
    """Return the sentiment score of one description value (0 when it is missing)."""
    return sentiment(text) if pd.notnull(text) else 0


def calculate_sentiment(row):
    """Return the sentiment score for a DataFrame row.

    Reads ``row['description']``; a null description scores 0. Kept as a
    row-oriented entry point so existing ``df.apply(..., axis=1)`` callers
    continue to work.
    """
    return _description_score(row['description'])


# Map over the single 'description' column rather than applying row-wise:
# a row-wise apply materialises every full row as a Series only to read one
# field, which is needlessly slow on a wide frame. The resulting values are
# the same.
df['des_sentiment_analysis'] = df['description'].map(_description_score)

In [8]:
# Min-Max Normalization: rescale each listed column to the [0, 1] range.
cols_to_normalize = [
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'des_sentiment_analysis',
    'host_response_rate',
]

# Compute the per-column minimum and range once, then rescale in place.
features_to_scale = df[cols_to_normalize]
feature_min = features_to_scale.min()
feature_range = features_to_scale.max() - feature_min
df[cols_to_normalize] = (features_to_scale - feature_min) / feature_range

# Show the frame's shape and a preview of the rescaled columns.
df.shape, df[cols_to_normalize].head()

((73923, 68),
    accommodates  bathrooms  bedrooms      beds  des_sentiment_analysis  \
 0      0.133333      0.125       0.1  0.055556                0.431269   
 1      0.400000      0.125       0.3  0.166667                0.472832   
 2      0.266667      0.125       0.1  0.166667                0.496588   
 3      0.200000      0.125       0.2  0.111111                0.475399   
 4      0.066667      0.125       0.0  0.055556                0.448757   
 
    host_response_rate  
 0                 1.0  
 1                 1.0  
 2                 1.0  
 3                 1.0  
 4                 1.0  )

In [10]:
# Standardization: rescale each listed column with scikit-learn's StandardScaler.
# NOTE(review): these same columns were already min-max scaled in place by the
# previous cell, and this cell overwrites them again - confirm which scaling
# is actually intended for downstream use.
from sklearn.preprocessing import StandardScaler

columns_to_standardize = [
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'des_sentiment_analysis',
    'host_response_rate',
]

# Fit on the selected columns, then write the transformed values back in place.
scaler = StandardScaler()
scaler.fit(df[columns_to_standardize])
df[columns_to_standardize] = scaler.transform(df[columns_to_standardize])

# Show the frame's shape and a preview of the standardized columns.
df.shape, df[columns_to_standardize].head()

((73923, 68),
    accommodates  bathrooms  bedrooms      beds  des_sentiment_analysis  \
 0     -0.072621  -0.404046 -0.312048 -0.566461               -0.231021   
 1      1.783653  -0.404046  2.034955  1.027816                0.242287   
 2      0.855516  -0.404046 -0.312048  1.027816                0.512815   
 3      0.391448  -0.404046  0.861454  0.230678                0.271522   
 4     -0.536689  -0.404046 -1.485549 -0.566461               -0.031875   
 
    host_response_rate  
 0            0.296013  
 1            0.296013  
 2            0.296013  
 3            0.296013  
 4            0.296013  )