In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import string
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.metrics import mean_squared_error
from sklearn import linear_model

In [2]:
df = pd.read_csv('airbnb_dataset/data.csv')
df.head()

Unnamed: 0,id,log_price,accommodates,bathrooms,description,host_response_rate,name,bedrooms,beds,property_type_Apartment,...,city_Boston,city_Chicago,city_DC,city_LA,city_NYC,city_SF,cleaning_fee_False,cleaning_fee_True,host_has_profile_pic_f,host_has_profile_pic_t
0,6901257,5.010635,3.0,1.0,"Beautiful, sunlit brownstone 1-bedroom in the ...",100.0,Beautiful brownstone 1-bedroom,1.0,1.0,1,...,0,0,0,0,1,0,0,1,0,1
1,6304928,5.129899,7.0,1.0,Enjoy travelling during your stay in Manhattan...,100.0,Superb 3BR Apt Located Near Times Square,3.0,3.0,1,...,0,0,0,0,1,0,0,1,0,1
2,7919400,4.976734,5.0,1.0,The Oasis comes complete with a full backyard ...,100.0,The Garden Oasis,1.0,3.0,1,...,0,0,0,0,1,0,0,1,0,1
3,13418779,6.620073,4.0,1.0,This light-filled home-away-from-home is super...,100.0,Beautiful Flat in the Heart of SF!,2.0,2.0,0,...,0,0,0,0,0,1,0,1,0,1
4,3808709,4.744932,2.0,1.0,"Cool, cozy, and comfortable studio located in ...",100.0,Great studio in midtown DC,0.0,1.0,1,...,0,0,1,0,0,0,0,1,0,1


## Major Preprocessing

In [3]:
descriptions = df['description']
descriptions[:10]

0    Beautiful, sunlit brownstone 1-bedroom in the ...
1    Enjoy travelling during your stay in Manhattan...
2    The Oasis comes complete with a full backyard ...
3    This light-filled home-away-from-home is super...
4    Cool, cozy, and comfortable studio located in ...
5    Beautiful private room overlooking scenic view...
6    Warm and cozy studio with full kitchen and bat...
7    Arguably the best location (and safest) in dow...
8    Garden Studio with private entrance from the s...
9    Quiet community. Close to supermarkets,restaur...
Name: description, dtype: object

### Bag of Words

In [4]:
def get_word_count():

    word_Count = defaultdict(int)
    punctuation = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer() # Initialize a PorterStemmer object for stemming words to their root form

    for d in descriptions:
        # Remove punctuation from the description and convert it to lowercase
        r = ''.join([c for c in d.lower() if not c in punctuation])
        
        # Split the cleaned description into words and filter out stop words
        ws = [w for w in r.split() if w not in stop_words]
        
        for w in ws:
            # Stem each word to its root form
            w = stemmer.stem(w)
            word_Count[w] += 1

    # Save the word count to a file
    with open('airbnb_dataset/word_count.txt', 'w') as f:
        for word, count in word_Count.items():
            f.write(f"{word} {count}\n")

    return word_Count



In [5]:
word_Count = {}
# Try to load from a created file
try:
    with open('airbnb_dataset/word_count.txt', 'r') as f:
        for line in f:
            word, count = line.split()
            word_Count[word] = int(count)
except FileNotFoundError:
    word_Count = get_word_count()

len(word_Count)

76338

In [6]:
counts = [(word_Count[w], w) for w in word_Count]
counts.sort(reverse=True)
words = [x[1] for x in counts[:1500]]

In [7]:
words[:20]

['room',
 'apart',
 'bedroom',
 'walk',
 'locat',
 'kitchen',
 'park',
 'bed',
 'live',
 'privat',
 'restaur',
 'bathroom',
 'access',
 '2',
 'place',
 'block',
 'minut',
 'one',
 'space',
 'neighborhood']

### Sentiment Analysis

In [8]:
wordID = dict(zip(words, range(len(words))))
wordSet = set(words)

In [9]:
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))

def feature(description):
    feat = [0] * len(words)
    if description:
        r = ''.join([c for c in description.lower() if not c in punctuation])
        ws = [w for w in r.split() if w not in stop_words]
        for w in ws: 
            if w in words:
                feat[wordID[w]] += 1
    feat.append(1)
    return feat

In [10]:
X = [feature(d) for d in descriptions]
y = np.array(df['log_price'])

In [11]:
model = linear_model.Ridge(1.0, fit_intercept=False)
model.fit(X, y)
theta = model.coef_

In [12]:
predictions = model.predict(X)
mse = mean_squared_error(predictions, y)
mse

0.32581051625192575

In [13]:
wordSort = list(zip(theta[:-1], words))
wordSort.sort()

In [14]:
sentimentDict = dict(zip(words, theta[:-1]))

In [15]:
sentimentDict['great']

0.02092179457041348

In [16]:
# Save sentimentDict to a file
with open('airbnb_dataset/sentiment_dict.txt', 'w') as f:
        for word, score in sentimentDict.items():
            f.write(f"{word} {score}\n")