In [5]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


In [6]:
# Read the Airbnb listings table (path is relative to the working directory).
df = pd.read_csv('airbnb_dataset/data.csv')

# TF-IDF over the free-text `description` column: English stop words are
# dropped and at most 3000 terms are kept as features.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
descriptions = df['description']
tfidf_sparse = vectorizer.fit_transform(descriptions)

# Densify the sparse TF-IDF matrix so X is a plain ndarray with one row per
# description. NOTE(review): the dense copy can be large for big datasets.
X = tfidf_sparse.toarray()

In [7]:
# Regression target: the `log_price` column as a standalone NumPy array.
y = df['log_price'].to_numpy(copy=True)

# Ridge regression (alpha=1.0) of the target on the TF-IDF features.
# fit() returns the estimator itself, so construction and fitting can chain.
model = Ridge(alpha=1.0).fit(X, y)
theta = model.coef_  # one fitted coefficient per TF-IDF feature column

# NOTE: the error below is in-sample -- predictions are made on the same X
# the model was fitted on, so it is not an estimate of held-out performance.
predictions = model.predict(X)
mse = mean_squared_error(y_true=y, y_pred=predictions)

In [8]:
# Pair each vocabulary term with its fitted coefficient and rank the pairs
# from the largest coefficient down to the smallest.
feature_names = vectorizer.get_feature_names_out()
word_coefficients = sorted(
    zip(feature_names, theta),
    key=lambda pair: pair[1],
    reverse=True,
)

# Both ends of the ranking. The tail slice keeps the descending order, so the
# "negative" list runs from least negative to most negative.
top_positive = word_coefficients[:10]
top_negative = word_coefficients[-10:]

# Report the fit error and the extreme terms.
print(f"MSE: {mse}")
print("Top positive words:", top_positive)
print("Top negative words:", top_negative)

MSE: 0.25017994937456023
Top positive words: [('inauguration', 3.6514334188375157), ('shoots', 1.8897555181091374), ('bedrooms', 1.5971022543432194), ('filming', 1.436805655417775), ('bowl', 1.4295557202454081), ('viking', 1.4119257924208357), ('villa', 1.4117952287741302), ('corporate', 1.283327807016469), ('estate', 1.254694641454746), ('groups', 1.2525537201719439)]
Top negative words: [('gabriel', -0.8936266968958946), ('crash', -0.9221533146871831), ('walmart', -0.9352318799834811), ('roommates', -0.968153584455593), ('budget', -0.9957940301125133), ('students', -1.0363411037170591), ('hostel', -1.0646183899320234), ('shared', -1.2151414053262464), ('solo', -1.2208406454425849), ('room', -1.3253673945286715)]


In [11]:
# Persist every (word, coefficient) pair as one "word coefficient" line, in
# the order held by `word_coefficients`.
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform default, so a term containing non-ASCII characters could raise
# UnicodeEncodeError or yield a file whose bytes depend on the machine.
with open('airbnb_dataset/tfidf_dict.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{word} {score}\n" for word, score in word_coefficients)