In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


In [5]:
df = pd.read_csv('airbnb_dataset/data.csv')

vectorizer = TfidfVectorizer(max_features=3000, stop_words='english' and not word.isdigit())
X = vectorizer.fit_transform(df['description']).toarray()

In [6]:
y = np.array(df['log_price'])

model = Ridge(alpha=1.0)
model.fit(X, y)
theta = model.coef_

predictions = model.predict(X)
mse = mean_squared_error(y, predictions)

In [7]:
feature_names = vectorizer.get_feature_names_out()
word_coefficients = list(zip(feature_names, theta))
word_coefficients.sort(key=lambda x: x[1], reverse=True)  # Sort by coefficient value

# Print MSE and some of the top coefficients
print(f"MSE: {mse}")
print("Top positive words:", word_coefficients[:10])
print("Top negative words:", word_coefficients[-10:])

MSE: 0.25017994937456023
Top positive words: [('inauguration', 3.6514334188375157), ('shoots', 1.8897555181091374), ('bedrooms', 1.5971022543432194), ('filming', 1.436805655417775), ('bowl', 1.4295557202454081), ('viking', 1.4119257924208357), ('villa', 1.4117952287741302), ('corporate', 1.283327807016469), ('estate', 1.254694641454746), ('groups', 1.2525537201719439)]
Top negative words: [('gabriel', -0.8936266968958946), ('crash', -0.9221533146871831), ('walmart', -0.9352318799834811), ('roommates', -0.968153584455593), ('budget', -0.9957940301125133), ('students', -1.0363411037170591), ('hostel', -1.0646183899320234), ('shared', -1.2151414053262464), ('solo', -1.2208406454425849), ('room', -1.3253673945286715)]


In [8]:
with open('airbnb_dataset/tfidf_dict.txt', 'w') as f:
    for word, score in word_coefficients:
        f.write(f"{word} {score}\n")

In [22]:
vectorizer_1 = TfidfVectorizer(max_features=5000, stop_words='english')
X1 = vectorizer_1.fit_transform(df['name']).toarray()

In [23]:
model1 = Ridge(alpha=1.0)
model1.fit(X1, y)
theta1 = model.coef_

predictions1 = model1.predict(X1)
mse1 = mean_squared_error(y, predictions1)

In [24]:
feature_names1 = vectorizer_1.get_feature_names_out()
word_coefficients1 = list(zip(feature_names1, theta1))
word_coefficients1.sort(key=lambda x: x[1], reverse=True)  # Sort by coefficient value

# Print MSE and some of the top coefficients
print(f"MSE: {mse1}")
print("Top positive words:", word_coefficients1[:10])
print("Top negative words:", word_coefficients1[-10:])

MSE: 0.2817914301175536
Top positive words: [('couldn', 3.6514334188375157), ('inaug', 1.8897555181091374), ('3beds', 1.5971022543432194), ('candlemore', 1.436805655417775), ('52nd', 1.4295557202454081), ('mansion', 1.4119257924208357), ('maple', 1.4117952287741302), ('balconies', 1.283327807016469), ('brown', 1.254694641454746), ('cleaning', 1.2525537201719439)]
Top negative words: [('charme', -0.8936266968958946), ('bat', -0.9221533146871831), ('martial', -0.9352318799834811), ('hollyw', -0.968153584455593), ('69', -0.9957940301125133), ('ladder', -1.0363411037170591), ('contents', -1.0646183899320234), ('iii', -1.2151414053262464), ('jay', -1.2208406454425849), ('hollow', -1.3253673945286715)]


In [26]:
with open('airbnb_dataset/name_dict.txt', 'w') as f:
    for word, score in word_coefficients1:
        f.write(f"{word} {score}\n")