### Import packages

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from scipy import sparse
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import ward, dendrogram
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.sparse import hstack

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Loading csv file

In [56]:
# listing csv file
listing_df = pd.read_csv("listings-details.csv")
listing_df = listing_df[:1000]
host_desc = listing_df[['id', 'description', 'amenities', 'neighborhood_overview', 'host_about','host_is_superhost']]

# Concatenate all textual features provided by the owner
host_desc['desc'] = host_desc['description'] + ' ' + host_desc['amenities'] + ' ' + host_desc['neighborhood_overview'] + ' ' + host_desc['host_about']

# Loading csv file
reviews = pd.read_csv("reviews.csv")
reviews['comments'] = reviews['comments'].astype(str)  

# Concatenate customer comments in a row by listing's id
concat_reviews = reviews.groupby('listing_id')['comments'].apply(lambda x: ' '.join(x)).reset_index()
concat_reviews = concat_reviews[:2000]


In [4]:
host_df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_upd

In [5]:
host_desc.head()

Unnamed: 0,id,description,amenities,neighborhood_overview,host_about,host_is_superhost,desc
0,2595,"Beautiful, spacious skylit studio in the heart...","[""Extra pillows and blankets"", ""Baking sheet"",...",Centrally located in the heart of Manhattan ju...,A New Yorker since 2000! My passion is creatin...,f,"Beautiful, spacious skylit studio in the heart..."
1,3831,"Enjoy 500 s.f. top floor in 1899 brownstone, w...","[""Extra pillows and blankets"", ""Luggage dropof...",Just the right mix of urban center and local n...,Laid-back Native New Yorker (formerly bi-coast...,f,"Enjoy 500 s.f. top floor in 1899 brownstone, w..."
2,5121,<b>The space</b><br />HELLO EVERYONE AND THANK...,"[""Kitchen"", ""Long term stays allowed"", ""Wifi"",...",,"I am an artist(painter, filmmaker) and curato...",f,
3,5136,We welcome you to stay in our lovely 2 br dupl...,"[""Kitchen"", ""BBQ grill"", ""Cable TV"", ""Carbon m...",,"Rebecca is an artist/designer, and Henoch is i...",f,
4,5178,Please don’t expect the luxury here just a bas...,"[""Room-darkening shades"", ""Lock on bedroom doo...","Theater district, many restaurants around here.",I used to work for a financial industry but no...,f,Please don’t expect the luxury here just a bas...


In [6]:
concat_reviews.head()

Unnamed: 0,listing_id,comments
0,2595,Notre séjour de trois nuits.\r<br/>Nous avons ...
1,3831,"lisa is a wonderful, kind and thoughtful host...."
2,5121,"Simple place, super nice guy. Great guy with a..."
3,5136,My family had a wonderful stay at Rebecca and ...
4,5178,"MR. Kasai was a grreat host , very helpful and..."


I will first analyze the textual features before diving into other numerical characteristics.

In [57]:
merged_df = host_desc.merge(concat_reviews, left_on='id', right_on='listing_id', how='inner')

# Concatenate all textual features for tfidf calculation
merged_df['text'] = merged_df['desc'] + ' ' + merged_df['comments']
merged_df = merged_df [['id','text','host_is_superhost']]
merged_df.dropna(inplace=True)


In [58]:
merged_df.head()

Unnamed: 0,id,text,host_is_superhost
0,2595,"Beautiful, spacious skylit studio in the heart...",f
1,3831,"Enjoy 500 s.f. top floor in 1899 brownstone, w...",f
4,5178,Please don’t expect the luxury here just a bas...,f
5,5203,"Our best guests are seeking a safe, clean, spa...",f
6,5803,"Beautiful house, gorgeous garden, patio, cozy ...",t


In [7]:
# Text preprocessing
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


In [8]:
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r',:\'!"', '', text)
    else:
        text = ''
    return text

def tokenize_and_stem(text):
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

In [59]:
# Preprocess text
merged_df['text'] = merged_df['text'].apply(preprocess_text)

In [60]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem, stop_words=stop_words)
tfidf_matrix = vectorizer.fit_transform(merged_df['text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names())

# Calculate TF-IDF
tf = tfidf_df.div(tfidf_df.max(axis=1), axis=0)
idf = np.log(tfidf_df.shape[0] / (tfidf_df != 0).sum(axis=0))
tfidf_matrix = tf.multiply(idf.values)
tfidf_df = pd.DataFrame(tfidf_matrix, columns=vectorizer.get_feature_names())


In [61]:
tfidf_df.shape

(558, 59961)

In [66]:
tfidf_df = tfidf_df.drop('id', axis=1)

In [62]:
#numerical features that include host and house characteristics
#exclude highly correlated ones
additional = host_df[['id','bedrooms','host_response_rate', \
             'host_acceptance_rate','host_listings_count','minimum_nights', 'maximum_nights',\
             'availability_365','number_of_reviews_ltm','price','review_scores_rating']]


In [63]:
additional.head()

Unnamed: 0,id,bedrooms,host_response_rate,host_acceptance_rate,host_listings_count,minimum_nights,maximum_nights,availability_365,number_of_reviews_ltm,price,review_scores_rating
0,2595,,80%,17%,8.0,30,1125,338,0,$150.00,4.7
1,3831,1.0,9%,69%,1.0,1,730,194,32,$75.00,4.45
2,5121,1.0,100%,100%,1.0,30,730,365,0,$60.00,4.52
3,5136,2.0,100%,25%,1.0,5,1125,123,1,$275.00,5.0
4,5178,1.0,100%,100%,1.0,2,14,192,33,$68.00,4.21


In [64]:
# Preprocess these additional features to all numerical values
additional.loc[:, 'host_response_rate'] = additional['host_response_rate'].str.rstrip('%').astype(float) / 100
additional.loc[:, 'host_acceptance_rate'] = additional['host_acceptance_rate'].str.rstrip('%').astype(float) / 100
additional.loc[:, 'price'] = additional['price'].str.replace('$', '').str.replace('.', '').str.replace(',', '').astype(float)

In [65]:
additional = pd.merge(merged_df['id'], additional, on='id', how='inner')

In [67]:
# Concatenate the numerical features with tdidf matrix
final_df = pd.concat([additional, tfidf_df], axis=1)

In [68]:
final_df.shape

(558, 59971)

In [69]:
final_df.dropna(inplace = True)

### K-means clustering

In [70]:
# K-means clustering
num_clusters = 2  # Specify the number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(final_df)

# Get cluster labels
cluster_labels = kmeans.labels_

In [71]:
true_labels = host_desc[['id','host_is_superhost']]
true_labels = pd.merge(final_df, true_labels, on='id', how='inner')
true_labels = true_labels['host_is_superhost'].tolist()

In [72]:
comparison_df = pd.DataFrame({
    'ID': final_df['id'],  # Assuming the ID is the DataFrame index
    'True_Label': true_labels,
    'Cluster_Label': cluster_labels
})

In [73]:
comparison_df.head()

Unnamed: 0,ID,True_Label,Cluster_Label
1,3831,f,0
2,5178,f,0
4,5803,t,0
5,6990,f,0
6,7064,f,0


In [74]:
# Confusion matrix

comparison_df.loc[comparison_df['True_Label'] == 't', 'True_Label'] = 1
comparison_df.loc[comparison_df['True_Label'] == 'f', 'True_Label'] = 0

confusion = pd.crosstab(comparison_df['Cluster_Label'], comparison_df['True_Label'])
print(confusion)

True_Label      0   1
Cluster_Label        
0              96  82
1              83  65


In [75]:
# Calculate accuracy
correct_predictions = (comparison_df['True_Label'] == comparison_df['Cluster_Label']).sum()
total_predictions = len(comparison_df)
accuracy = correct_predictions / total_predictions

# Calculate recall
true_positives = ((comparison_df['True_Label'] == 1) & (comparison_df['Cluster_Label'] == 1)).sum()
actual_positives = (comparison_df['True_Label'] == 1).sum()
recall = true_positives / actual_positives

# Calculate precision
true_positives = ((comparison_df['True_Label'] == 1) & (comparison_df['Cluster_Label'] == 1)).sum()
predicted_positives = (comparison_df['Cluster_Label'] == 1).sum()
precision = true_positives / predicted_positives

# Calculate F1-score
precision = true_positives / predicted_positives
recall = true_positives / actual_positives
f1_score = 2 * (precision * recall) / (precision + recall)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1_score:.3f}")

Accuracy: 0.494
Precision: 0.439
Recall: 0.442
F1-score: 0.441


### Trying Naive Bayes on TDIDF matrix

In [98]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


X = final_df
y = true_labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

label_map = {'f': 0, 't': 1}
y_test_mapped = [label_map[label] for label in y_test]
y_pred_mapped = [label_map[label] for label in y_pred]

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test_mapped, y_pred_mapped)
precision = precision_score(y_test_mapped, y_pred_mapped)
recall = recall_score(y_test_mapped, y_pred_mapped)
f1 = f1_score(y_test_mapped, y_pred_mapped)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.6060606060606061
Precision: 0.6129032258064516
Recall: 0.5757575757575758
F1-score: 0.59375


### Logistic Regression

In [104]:
#logistic regression
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(penalty='l1',solver='liblinear')
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)


label_map = {'f': 0, 't': 1}
y_test_mapped = [label_map[label] for label in y_test]
y_pred_mapped = [label_map[label] for label in y_pred]

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test_mapped, y_pred_mapped)
precision = precision_score(y_test_mapped, y_pred_mapped)
recall = recall_score(y_test_mapped, y_pred_mapped)
f1 = f1_score(y_test_mapped, y_pred_mapped)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.696969696969697
Precision: 0.8823529411764706
Recall: 0.45454545454545453
F1-score: 0.6


### Random Forest

In [78]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=500, random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)


label_map = {'f': 0, 't': 1}
y_test_mapped = [label_map[label] for label in y_test]
y_pred_mapped = [label_map[label] for label in y_pred]


accuracy = accuracy_score(y_test_mapped, y_pred_mapped)
precision = precision_score(y_test_mapped, y_pred_mapped)
recall = recall_score(y_test_mapped, y_pred_mapped)
f1 = f1_score(y_test_mapped, y_pred_mapped)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.6060606060606061
Precision: 0.8181818181818182
Recall: 0.2727272727272727
F1-score: 0.4090909090909091


### Word2Vec

In [79]:
from gensim.models import Word2Vec

In [80]:
documents = merged_df['text']
tokenized_docs = [doc.split() for doc in documents]

In [83]:
embedding_size = 100  
word2vec_model = Word2Vec(sentences=tokenized_docs, min_count=3,vector_size = 100, window = 5)

#size: number of dimensions of the embeddings, default is 100.
#window: The maximum distance between a target word and words around the target word, default window is 5.
#min_count: The minimum count of words to consider when training the model; words with occurrence less than this count will be ignored. The default for min_count is 5.
#workers: The number of partitions during training and the default workers is 3.
#sg: The training algorithm, either CBOW(0) or skip gram(1). The default training algorithm is CBOW.



In [84]:
X = []
for doc in tokenized_docs:
    doc_embedding = [word2vec_model.wv[word] for word in doc if word in word2vec_model.wv]
    if doc_embedding:
        doc_avg_embedding = sum(doc_embedding) / len(doc_embedding)  # Averaging word embeddings
        X.append(doc_avg_embedding)
    else:
        X.append([0] * embedding_size)

In [85]:
y = merged_df['host_is_superhost']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Logistic Regression

In [86]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)


y_pred = classifier.predict(X_test)

# Map the label values in y_test and y_pred
label_map = {'f': 0, 't': 1}
y_test_mapped = [label_map[label] for label in y_test]
y_pred_mapped = [label_map[label] for label in y_pred]

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test_mapped, y_pred_mapped)
precision = precision_score(y_test_mapped, y_pred_mapped)
recall = recall_score(y_test_mapped, y_pred_mapped)
f1 = f1_score(y_test_mapped, y_pred_mapped)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.7142857142857143
Precision: 0.6666666666666666
Recall: 0.2702702702702703
F1-score: 0.3846153846153846


### Random Forest

In [88]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)


label_map = {'f': 0, 't': 1}
y_test_mapped = [label_map[label] for label in y_test]
y_pred_mapped = [label_map[label] for label in y_pred]


accuracy = accuracy_score(y_test_mapped, y_pred_mapped)
precision = precision_score(y_test_mapped, y_pred_mapped)
recall = recall_score(y_test_mapped, y_pred_mapped)
f1 = f1_score(y_test_mapped, y_pred_mapped)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.6875
Precision: 0.55
Recall: 0.2972972972972973
F1-score: 0.38596491228070173
