In [None]:
import os

import pandas as pd

# Read the review export from disk and preview its first five rows.
reviews = pd.read_csv(filepath_or_buffer='new_reviews.csv')
reviews.head(n=5)

In [None]:
# Drop the 'Unnamed: 0' column.
reviews = reviews.drop(columns=['Unnamed: 0'])
# Move the column labelled '0' to a column named 'comments'
# (pop removes '0' and hands back its values in one step).
reviews['comments'] = reviews.pop('0')
# Keep only the rows whose 'comments' value is present.
reviews = reviews.dropna(subset=['comments'])
reviews

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
import nltk

# Fetch NLTK's 'stopwords' resource; it is read later through
# nltk.corpus.stopwords.words('english') when the vectorizer is configured.
nltk.download('stopwords')

In [None]:
stemmer = PorterStemmer()


def list_to_str_include_stem(l):
    """Stem every token in ``l`` and glue the stems into one string.

    Each stem is preceded by a single space, so a non-empty result starts
    with a space; an empty ``l`` yields ``''``.
    """
    return ''.join(' ' + stemmer.stem(token) for token in l)

In [None]:
# Remove punctuation and stem every review.
# CountVectorizer's default tokenizer splits each comment into word tokens;
# the tokens are then stemmed and re-joined into a single string.
tokenizer = CountVectorizer().build_tokenizer()
# Series.apply takes no axis argument: the stray positional `1` used to land on
# the deprecated `convert_dtype` parameter (and would be taken as `args` in
# newer pandas), so only the callable is passed here.
reviews['comments'] = reviews['comments'].apply(
    lambda x: list_to_str_include_stem(tokenizer(x))
)
reviews

In [None]:
# Vectorize
# Bag-of-words counts over the stemmed comments. max_df / min_df discard terms
# that occur in more than 50% or in fewer than 5% of the reviews.
# NOTE(review): the stop-word list is unstemmed while the comments are already
# stemmed, so some stop words may no longer match -- confirm this is intended.
stop_words = nltk.corpus.stopwords.words('english')
vectorizer = CountVectorizer(stop_words=stop_words, max_df=0.5, min_df=0.05)
X = vectorizer.fit_transform(reviews['comments'])
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
# The result is kept as a plain list so it can be concatenated with other lists.
if hasattr(vectorizer, 'get_feature_names_out'):
    words_list = vectorizer.get_feature_names_out().tolist()
else:
    words_list = vectorizer.get_feature_names()
print(words_list)
words_array = X.toarray()
print(words_array)
print(len(words_array[0]))

In [None]:
from pandas import DataFrame

# Wrap the dense count matrix in a frame: one row per review,
# one column per vocabulary term.
words_df = DataFrame(data=words_array, columns=words_list)
words_df

In [None]:
# Attach each review's listing_id to its word counts.
# pd.concat(axis=1) aligns on index labels. `reviews` keeps the labels it had
# before rows were dropped, whereas `words_df` is numbered 0..n-1, so the ids
# are renumbered first to pair the two frames row by row (positionally).
listing_ids = reviews['listing_id'].reset_index(drop=True)
result = pd.concat([listing_ids, words_df], axis=1)
# Display only: rows that still contain a missing value are hidden from the
# preview, `result` itself is left untouched.
result.dropna()

In [None]:
# Collapse the per-review rows into one row per listing by averaging
# every remaining column within each listing_id group.
result = result.groupby('listing_id').mean()
result

In [None]:
# Turn the listing_id group index back into an ordinary column.
result = result.reset_index(drop=False)
result

In [None]:
# fillna returns a new frame; assign it back so the zero-filled values are
# actually kept instead of only being displayed.
result = result.fillna(0)
result

In [None]:
# Rename the key column to 'id' so it matches the key used for the merge
# with the listings table.
result = result.rename(columns={'listing_id': 'id'})
result

In [None]:
# Read the listings table and report how many rows it holds.
listings = pd.read_csv(filepath_or_buffer='new_listings1.csv')
len(listings)

In [None]:
# Cast the listing key to float64 before joining.
# NOTE(review): presumably done to match the dtype of result['id'] -- confirm;
# very large integer ids cannot be represented exactly as float64.
listings = listings.astype({'id': 'float64'})
# Inner join: only listings that also appear in `result` are kept.
final = listings.merge(result, on='id')
final

In [None]:
# Show every column name of the merged frame as a plain list.
final.columns.tolist()

In [None]:
# Remove the 'Unnamed: 0' column from the merged frame.
final = final.drop(columns=['Unnamed: 0'])
final

In [None]:
# Explicit column order for the final feature table: the review-score columns
# come first, then 'id' and the other listing columns named below, and finally
# the vocabulary terms held in words_list.
# NOTE(review): these names presumably mirror the columns of new_listings1.csv;
# every entry must exist in `final`, otherwise selecting final[order] raises
# a KeyError -- verify against the data.
order = ['review_scores_rating',
 'review_scores_value',
 'review_scores_checkin',
 'review_scores_accuracy',
 'review_scores_location',
 'review_scores_cleanliness',
 'review_scores_communication',
 'id',
 'price',
 'availability_30',
 'number_of_reviews',
 'first_review',
 'reviews_per_month',
 'host_response_time_a few days or more',
 'host_response_time_unknown',
 'host_response_time_within a day',
 'host_response_time_within a few hours',
 'host_response_time_within an hour',
 'host_response_rate_0%-69%',
 'host_response_rate_70%-79%',
 'host_response_rate_80%-89%',
 'host_response_rate_90%-98%',
 'host_response_rate_99%-100%',
 'host_response_rate_unknown',
 'host_acceptance_rate_0%-69%',
 'host_acceptance_rate_70%-79%',
 'host_acceptance_rate_80%-89%',
 'host_acceptance_rate_90%-96%',
 'host_acceptance_rate_97%-98%',
 'host_acceptance_rate_99%-100%',
 'host_acceptance_rate_unknown',
 'host_is_superhost_f',
 'host_is_superhost_t',
 'calculated_host_listings_count_1',
 'calculated_host_listings_count_10-29',
 'calculated_host_listings_count_2-4',
 'calculated_host_listings_count_5-9',
 'calculated_host_listings_count_>29',
 'host_identity_verified_f',
 'host_identity_verified_t',
 'neighbourhood_cleansed_Dn Laoghaire-Rathdown',
 'neighbourhood_cleansed_Dublin City',
 'neighbourhood_cleansed_Fingal',
 'neighbourhood_cleansed_South Dublin',
 'property_type_Entire condo',
 'property_type_Entire cottage',
 'property_type_Entire guest suite',
 'property_type_Entire guesthouse',
 'property_type_Entire home',
 'property_type_Entire rental unit',
 'property_type_Entire serviced apartment',
 'property_type_Entire townhouse',
 'property_type_Private room in bed and breakfast',
 'property_type_Private room in condo',
 'property_type_Private room in home',
 'property_type_Private room in rental unit',
 'property_type_Private room in townhouse',
 'property_type_Shared room in home',
 'property_type_Shared room in rental unit',
 'property_type_others',
 'room_type_Entire home/apt',
 'room_type_Hotel room',
 'room_type_Private room',
 'room_type_Shared room',
 'accommodates_1',
 'accommodates_2',
 'accommodates_3',
 'accommodates_4',
 'accommodates_5',
 'accommodates_>5',
 'bathrooms_text_1 bath',
 'bathrooms_text_1 private bath',
 'bathrooms_text_1 shared bath',
 'bathrooms_text_1.5 baths',
 'bathrooms_text_1.5 shared baths',
 'bathrooms_text_2 baths',
 'bathrooms_text_2 shared baths',
 'bathrooms_text_2.5 baths',
 'bathrooms_text_3 baths',
 'bathrooms_text_others',
 'bedrooms_1',
 'bedrooms_2',
 'bedrooms_3',
 'bedrooms_4',
 'bedrooms_5',
 'bedrooms_>5',
 'bedrooms_unknown',
 'beds_1',
 'beds_2',
 'beds_3',
 'beds_4',
 'beds_5',
 'beds_6',
 'beds_>6',
 'beds_unknown',
 'minimum_nights_1',
 'minimum_nights_2',
 'minimum_nights_3',
 'minimum_nights_4',
 'minimum_nights_5',
 'minimum_nights_6',
 'minimum_nights_7-33',
 'minimum_nights_>33',
 'instant_bookable_f',
 'instant_bookable_t',
 'amenities_Hot water kettle',
 'amenities_Outdoor furniture',
 'amenities_Dining table',
 'amenities_Indoor fireplace',
 'amenities_Breakfast',
 'amenities_Central heating',
 'amenities_Cleaning products',
 'amenities_Shower gel',
 'amenities_Lock on bedroom door',
 'amenities_Dishwasher',
 'amenities_Freezer',
 'amenities_Free street parking',
 'amenities_Bathtub',
 'amenities_Coffee maker',
 'amenities_Conditioner',
 'amenities_Body soap',
 'amenities_Toaster',
 'amenities_Lockbox',
 'amenities_Room-darkening shades',
 'amenities_Outdoor dining area',
 'amenities_Wine glasses',
 'amenities_Extra pillows and blankets',
 'amenities_Luggage dropoff allowed',
 'amenities_TV with standard cable',
 'amenities_Cable TV',
 'amenities_Private patio or balcony',
 'amenities_Stove',
 'amenities_Laundromat nearby',
 'amenities_Drying rack for clothing',
 'amenities_Backyard',
 'amenities_Host greets you',
 'amenities_Paid parking off premises',
 'amenities_Security cameras on property',
 'amenities_Private entrance',
 'amenities_Dedicated workspace',
 'amenities_Patio or balcony',
 'amenities_Elevator'] + words_list
# Arrange the columns as listed in `order`, then discard the 'id' key.
final = final[order].drop(columns=['id'])
final.columns.tolist()

In [None]:
# Write the feature table to disk.
# An unconditional os.remove raises FileNotFoundError when the file does not
# exist yet (e.g. on the very first run), so the stale copy is deleted only if
# it is actually present; to_csv then (re)creates the file.
# The frame's index is written as the first CSV column, as before.
if os.path.exists('final_features.csv'):
    os.remove('final_features.csv')
final.to_csv('final_features.csv')