In [299]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from pathlib import Path
from collections import Counter



In [300]:
columns = [
   "rating_good_or_bad","accept_rate_below_avg","accept_rate_above_avg","response_rate_below_avg","response_rate_above_avg","below_avg_num_of_reviews",
   "above_avg_num_of_reviews","below_avg_num_of_accom","above_avg_num_of_accom","below_avg_num_of_bedrooms","above_avg_num_of_bedrooms","below_avg_num_of_beds",
   "above_avg_num_of_beds","below_avg_num_of_baths","above_avg_num_of_baths","not_a_superhost","a_superhost","host_does_not_have_profile_pic",
   "host_does_have_profile_pic","host_does_not_have_identity_ver","host_does_have_identity_ver","not_instantly_bookable","instantly_bookable",
   "does_not_require_guest_prof_pic","requires_guest_profile_pic","does_not_require_guest_phone_verification","requires_guest_phone_verification",
   "rt_few_days","rt_within_day","rt_few_hours","rt_within_hours","location_ballard","location_beacon_hill","location_capital_hill",
   "location_cascade","location_central_area","location_delridge","location_downtown","location_interbay","location_lake_city","location_magnolia",
   "location_northgate","location_other","location_queen_anne","location_rainier_vally","location_seward_park","location_university_district",
   "location_west_seattle","pt_apartment","pt_b&b","pt_boat","pt_bungalow","pt_cabin","pt_rv","pt_chalet","pt_condo","pt_dorm","pt_house","pt_loft",
   "pt_other","pt_tent","pt_townhouse","pt_treehouse","pt_yurt","rt_entire_home","rt_private_room","rt_shared_room","bt_air_bed","bt_couch",
   "bt_futon","bt_pull_out_sofa","bt_real_bed","cp_flexible","cp_moderate","cp_strict"
]

# columns = [
# "id","host_id","booking_id","name","neighbourhood_group","city","state","zipcode","latitude","longitude","property_type","room_type",
# "accommodates","bathrooms","bedrooms","beds","bed_type","instant_bookable","cancellation_policy","require_guest_profile_picture","require_guest_phone_verification",
# "number_of_reviews","first_review","last_review","review_scores_rating","review_scores_accuracy","review_scores_cleanliness","review_scores_checkin","review_scores_communication",
# "review_scores_location","review_scores_value","reviews_per_month","host_name","host_since","host_listings_count","host_location","host_response_time","host_response_rate",
# "host_acceptance_rate","host_is_superhost","host_has_profile_pic","host_identity_verified","rating_good_or_bad","num_of_reivews_gt_avg","num_of_accom_gt_avg",
# "num_of_bedrooms_gt_avg","num_of_beds_gt_avg","num_of_baths_gt_avg","response_rate_gt_avg","accept_rate_gt_avg","accept_rate_below_avg","accept_rate_above_avg",
# "response_rate_below_avg","response_rate_above_avg","below_avg_num_of_reviews","above_avg_num_of_reviews","below_avg_num_of_accom","above_avg_num_of_accom",
# "below_avg_num_of_bedrooms","above_avg_num_of_bedrooms","below_avg_num_of_beds","above_avg_num_of_beds","below_avg_num_of_baths","above_avg_num_of_baths",
# "not_a_superhost","a_superhost","host_does_not_have_profile_pic","host_does_have_profile_pic","host_does_not_have_identity_ver","host_does_have_identity_ver",
# "not_instantly_bookable","instantly_bookable","does_not_require_guest_prof_pic","requires_guest_profile_pic","does_not_require_guest_phone_verification",
# "requires_guest_phone_verification","rt_few_days","rt_within_day","rt_few_hours","rt_within_hours","location_ballard","location_beacon_hill","location_capital_hill",
# "location_cascade","location_central_area","location_delridge","location_downtown","location_interbay","location_lake_city","location_magnolia","location_northgate",
# "location_other","location_queen_anne","location_rainier_vally","location_seward_park","location_university_district","location_west_seattle","pt_apartment",
# "pt_b&b","pt_boat","pt_bungalow","pt_cabin","pt_rv","pt_chalet","pt_condo","pt_dorm","pt_house","pt_loft","pt_other","pt_tent","pt_townhouse","pt_treehouse",
# "pt_yurt","rt_entire_home","rt_private_room","rt_shared_room","bt_air_bed","bt_couch","bt_futon","bt_pull_out_sofa","bt_real_bed","cp_flexible","cp_moderate","cp_strict"
# ]


In [301]:
# Load the data
file_path = Path('ml_final_data.csv')
df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()


# convert interest rate to numerical
# df['host_response_rate'] = df['host_response_rate'].str.replace('%', '')
# df['host_response_rate'] = df['host_response_rate'].astype('float') / 100
# df['host_acceptance_rate'] = df['host_acceptance_rate'].str.replace('%', '')
# df['host_acceptance_rate'] = df['host_acceptance_rate'].astype('float') / 100

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,rating_good_or_bad,accept_rate_below_avg,accept_rate_above_avg,response_rate_below_avg,response_rate_above_avg,below_avg_num_of_reviews,above_avg_num_of_reviews,below_avg_num_of_accom,above_avg_num_of_accom,below_avg_num_of_bedrooms,...,rt_private_room,rt_shared_room,bt_air_bed,bt_couch,bt_futon,bt_pull_out_sofa,bt_real_bed,cp_flexible,cp_moderate,cp_strict
0,good_review,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,good_review,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,good_review,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,bad_review,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,good_review,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [302]:
conditions = [
    (df['rating_good_or_bad'] == 'bad_review'),
    (df['rating_good_or_bad'] >= 'good_review')]

# create a list of the values we want to assign for each condition
values = [0, 1,]

# create a new column and use np.select to assign values to it using our lists as arguments
df['score_cutoff'] = np.select(conditions, values)

df = df.drop(["rating_good_or_bad"], axis=1)

# display updated DataFrame
df.head()

Unnamed: 0,accept_rate_below_avg,accept_rate_above_avg,response_rate_below_avg,response_rate_above_avg,below_avg_num_of_reviews,above_avg_num_of_reviews,below_avg_num_of_accom,above_avg_num_of_accom,below_avg_num_of_bedrooms,above_avg_num_of_bedrooms,...,rt_shared_room,bt_air_bed,bt_couch,bt_futon,bt_pull_out_sofa,bt_real_bed,cp_flexible,cp_moderate,cp_strict,score_cutoff
0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
1,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
2,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
3,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
4,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1


In [303]:
#Check review scores
review_scores = df['score_cutoff'].value_counts()
review_scores

1    2462
0     189
Name: score_cutoff, dtype: int64

In [304]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf

In [305]:
#Try out a neural network
# Split our preprocessed data into our features and target arrays(training and testing)
y = df["score_cutoff"].values
X = df.drop(["score_cutoff"], axis = 1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

print(X)
#print(y)

[[0. 1. 0. ... 0. 0. 1.]
 [0. 1. 0. ... 0. 0. 1.]
 [0. 1. 0. ... 0. 0. 1.]
 ...
 [0. 1. 0. ... 1. 0. 0.]
 [0. 1. 0. ... 1. 0. 0.]
 [0. 1. 0. ... 1. 0. 0.]]


In [306]:
# Create a StandardScaler instance.  Must scale after trian and test has been established.
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [307]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 3
hidden_nodes_layer2 = 5
hidden_nodes_layer3 = 1
hidden_nodes_layer4 = 1


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="tanh")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="tanh"))



# third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="tanh"))


# forth hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="tanh"))




# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_108 (Dense)           (None, 3)                 225       
                                                                 
 dense_109 (Dense)           (None, 5)                 20        
                                                                 
 dense_110 (Dense)           (None, 1)                 6         
                                                                 
 dense_111 (Dense)           (None, 1)                 2         
                                                                 
 dense_112 (Dense)           (None, 1)                 2         
                                                                 
Total params: 255
Trainable params: 255
Non-trainable params: 0
_________________________________________________________________


In [308]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [309]:
# Train the model
model_fit = nn.fit(X_train_scaled,y_train,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [310]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

21/21 - 0s - loss: 0.2666 - accuracy: 0.9246 - 112ms/epoch - 5ms/step
Loss: 0.26662537455558777, Accuracy: 0.924585223197937


In [311]:
df2 = pd.read_csv("test.csv")

predictions = nn.predict(df2)

rounded_predictions = np.round(predictions)



In [313]:
# Create a new DataFrame with the predictions and set its index to be the same as the input DataFrame
predictions_df = pd.DataFrame(rounded_predictions, index=df2.index, columns=["Prediction"])

# Merge the predictions_df back to the original df
merged_df = pd.concat([df2, predictions_df], axis=1)

merged_df.to_csv('results.csv', index= False)

merged_df


Unnamed: 0,accept_rate_below_avg,accept_rate_above_avg,response_rate_below_avg,response_rate_above_avg,below_avg_num_of_reviews,above_avg_num_of_reviews,below_avg_num_of_accom,above_avg_num_of_accom,below_avg_num_of_bedrooms,above_avg_num_of_bedrooms,...,rt_shared_room,bt_air_bed,bt_couch,bt_futon,bt_pull_out_sofa,bt_real_bed,cp_flexible,cp_moderate,cp_strict,Prediction
0,0,1,0,1,0,1,1,0,1,0,...,1,0,0,1,0,0,0,0,1,1.0
1,0,1,0,1,1,0,1,0,1,0,...,1,0,0,1,0,0,0,0,1,1.0
2,0,1,0,1,1,0,1,0,1,0,...,0,0,0,0,0,1,0,0,1,1.0
3,0,1,0,1,1,0,1,0,1,0,...,1,0,0,1,0,0,0,0,1,1.0
4,0,1,0,1,1,0,1,0,1,0,...,0,0,0,1,0,0,0,0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2646,0,1,0,1,1,0,0,1,1,0,...,0,0,0,0,0,1,1,0,0,1.0
2647,0,1,0,1,1,0,0,1,0,1,...,0,0,0,0,0,1,1,0,0,1.0
2648,0,1,0,1,1,0,1,0,1,0,...,0,0,0,0,0,1,1,0,0,1.0
2649,0,1,0,1,1,0,0,1,0,1,...,0,0,0,0,0,1,1,0,0,1.0
