In [141]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from pathlib import Path
from collections import Counter



In [142]:
columns = [
   "rating_good_or_bad","accept_rate_below_avg","accept_rate_above_avg","response_rate_below_avg","response_rate_above_avg","below_avg_num_of_reviews",
   "above_avg_num_of_reviews","below_avg_num_of_accom","above_avg_num_of_accom","below_avg_num_of_bedrooms","above_avg_num_of_bedrooms","below_avg_num_of_beds",
   "above_avg_num_of_beds","below_avg_num_of_baths","above_avg_num_of_baths","not_a_superhost","a_superhost","host_does_not_have_profile_pic",
   "host_does_have_profile_pic","host_does_not_have_identity_ver","host_does_have_identity_ver","not_instantly_bookable","instantly_bookable",
   "does_not_require_guest_prof_pic","requires_guest_profile_pic","does_not_require_guest_phone_verification","requires_guest_phone_verification",
   "rt_few_days","rt_within_day","rt_few_hours","rt_within_hours","location_ballard","location_beacon_hill","location_capital_hill",
   "location_cascade","location_central_area","location_delridge","location_downtown","location_interbay","location_lake_city","location_magnolia",
   "location_northgate","location_other","location_queen_anne","location_rainier_vally","location_seward_park","location_university_district",
   "location_west_seattle","pt_apartment","pt_b&b","pt_boat","pt_bungalow","pt_cabin","pt_rv","pt_chalet","pt_condo","pt_dorm","pt_house","pt_loft",
   "pt_other","pt_tent","pt_townhouse","pt_treehouse","pt_yurt","rt_entire_home","rt_private_room","rt_shared_room","bt_air_bed","bt_couch",
   "bt_futon","bt_pull_out_sofa","bt_real_bed","cp_flexible","cp_moderate","cp_strict"
]

# columns = [
# "id","host_id","booking_id","name","neighbourhood_group","city","state","zipcode","latitude","longitude","property_type","room_type",
# "accommodates","bathrooms","bedrooms","beds","bed_type","instant_bookable","cancellation_policy","require_guest_profile_picture","require_guest_phone_verification",
# "number_of_reviews","first_review","last_review","review_scores_rating","review_scores_accuracy","review_scores_cleanliness","review_scores_checkin","review_scores_communication",
# "review_scores_location","review_scores_value","reviews_per_month","host_name","host_since","host_listings_count","host_location","host_response_time","host_response_rate",
# "host_acceptance_rate","host_is_superhost","host_has_profile_pic","host_identity_verified","rating_good_or_bad","num_of_reivews_gt_avg","num_of_accom_gt_avg",
# "num_of_bedrooms_gt_avg","num_of_beds_gt_avg","num_of_baths_gt_avg","response_rate_gt_avg","accept_rate_gt_avg","accept_rate_below_avg","accept_rate_above_avg",
# "response_rate_below_avg","response_rate_above_avg","below_avg_num_of_reviews","above_avg_num_of_reviews","below_avg_num_of_accom","above_avg_num_of_accom",
# "below_avg_num_of_bedrooms","above_avg_num_of_bedrooms","below_avg_num_of_beds","above_avg_num_of_beds","below_avg_num_of_baths","above_avg_num_of_baths",
# "not_a_superhost","a_superhost","host_does_not_have_profile_pic","host_does_have_profile_pic","host_does_not_have_identity_ver","host_does_have_identity_ver",
# "not_instantly_bookable","instantly_bookable","does_not_require_guest_prof_pic","requires_guest_profile_pic","does_not_require_guest_phone_verification",
# "requires_guest_phone_verification","rt_few_days","rt_within_day","rt_few_hours","rt_within_hours","location_ballard","location_beacon_hill","location_capital_hill",
# "location_cascade","location_central_area","location_delridge","location_downtown","location_interbay","location_lake_city","location_magnolia","location_northgate",
# "location_other","location_queen_anne","location_rainier_vally","location_seward_park","location_university_district","location_west_seattle","pt_apartment",
# "pt_b&b","pt_boat","pt_bungalow","pt_cabin","pt_rv","pt_chalet","pt_condo","pt_dorm","pt_house","pt_loft","pt_other","pt_tent","pt_townhouse","pt_treehouse",
# "pt_yurt","rt_entire_home","rt_private_room","rt_shared_room","bt_air_bed","bt_couch","bt_futon","bt_pull_out_sofa","bt_real_bed","cp_flexible","cp_moderate","cp_strict"
# ]


In [143]:
# Load the data
file_path = Path('ml_final_data.csv')
df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()


# convert interest rate to numerical
# df['host_response_rate'] = df['host_response_rate'].str.replace('%', '')
# df['host_response_rate'] = df['host_response_rate'].astype('float') / 100
# df['host_acceptance_rate'] = df['host_acceptance_rate'].str.replace('%', '')
# df['host_acceptance_rate'] = df['host_acceptance_rate'].astype('float') / 100

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,rating_good_or_bad,accept_rate_below_avg,accept_rate_above_avg,response_rate_below_avg,response_rate_above_avg,below_avg_num_of_reviews,above_avg_num_of_reviews,below_avg_num_of_accom,above_avg_num_of_accom,below_avg_num_of_bedrooms,...,rt_private_room,rt_shared_room,bt_air_bed,bt_couch,bt_futon,bt_pull_out_sofa,bt_real_bed,cp_flexible,cp_moderate,cp_strict
0,good_review,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,good_review,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,good_review,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,bad_review,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,good_review,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [144]:
conditions = [
    (df['rating_good_or_bad'] == 'bad_review'),
    (df['rating_good_or_bad'] >= 'good_review')]

# create a list of the values we want to assign for each condition
values = [0, 1,]

# create a new column and use np.select to assign values to it using our lists as arguments
df['score_cutoff'] = np.select(conditions, values)

df = df.drop(["rating_good_or_bad"], axis=1)

# display updated DataFrame
df.head()

Unnamed: 0,accept_rate_below_avg,accept_rate_above_avg,response_rate_below_avg,response_rate_above_avg,below_avg_num_of_reviews,above_avg_num_of_reviews,below_avg_num_of_accom,above_avg_num_of_accom,below_avg_num_of_bedrooms,above_avg_num_of_bedrooms,...,rt_shared_room,bt_air_bed,bt_couch,bt_futon,bt_pull_out_sofa,bt_real_bed,cp_flexible,cp_moderate,cp_strict,score_cutoff
0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
1,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
2,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
3,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
4,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1


In [145]:
#Check review scores
review_scores = df['score_cutoff'].value_counts()
review_scores

1    2462
0     189
Name: score_cutoff, dtype: int64

In [146]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf

In [147]:
#Try out a neural network
# Split our preprocessed data into our features and target arrays(training and testing)
y = df["score_cutoff"].values
X = df.drop(["score_cutoff"], axis = 1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

print(X)
#print(y)

[[0. 1. 0. ... 0. 0. 1.]
 [0. 1. 0. ... 0. 0. 1.]
 [0. 1. 0. ... 0. 0. 1.]
 ...
 [0. 1. 0. ... 1. 0. 0.]
 [0. 1. 0. ... 1. 0. 0.]
 [0. 1. 0. ... 1. 0. 0.]]


In [148]:
# Create a StandardScaler instance.  Must scale after trian and test has been established.
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [149]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 7
hidden_nodes_layer2 = 9
hidden_nodes_layer3 = 9
hidden_nodes_layer4 = 5
hidden_nodes_layer5 = 7
hidden_nodes_layer6 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="tanh")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="tanh"))



# third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="tanh"))


# forth hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="tanh"))

# fifth hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer5, activation="tanh"))

# 6th hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer6, activation="tanh"))



# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_46 (Dense)            (None, 7)                 525       
                                                                 
 dense_47 (Dense)            (None, 9)                 72        
                                                                 
 dense_48 (Dense)            (None, 9)                 90        
                                                                 
 dense_49 (Dense)            (None, 5)                 50        
                                                                 
 dense_50 (Dense)            (None, 7)                 42        
                                                                 
 dense_51 (Dense)            (None, 5)                 40        
                                                                 
 dense_52 (Dense)            (None, 1)                

In [150]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [151]:
# Train the model
model_fit = nn.fit(X_train_scaled,y_train,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [152]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

21/21 - 0s - loss: 0.3142 - accuracy: 0.8989 - 122ms/epoch - 6ms/step
Loss: 0.314196914434433, Accuracy: 0.8989441990852356


In [153]:
nn.save("model")
nn.save("model2.h5")
df.to_csv('input_test.csv', index= False)

INFO:tensorflow:Assets written to: model/assets


In [154]:
predictions= nn.predict(x=X_test, batch_size=100, verbose=0)

In [155]:
for i in predictions:
    print(i)

[0.9826272]
[0.923408]
[0.98323274]
[0.98408526]
[0.97999847]
[0.9814517]
[0.9841668]
[0.9834918]
[0.9841709]
[0.9807275]
[0.98048234]
[0.9828599]
[0.9831507]
[0.9808882]
[0.9796637]
[0.98349786]
[0.9820031]
[0.89310175]
[0.98204386]
[0.9836436]
[0.9806873]
[0.98326945]
[0.9842224]
[0.97650534]
[0.98064286]
[0.9798344]
[0.9803288]
[0.983953]
[0.97673947]
[0.98331624]
[0.9801063]
[0.9826261]
[0.97879213]
[0.9803833]
[0.97830606]
[0.98163956]
[0.9792345]
[0.9839055]
[0.94254494]
[0.98312]
[0.969061]
[0.9821326]
[0.96862584]
[0.97586226]
[0.9785726]
[0.9830523]
[0.96775675]
[0.95917934]
[0.9818829]
[0.9828939]
[0.98244274]
[0.9832658]
[0.9815876]
[0.9822831]
[0.98289186]
[0.9834255]
[0.98075116]
[0.98315954]
[0.9546074]
[0.9811369]
[0.98376304]
[0.97286135]
[0.98297554]
[0.97943676]
[0.94778323]
[0.98321533]
[0.98307705]
[0.9832158]
[0.9552157]
[0.9829406]
[0.9834497]
[0.9793574]
[0.9840774]
[0.9786835]
[0.9761433]
[0.9762109]
[0.98290944]
[0.9687697]
[0.9636018]
[0.9805534]
[0.9499044]
[

In [156]:
rounded_predictions = np.argmax(predictions, axis=-1)

In [157]:
for i in rounded_predictions:
    print(i)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [158]:
loaded_model = tf.keras.models.load_model('model')
# Check its architecture
loaded_model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_46 (Dense)            (None, 7)                 525       
                                                                 
 dense_47 (Dense)            (None, 9)                 72        
                                                                 
 dense_48 (Dense)            (None, 9)                 90        
                                                                 
 dense_49 (Dense)            (None, 5)                 50        
                                                                 
 dense_50 (Dense)            (None, 7)                 42        
                                                                 
 dense_51 (Dense)            (None, 5)                 40        
                                                                 
 dense_52 (Dense)            (None, 1)                

In [159]:
# Load the data
file_path = Path('input_test.csv')

df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()

df = df.drop(["rating_good_or_bad"], axis=1)

ans = nn.predict(df)
print(ans)

KeyError: "['rating_good_or_bad'] not in index"