In [253]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [254]:
columns = [
    "cp_strict","cp_moderate","cp_flexible","bt_real_bed","bt_pull_out_sofa","bt_futon","bt_couch","bt_air_bed",
    "rt_shared_room","rt_private_room","rt_entire_home","pt_yurt","pt_treehouse","pt_townhouse","pt_tent","pt_other","pt_loft",
    "pt_house","pt_dorm","pt_condo","pt_chalet","pt_rv","pt_cabin","pt_bungalow","pt_boat","pt_b&b","pt_apartment","location_west_seattle",
    "location_university_district","location_seward_park","location_rainier_vally","location_queen_anne","location_other","location_northgate",
    "location_magnolia","location_lake_city","location_interbay","location_downtown","location_delridge","location_central_area","location_cascade",
    "location_capital_hill","location_beacon_hill","location_ballard","rt_within_hours","rt_few_hours","rt_within_day","rt_few_days",
    "requires_guest_phone_verification","does_not_require_guest_phone_verification","requires_guest_profile_pic","does_not_require_guest_prof_pic",
    "instantly_bookable","not_instantly_bookable","host_does_not_require_license","host_does_have_identity_ver","host_does_not_have_identity_ver",
    "host_does_have_profile_pic","host_does_not_have_profile_pic","host_response_time","host_response_rate",
    "host_acceptance_rate","rating_good_or_bad"
]

# columns = [
#     "cp_strict","cp_moderate","cp_flexible","bt_real_bed","bt_pull_out_sofa","bt_futon","bt_couch","bt_air_bed",
#     "rt_shared_room","rt_private_room","rt_entire_home","pt_yurt","pt_treehouse","pt_townhouse","pt_tent","pt_other","pt_loft",
#     "pt_house","pt_dorm","pt_condo","pt_chalet","pt_rv","pt_cabin","pt_bungalow","pt_boat","pt_b&b","pt_apartment","location_west_seattle",
#     "location_university_district","location_seward_park","location_rainier_vally","location_queen_anne","location_other","location_northgate",
#     "location_magnolia","location_lake_city","location_interbay","location_downtown","location_delridge","location_central_area","location_cascade",
#     "location_capital_hill","location_beacon_hill","location_ballard","rt_within_hours","rt_few_hours","rt_within_day","rt_few_days",
#     "requires_guest_phone_verification","does_not_require_guest_phone_verification","requires_guest_profile_pic","does_not_require_guest_prof_pic",
#     "instantly_bookable","not_instantly_bookable","host_does_not_require_license","host_does_have_identity_ver","host_does_not_have_identity_ver",
#     "host_does_have_profile_pic","host_does_not_have_profile_pic","a_superhost","not_a_superhost","host_response_time","host_response_rate",
#     "host_acceptance_rate","host_is_superhost","host_has_profile_pic","host_identity_verified","neighbourhood_group_cleansed","property_type",
#     "room_type","accommodates","bathrooms","bedrooms","beds","bed_type","price","guests_included","extra_people","minimum_nights","maximum_nights",
#     "availability_30","availability_60","availability_90","availability_365","requires_license","instant_bookable","cancellation_policy",
#     "require_guest_profile_picture","require_guest_phone_verification","rating_good_or_bad"
# ]

target = ["rating_good_or_bad"]

In [255]:
# Load the data
file_path = Path('ml_final_data.csv')
df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()


# convert interest rate to numerical
df['host_response_rate'] = df['host_response_rate'].str.replace('%', '')
df['host_response_rate'] = df['host_response_rate'].astype('float') / 100
df['host_acceptance_rate'] = df['host_acceptance_rate'].str.replace('%', '')
df['host_acceptance_rate'] = df['host_acceptance_rate'].astype('float') / 100

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,cp_strict,cp_moderate,cp_flexible,bt_real_bed,bt_pull_out_sofa,bt_futon,bt_couch,bt_air_bed,rt_shared_room,rt_private_room,...,not_instantly_bookable,host_does_not_require_license,host_does_have_identity_ver,host_does_not_have_identity_ver,host_does_have_profile_pic,host_does_not_have_profile_pic,host_response_time,host_response_rate,host_acceptance_rate,rating_good_or_bad
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,within a few hours,0.96,1.0,good_review
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,within an hour,0.98,1.0,good_review
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,within a few hours,0.67,1.0,good_review
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,within an hour,1.0,1.0,good_review
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,within an hour,1.0,1.0,good_review


In [256]:
# Create our features
X = df.drop("rating_good_or_bad",axis=1)
X = pd.get_dummies(X)


# Create our target
y = df.loc[:,target].copy()

In [257]:
X.describe()

Unnamed: 0,cp_strict,cp_moderate,cp_flexible,bt_real_bed,bt_pull_out_sofa,bt_futon,bt_couch,bt_air_bed,rt_shared_room,rt_private_room,...,host_does_have_identity_ver,host_does_not_have_identity_ver,host_does_have_profile_pic,host_does_not_have_profile_pic,host_response_rate,host_acceptance_rate,host_response_time_a few days or more,host_response_time_within a day,host_response_time_within a few hours,host_response_time_within an hour
count,2655.0,2655.0,2655.0,2655.0,2655.0,2655.0,2655.0,2655.0,2655.0,2655.0,...,2655.0,2655.0,2655.0,2655.0,2655.0,2655.0,2655.0,2655.0,2655.0,2655.0
mean,0.40904,0.355932,0.235028,0.961205,0.013183,0.017326,0.002637,0.00565,0.033522,0.297175,...,0.831638,0.168362,0.99887,0.00113,0.953537,0.999623,0.009416,0.147646,0.290772,0.552166
std,0.491749,0.478885,0.424096,0.193142,0.114078,0.130507,0.051289,0.074966,0.180028,0.4571,...,0.374258,0.374258,0.033602,0.033602,0.108558,0.019407,0.096597,0.354816,0.454204,0.497365
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.98,1.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
75%,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [258]:
# Check the balance of our target values
y["rating_good_or_bad"].value_counts()

good_review    1493
bad_review     1162
Name: rating_good_or_bad, dtype: int64

In [259]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=1)
print("Training:", Counter(y_train["rating_good_or_bad"]))
print("Test:", Counter(y_test["rating_good_or_bad"]))

Training: Counter({'good_review': 1124, 'bad_review': 867})
Test: Counter({'good_review': 369, 'bad_review': 295})


Oversampling

In [260]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler (random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
print(Counter(y_resampled["rating_good_or_bad"]))

Counter({'bad_review': 1124, 'good_review': 1124})


In [261]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [262]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test,y_pred)

0.5869229709246245

In [263]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm,index=["Actual Bad Review", "Actual Good Review"], columns=["Predicted Bad Review", "Predicted Good Review"])
cm_df

Unnamed: 0,Predicted Bad Review,Predicted Good Review
Actual Bad Review,176,119
Actual Good Review,156,213


In [264]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

 bad_review       0.53      0.60      0.58      0.56      0.59      0.35       295
good_review       0.64      0.58      0.60      0.61      0.59      0.34       369

avg / total       0.59      0.59      0.59      0.59      0.59      0.34       664



SMOTE Oversampling

In [265]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
print(Counter(y_resampled["rating_good_or_bad"]))

Counter({'bad_review': 1124, 'good_review': 1124})


In [266]:
# Train the Logistic Regression model using the resampled data
modelS = LogisticRegression(solver='lbfgs', random_state=1)
modelS.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [267]:
# Calculated the balanced accuracy score
# Calculate predictions
y_pred = modelS.predict(X_test)

# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.5977584860594369

In [268]:
# Display the confusion matrix
cm2 = confusion_matrix(y_test, y_pred)
cm2_df =pd.DataFrame(cm2, index=["Actual Bad Review", "Actual Good Review"], columns=["Predicted Bad Review", "Predicted Good Review"])
cm2_df

Unnamed: 0,Predicted Bad Review,Predicted Good Review
Actual Bad Review,172,123
Actual Good Review,143,226


In [269]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

 bad_review       0.55      0.58      0.61      0.56      0.60      0.36       295
good_review       0.65      0.61      0.58      0.63      0.60      0.36       369

avg / total       0.60      0.60      0.60      0.60      0.60      0.36       664



Undersampling

In [270]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
print(Counter(y_resampled["rating_good_or_bad"]))

Counter({'bad_review': 867, 'good_review': 867})


In [271]:
# Train the Logistic Regression model using the resampled data
modelU = LogisticRegression(solver='lbfgs', random_state=1)
modelU.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [272]:
# Calculated the balanced accuracy score
y_pred = modelU.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6065913370998117

In [273]:
# Display the confusion matrix
cm3 = confusion_matrix(y_test, y_pred)
cm3_df =pd.DataFrame(cm3, index=["Actual Bad Review", "Actual Good Review"], columns=["Predicted Bad Review", "Predicted Good Review"])
cm3_df

Unnamed: 0,Predicted Bad Review,Predicted Good Review
Actual Bad Review,194,101
Actual Good Review,164,205


In [274]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

 bad_review       0.54      0.66      0.56      0.59      0.60      0.37       295
good_review       0.67      0.56      0.66      0.61      0.60      0.36       369

avg / total       0.61      0.60      0.61      0.60      0.60      0.36       664



Combo (over and under) sampling

In [275]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X,y)
print(Counter(y_resampled["rating_good_or_bad"]))

Counter({'bad_review': 405, 'good_review': 326})


In [276]:
# Train the Logistic Regression model using the resampled data
mlenn = LogisticRegression(solver='lbfgs',random_state=1)
mlenn.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [277]:
# Calculated the balanced accuracy score
y_pred = mlenn.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5957420421661844

In [278]:
# Display the confusion matrix
cm4 = confusion_matrix(y_test, y_pred)
cm4_df =pd.DataFrame(cm4, index=["Actual Bad Review", "Actual Good Review"], columns=["Predicted Bad Review", "Predicted Good Review"])
cm4_df

Unnamed: 0,Predicted Bad Review,Predicted Good Review
Actual Bad Review,186,109
Actual Good Review,162,207


In [279]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

 bad_review       0.53      0.63      0.56      0.58      0.59      0.36       295
good_review       0.66      0.56      0.63      0.60      0.59      0.35       369

avg / total       0.60      0.59      0.60      0.59      0.59      0.35       664



Random Forest

In [280]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
random_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
random_model.fit(X_train, y_train)
print(Counter(y_train["rating_good_or_bad"]))

Counter({'good_review': 1124, 'bad_review': 867})


In [281]:
# Calculated the balanced accuracy score
y_pred = random_model.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.5848973404988287

In [282]:
# Display the confusion matrix
cm5 = confusion_matrix(y_test, y_pred)
cm5_df =pd.DataFrame(cm5, index=["Actual Bad Review", "Actual Good Review"], columns=["Predicted Bad Review", "Predicted Good Review"])
cm5_df

Unnamed: 0,Predicted Bad Review,Predicted Good Review
Actual Bad Review,182,113
Actual Good Review,165,204


In [283]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

 bad_review       0.52      0.62      0.55      0.57      0.58      0.34       295
good_review       0.64      0.55      0.62      0.59      0.58      0.34       369

avg / total       0.59      0.58      0.59      0.58      0.58      0.34       664



In [284]:
# List the features sorted in descending order by feature importance
importances = sorted(zip(random_model.feature_importances_,X.columns), reverse=True)
for importance in importances:
    print(f'{importance[1]}: {importance[0]*100:.1f}%:')

host_response_rate: 13.2%:
location_other: 3.8%:
cp_moderate: 3.7%:
cp_flexible: 3.5%:
cp_strict: 3.4%:
pt_apartment: 3.4%:
location_capital_hill: 3.2%:
pt_house: 3.2%:
host_does_have_identity_ver: 3.1%:
rt_private_room: 3.1%:
host_does_not_have_identity_ver: 3.0%:
rt_entire_home: 2.9%:
instantly_bookable: 2.8%:
location_central_area: 2.8%:
location_ballard: 2.7%:
not_instantly_bookable: 2.7%:
location_queen_anne: 2.5%:
location_downtown: 2.5%:
location_rainier_vally: 1.9%:
location_west_seattle: 1.9%:
location_beacon_hill: 1.8%:
location_university_district: 1.5%:
host_response_time_within a few hours: 1.5%:
rt_within_hours: 1.5%:
rt_few_hours: 1.5%:
host_response_time_within an hour: 1.4%:
host_response_time_within a day: 1.3%:
rt_within_day: 1.3%:
bt_real_bed: 1.2%:
pt_townhouse: 1.2%:
location_northgate: 1.2%:
does_not_require_guest_phone_verification: 1.1%:
location_delridge: 1.1%:
requires_guest_phone_verification: 1.0%:
location_lake_city: 1.0%:
does_not_require_guest_prof_pic: 

Easy Ensemble 

In [285]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
easy_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
easy_model.fit(X_train, y_train)
print(Counter(y_train["rating_good_or_bad"]))

Counter({'good_review': 1124, 'bad_review': 867})


In [286]:
# Calculated the balanced accuracy score
y_pred = easy_model.predict(X_test)
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test,y_pred)

0.5892930963207936

In [287]:
# Display the confusion matrix
cm6 = confusion_matrix(y_test, y_pred)
cm6_df =pd.DataFrame(cm6, index=["Actual Bad Review", "Actual Good Review"], columns=["Predicted Bad Review", "Predicted Good Review"])
cm6_df

Unnamed: 0,Predicted Bad Review,Predicted Good Review
Actual Bad Review,175,120
Actual Good Review,153,216


In [288]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

 bad_review       0.53      0.59      0.59      0.56      0.59      0.35       295
good_review       0.64      0.59      0.59      0.61      0.59      0.35       369

avg / total       0.59      0.59      0.59      0.59      0.59      0.35       664

