## Preprocessing

### Loading Data

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as mtick
from matplotlib.pylab import rcParams
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, mean_squared_log_error, roc_curve, auc
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import plot_confusion_matrix
from xgboost import XGBClassifier
import numpy as np

# Allow up to 1000 rows to be rendered when a frame or series is displayed.
pd.set_option('display.max_rows', 1000)
# Use matplotlib's 'fivethirtyeight' style sheet for every figure in the notebook.
plt.style.use('fivethirtyeight')

### Full_df: Dataframe Containing All Available Columns


In [2]:
#Data obtained from http://insideairbnb.com/san-diego
#Reading the listings export from the notebook's working directory into the raw frame.
full_df = pd.read_csv('listings.csv')

In [3]:
#Listing every raw column with its non-null count and dtype.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14188 entries, 0 to 14187
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            14188 non-null  int64  
 1   listing_url                                   14188 non-null  object 
 2   scrape_id                                     14188 non-null  int64  
 3   last_scraped                                  14188 non-null  object 
 4   source                                        14188 non-null  object 
 5   name                                          14188 non-null  object 
 6   description                                   14060 non-null  object 
 7   neighborhood_overview                         9306 non-null   object 
 8   picture_url                                   14188 non-null  object 
 9   host_id                                       14188 non-null 

In [4]:
#Columns carried forward from the raw export: price, review scores, unit size,
#availability, and host attributes.
base_columns = [
    'price',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'accommodates',
    'bedrooms',
    'beds',
    'instant_bookable',
    'property_type',
    'room_type',
    'amenities',
    'availability_365',
    'availability_30',
    'availability_90',
    'host_id',
    'calculated_host_listings_count',
    'host_response_time',
    'host_response_rate',
    'host_is_superhost',
]
base_df = full_df[base_columns]


In [5]:
#Working on an independent copy: base_df is a column slice of full_df, so assigning new
#columns to an alias of it is a chained assignment (SettingWithCopyWarning), and any
#in-place drop on df would also change base_df.
df = base_df.copy()

### Exploratory Data Analysis
- Investigating the various features of my dataset to determine which features to use in my model and analysis, and to what extent.
#### Fixing Price
- Price is currently a string. I need to strip out the extra characters and convert the datatype to Float so that I can better utilize the data



In [6]:
#Peeking at the raw price values to see how they are formatted.
df['price'].head(5)


0    $225.00
1    $113.00
2    $258.00
3    $336.00
4    $333.00
Name: price, dtype: object

In [7]:
#Removing the '$' and ',' characters from each price (replacing them with an empty
#string) and converting the cleaned column to float.
#The vectorized .str accessor leaves missing prices as NaN, whereas a per-element
#lambda calling x.replace would raise on a non-string entry.
df['price'] = df['price'].str.replace(r'[$,]', '', regex=True).astype(float)
df['price'].head(2)

0    225.0
1    113.0
Name: price, dtype: float64

### New Feature: Host Listings_5-
- Creating a new feature that classifies whether a host has "many" listings or not

In [8]:
#Getting key summary statistics (count, mean, std, quartiles, max) for the number of listings per host.
df['calculated_host_listings_count'].describe()

count    14188.000000
mean        17.266422
std         35.920780
min          1.000000
25%          1.000000
50%          3.000000
75%         13.000000
max        213.000000
Name: calculated_host_listings_count, dtype: float64

Analysis:
- The middle half of the records (25th to 75th percentile) belong to hosts with between 1 and 13 listings.
- The median is 3.
- One has 213 listings

In [9]:
#Counting how many records belong to hosts with only 1 or 2 listings versus all other records.
low_listings = df['calculated_host_listings_count'].le(2)

low_listings.value_counts()

False    7608
True     6580
Name: calculated_host_listings_count, dtype: int64

Since so many hosts have just 1 or 2 rental units, everything is skewed toward the lower end. However, I am setting this classifier at 5 and under, as people with multiple listings will be more likely to use OPC's service.


In [10]:
#Flagging units that accommodate 5 or more guests and checking how the data is split.
#NOTE(review): the surrounding section describes a host-listings classifier ("5 and under"),
#but this cell builds a guest-capacity flag from 'accommodates'; no host-listings flag is
#created in the cells shown here - confirm which feature was intended.
df['capacity_5+'] = df['accommodates'].ge(5)
df['capacity_5+'].value_counts()

False    8221
True     5967
Name: capacity_5+, dtype: int64

 This seems to be a good classifier as the split ends up being close to 50%.
### New Feature Bedrooms_2+

In [11]:
#Summary statistics for the number of bedrooms per unit.
df['bedrooms'].describe()

count    12915.000000
mean         1.994580
std          1.235842
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         23.000000
Name: bedrooms, dtype: float64

#### Analysis:

- Mean and Median are both roughly 2 Bedrooms, so I will set the classifier at 2 and above.

In [12]:
#Flagging units with two or more bedrooms.
#Records with a missing bedroom count compare as False, so they land in the "under 2" group.
df['bedrooms_2+'] = df['bedrooms'].ge(2)
df['bedrooms_2+'].value_counts()

False    7121
True     7067
Name: bedrooms_2+, dtype: int64

### New Feature: Booking Rates
- Seeing the number of available days is good, but in some cases it may be more helpful to see this as a percentage.

In [13]:
#Expressing availability as the fraction of each window (30 or 90 days) that is still open.
#Plain column arithmetic is vectorized, so no per-row lambda is needed.
df['availability_30_rate'] = df['availability_30'] / 30
df['availability_90_rate'] = df['availability_90'] / 90


In [14]:
#Turning each availability rate into the share of the period that the unit is booked
#(the complement of the availability rate), using vectorized arithmetic instead of apply.
df['booked_rate_30'] = 1 - df['availability_30_rate']
df['booked_rate_90'] = 1 - df['availability_90_rate']

In [15]:
#Viewing the availability and booked rates side by side to check that each pair sums to 1.
avaibility = df[['availability_30_rate', 'booked_rate_30','availability_90_rate','booked_rate_90']]
avaibility.head()

Unnamed: 0,availability_30_rate,booked_rate_30,availability_90_rate,booked_rate_90
0,0.0,1.0,0.066667,0.933333
1,0.666667,0.333333,0.6,0.4
2,0.0,1.0,0.0,1.0
3,0.533333,0.466667,0.488889,0.511111
4,0.2,0.8,0.466667,0.533333


### New Feature: Bookings Above Average
- I have determined that price is not a great metric for measuring rentals because the prices are relative, and no two units are exactly the same.
- However, the main thing that hosts want is to maximize their bookings. So I want to capture and analyze how much availability they have, so that I have a metric to compare across the board.

In [16]:
#Flagging units whose 90-day booked rate is at or above the 0.512 cut-off.
#NOTE(review): 0.512 is presumably the average of booked_rate_90 - confirm, or compute it
#from the column so the threshold cannot drift from the data.
df["bookings_above_avg"] = df['booked_rate_90'].ge(.512)
df['bookings_above_avg'].value_counts()

False    8670
True     5518
Name: bookings_above_avg, dtype: int64

#### New Feature: Host Response Rate 100
- Feature that determines whether a host has a perfect response rate.
- SuperHost status requires a minimum response rate of 90%.


In [17]:
#Creating a classifier that captures whether a host has a perfect response rate or not.
#The '%' sign is removed outright (the earlier version swapped it for a space and relied on
#float parsing to ignore the trailing blank), and regex=False makes the literal match explicit.
df['host_response_rate'] = df['host_response_rate'].str.replace('%', '', regex=False).astype('float')
#Hosts with a missing response rate compare as False here.
df['host_response_100'] = df['host_response_rate'] == 100.0
df['host_response_100'].value_counts()

True     9916
False    4272
Name: host_response_100, dtype: int64

### Fixing Host is Superhost & Instant Bookable
- Features are currently strings instead of bools.

In [18]:
#Building boolean flags from the 't' / 'f' strings in the raw data.
#Re-running this cell would compare the already-boolean instant_bookable column to 't'
#and turn every value False, so it should be run only once per kernel session.
df['superhost'] = df['host_is_superhost'].eq('t')
df['instant_bookable'] = df['instant_bookable'].eq('t')

In [19]:
#Making sure that both the True and False classes are present for the superhost flag.
df['superhost'].value_counts()


False    8782
True     5406
Name: superhost, dtype: int64

In [20]:
#Making sure that both the True and False classes are present for the instant-bookable flag.
df['instant_bookable'].value_counts()

False    7306
True     6882
Name: instant_bookable, dtype: int64

#### Target Feature: Elite Units
- This is my target feature. It classifies whether a unit is in our target 4.9 - 5.0 overall rating range or not.


#### Dealing with Nulls

In [21]:
#Counting how many records don't have an overall review score rating.
df['review_scores_rating'].isna().sum()

1665

There are 1665 null records that need to be dealt with. If I drop them, I will lose about 12% of my data (1665 of 14188 records).

In [22]:
#Setting aside the records without an overall rating so they can be inspected before dropping.
missing_rating = df['review_scores_rating'].isna()
nulls = df[missing_rating]

In [23]:
#Confirming that the subset size matches the null count above.
len(nulls)

1665

In [24]:
#Looking at a few of the unrated records.
nulls.head(5)

Unnamed: 0,price,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,accommodates,bedrooms,...,host_is_superhost,capacity_5+,bedrooms_2+,availability_30_rate,availability_90_rate,booked_rate_30,booked_rate_90,bookings_above_avg,host_response_100,superhost
104,307.0,,,,,,,,6,2.0,...,f,True,True,1.0,0.988889,0.0,0.011111,False,False,False
118,900.0,,,,,,,,10,5.0,...,f,True,True,0.0,0.0,1.0,1.0,True,False,False
180,150.0,,,,,,,,6,1.0,...,t,True,False,0.0,0.022222,1.0,0.977778,True,True,True
183,235.0,,,,,,,,6,1.0,...,t,True,False,0.0,0.0,1.0,1.0,True,True,True
274,404.0,,,,,,,,8,3.0,...,f,True,True,0.0,0.0,1.0,1.0,True,True,False


In [25]:
#Dropping the records that have no overall rating, since the target is derived from it.
#The original row labels are kept (the index is not reset).
df.dropna(subset=['review_scores_rating'], inplace=True)

In [26]:
#Number of records remaining after the drop.
len(df)

12523

##### 12523 Records are left after dropping null values
##### Creating Elite Unit Classifier
- I have decided to classify "5 star" units as ones that have a 4.9 or higher overall rating.
- 4.9 is still an incredibly high score, and is above thresholds for success (4.8 rating, etc), so it is well worth capturing units with a 4.9 rating as high performers as well.

In [27]:
#Building the target: a unit is "elite" when its overall rating is 4.9 or higher.
#Then checking how many units are in each category.
df['elite'] = df['review_scores_rating'].ge(4.9)
df['elite'].value_counts()

False    7385
True     5138
Name: elite, dtype: int64

#### 41% of my dataset are Elite Units
Out of 12523 Airbnb rental units, 41% (5138) are elite units, while 59% (7385) are not.

### Room Type

In [28]:
#Turning room type into a binary feature: entire home/apartment versus every other room type.
df['entire_home'] = df['room_type'].eq('Entire home/apt')
df['entire_home'].value_counts()

True     10503
False     2020
Name: entire_home, dtype: int64

In [29]:
#Dropping room_type since the entire_home classifier now stands in its place.
df.drop(columns=['room_type'], inplace=True)

### Host Response Time

In [30]:
#Checking how many records fall under each response speed (missing values are not counted here).
df['host_response_time'].value_counts()

within an hour        9676
within a few hours    1237
within a day           637
a few days or more     149
Name: host_response_time, dtype: int64

In [31]:
#Most hosts respond "within an hour", so the response time is reduced to a binary flag.
#Hosts with a missing response time compare as False and are grouped with the slower responders.
df['response_within_hour'] = df['host_response_time'].eq('within an hour')
df['response_within_hour'].value_counts()

True     9676
False    2847
Name: response_within_hour, dtype: int64

In [32]:
#The binary flag replaces the original response-time text column.
df.drop(columns=['host_response_time'], inplace=True)

#### Creating Review Metric Classifier Columns
- These columns will capture the number of 5 star reviews left for each review metric.
- Just like with my target classifier (5-Star), I am counting 4.9s in with the 5.0s.


In [33]:
#Flagging each review sub-score with the same criterion as the target (4.9 - 5.0 scores).
#A missing sub-score compares as False.
for metric in ('accuracy', 'cleanliness', 'checkin', 'location', 'value', 'communication'):
    df['{0}_5'.format(metric)] = df['review_scores_{0}'.format(metric)] >= 4.9


In [34]:
#Printing the number of units that are Elite in each review category.
for category in ('Accuracy', 'Cleanliness', 'Checkin', 'Location', 'Value', 'Communication'):
    flag_column = category.lower() + '_5'
    #Summing a boolean column counts its True values.
    print("Number of Elite {0} Units:".format(category), int(df[flag_column].sum()))

Number of Elite Accuracy Units: 6324
Number of Elite Cleanliness Units: 5425
Number of Elite Checkin Units: 8486
Number of Elite Location Units: 7117
Number of Elite Value Units: 3345
Number of Elite Communication Units: 8058


There are significantly fewer units that have Elite Value. I am going to do a value count of that classifier to take a closer look.

In [35]:
#Taking a closer look at the split of the Elite Value flag.
df['value_5'].value_counts()

False    9178
True     3345
Name: value_5, dtype: int64

### New Feature: Price Above Median

In [36]:
#Checking the mean, standard deviation, median and quartiles of price.
df['price'].describe()

count     12523.000000
mean        334.963347
std        1192.884956
min          10.000000
25%         115.000000
50%         181.000000
75%         318.000000
max      100000.000000
Name: price, dtype: float64

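It is difficult to analyze price because it is relative. That said, I will create a classifier to determine whether a unit is above or below a fixed price threshold of 280 (a mean of 279, rounded up). Note that this figure does not match the summary above, where the mean is about 335 and the median is 181, so the threshold matches neither the current mean nor the median named in the section heading.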

In [37]:
#Flagging units priced at 280 or more per night.
#NOTE(review): 280 matches neither the mean nor the median reported by describe() - confirm the intended cut-off.
df['price_280+'] = df['price'].ge(280)

In [38]:
#Checking how the units split around the price threshold.
df['price_280+'].value_counts()

False    8756
True     3767
Name: price_280+, dtype: int64

### Creating Analysis_df

In [39]:
#Copying my dataframe as 'analysis_df' so I can easily pull my df back up with all classifiers in place.
#.copy() makes this an independent snapshot: later changes to df do not affect it.

analysis_df = df.copy()

## Preparing for Modeling

In [40]:
#Checking what my dataframe currently looks like (columns, non-null counts, dtypes).

analysis_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12523 entries, 0 to 14187
Data columns (total 40 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   price                           12523 non-null  float64
 1   review_scores_rating            12523 non-null  float64
 2   review_scores_accuracy          12502 non-null  float64
 3   review_scores_cleanliness       12502 non-null  float64
 4   review_scores_checkin           12500 non-null  float64
 5   review_scores_communication     12502 non-null  float64
 6   review_scores_location          12500 non-null  float64
 7   review_scores_value             12500 non-null  float64
 8   accommodates                    12523 non-null  int64  
 9   bedrooms                        11366 non-null  float64
 10  beds                            12392 non-null  float64
 11  instant_bookable                12523 non-null  bool   
 12  property_type                   

## One Hot Encoding

In [41]:
#Selecting the features to one hot encode (this includes the target, 'elite').
#NOTE(review): 'calculated_host_listings_count' is a numeric count, so encoding it here creates
#one indicator column per distinct value - confirm this is intended rather than a binned or
#binary host-listings feature.
need_to_encode = df[['price_280+', 'elite', 'accuracy_5', 'cleanliness_5', 'checkin_5', 'location_5', 'value_5',
                     'communication_5', 'entire_home', 'bedrooms_2+',
                     'bookings_above_avg', 'instant_bookable', 'capacity_5+', 'calculated_host_listings_count', 'superhost', 'host_response_100', 'response_within_hour']]

#Fitting the encoder and transforming in a single pass, then densifying the sparse result.
ohe = OneHotEncoder()
ohe_1 = ohe.fit_transform(need_to_encode).toarray()

#Adding labels. get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2,
#so the replacement is preferred whenever the installed version provides it.
if hasattr(ohe, 'get_feature_names_out'):
    encoded_columns = ohe.get_feature_names_out(need_to_encode.columns)
else:
    encoded_columns = ohe.get_feature_names(need_to_encode.columns)

#The encoded frame gets a fresh 0..n-1 index, not the row labels of df.
ohe_df = pd.DataFrame(ohe_1, columns=encoded_columns)
ohe_df.head(5)

Unnamed: 0,price_280+_False,price_280+_True,elite_False,elite_True,accuracy_5_False,accuracy_5_True,cleanliness_5_False,cleanliness_5_True,checkin_5_False,checkin_5_True,...,calculated_host_listings_count_97,calculated_host_listings_count_131,calculated_host_listings_count_146,calculated_host_listings_count_213,superhost_False,superhost_True,host_response_100_False,host_response_100_True,response_within_hour_False,response_within_hour_True
0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [42]:
#Creating 'cleaned_df' as a copy of 'ohe_df' so that I have a saved version of the df up to this point.
cleaned_df = ohe_df.copy()


### Dropping One Value for Categoricals

In [43]:
#Every boolean feature was expanded into a *_False / *_True pair; only the *_True column is kept.
#One level (97) of the encoded host listings count is dropped as well to prevent redundancy.
boolean_features = [
    'elite', 'accuracy_5', 'cleanliness_5', 'checkin_5', 'location_5',
    'value_5', 'communication_5', 'bedrooms_2+',
    'bookings_above_avg', 'instant_bookable', 'capacity_5+',
    'entire_home', 'price_280+',
    'superhost', 'host_response_100', 'response_within_hour',
]
redundant_columns = ['{0}_False'.format(feature) for feature in boolean_features]
redundant_columns.append('calculated_host_listings_count_97')
cleaned_df.drop(columns=redundant_columns, inplace=True)


### Dealing With Class imbalance
- Solution
  - Always use class weight parameter in Decision TreeClassifier
  - Always stratify Train Test Split
  - Add SMOTE to Training Sets.

In [44]:
#Checking that my target was properly encoded: the counts should match the 'elite' split from earlier.
cleaned_df['elite_True'].value_counts()

0.0    7385
1.0    5138
Name: elite_True, dtype: int64

### Train Test Split
 - Creating separate Training and Test Groups for modeling.

In [47]:
#Working on a copy named 'balanced_df' so that cleaned_df stays available as a checkpoint.
balanced_df = cleaned_df.copy()

#The target (y) is the 'elite_True' indicator; every other column is a predictor (X).
y = balanced_df['elite_True']
X = balanced_df.drop(columns=['elite_True'])

#Holding out 25% of the data as the test set, stratified on the target so that both
#sets keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, stratify=y, random_state=23
)

#Oversampling the minority class with SMOTE, on the training portion only.
#NOTE(review): every predictor here is a 0/1 indicator, and SMOTE interpolates between
#neighbours, so synthetic rows may hold fractional values - consider SMOTEN/SMOTENC.
smote = SMOTE(random_state=23)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


### Choosing Evaluation Metrics
- My goal is to predict whether a unit will get a 4.9-5.0 Airbnb overall rating.
- Which is worse?
   - Model predicts that a unit is an Elite Unit, but it actually is not? (more false positives)

### Decision
- I want false positives to be as low as possible.
- If my model says that a property is an Elite Unit, I want it to be true.
- If it misses some of the Elite units in the process, that is fine.
- Therefore, I am most concerned with Precision, balanced out by F1 score.
### Metrics Function


In [48]:
#creating 'get_metrics' function
def get_metrics(clf, y_pred, y_true=None, y_score=None, cv=10):
    """Print the evaluation metrics used to compare models in this notebook.

    Reports precision and F1 (as percentages), ROC AUC, and the mean
    cross-validation score; other metrics are deliberately left out.

    Parameters
    ----------
    clf : estimator
        Classifier handed to ``cross_val_score`` together with the notebook-level
        ``X_train_resampled`` / ``y_train_resampled``.
    y_pred : array-like
        Predicted class labels for the test set.
    y_true : array-like, optional
        True labels matching ``y_pred``. Defaults to the notebook-level ``y_test``.
    y_score : array-like, optional
        Continuous scores for the positive class, e.g.
        ``clf.predict_proba(X_test)[:, 1]``. When omitted, the ROC curve is built
        from the hard labels in ``y_pred`` (the original behaviour), which only
        evaluates the operating point of those predictions.
    cv : int, optional
        Number of cross-validation folds. Defaults to 10.

    Returns
    -------
    None
        All results are printed.
    """
    if y_true is None:
        # Fall back to the held-out labels created by the train/test split cell.
        y_true = y_test

    clf_prec = precision_score(y_true, y_pred) * 100
    print('Precision is :{0}' .format(clf_prec))

    clf_f1 = f1_score(y_true, y_pred) * 100
    print('F1 Score is :{0}' .format(clf_f1))

    # Prefer continuous scores for the ROC curve when the caller supplies them.
    roc_input = y_pred if y_score is None else y_score
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_true, roc_input)

    clf_roc_auc = auc(false_positive_rate, true_positive_rate)
    print('ROC AUC is :{0}'.format(round(clf_roc_auc,2)))

    # NOTE(review): this cross-validates on the SMOTE-resampled training data with the
    # estimator's default scorer, so synthetic rows can land in validation folds and the
    # score is not precision - treat it as an optimistic sanity check.
    clf_cv_score = np.mean(cross_val_score(clf, X_train_resampled, y_train_resampled, cv=cv))
    print('Cross Validation Score is :{0}' .format(round(clf_cv_score, 3)))

## Modeling
### Baseline Decision Tree

In [49]:
#Baseline: an untuned decision tree with balanced class weights, trained on the resampled training set.

dt1 = DecisionTreeClassifier(class_weight='balanced', random_state=23).fit(
    X_train_resampled, y_train_resampled
)
dt1_y_pred = dt1.predict(X_test)
get_metrics(dt1, dt1_y_pred)

Precision is :77.18068535825545
F1 Score is :77.15064227325807
ROC AUC is :0.81
Cross Validation Score is :0.849


#### Baseline Model Analysis:
- A simple decision tree gives me a good starting point.
- The precision is above 77%, which is acceptable, as is the F1 score.
- The AUC Score is already at 81%, which is great for a baseline!
- Likewise, the Cross Validation score is already looking good as it is nearly 85%.

### Decision Tree 2
##### Refining Decision Tree Through GridSearchCV


In [56]:
#Creating the parameter grid for the Decision Tree grid search.
#The search cell below reads 'dt_param_grid'; this cell previously defined a Random Forest grid
#('rf_param_grid', with 'n_estimators', which a single tree does not accept), so the search
#failed with a NameError on a fresh kernel. The Random Forest grid is defined in its own section.
#NOTE(review): the original decision-tree grid was not preserved; these values are chosen to
#include the recorded best parameters (entropy, max_depth=4, min_samples_leaf=1,
#min_samples_split=2) - adjust if the original search space is known.
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 4, 6, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 6]
}

In [63]:
#Base estimator whose hyperparameters will be searched.
dt2 = DecisionTreeClassifier(random_state=23)

#3-fold grid search over dt_param_grid, scored on precision as that is my key metric.
dt_grid_search = GridSearchCV(
    estimator=dt2,
    param_grid=dt_param_grid,
    scoring='precision',
    cv=3,
)

#Fitting on the resampled training data and showing the winning parameter combination.
dt_grid_search.fit(X_train_resampled, y_train_resampled)
dt_grid_search.best_params_

{'criterion': 'entropy',
 'max_depth': 4,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [64]:
#Running the decision tree again, this time with the parameters found by the grid search.
dt2_params = {
    'criterion': 'entropy',
    'max_depth': 4,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
}
dt2 = DecisionTreeClassifier(class_weight='balanced', random_state=23, **dt2_params)
dt2.fit(X_train_resampled, y_train_resampled)
dt2_y_pred = dt2.predict(X_test)
get_metrics(dt2, dt2_y_pred)

Precision is :82.92682926829268
F1 Score is :81.11332007952285
ROC AUC is :0.84
Cross Validation Score is :0.863


### Analysis:

- All scores are improved!
- Everything is in the 80s which is excellent!
- I would be happy with this as a final model, but want to see if I can further improve my results with ensemble methods or gradient boosting.

### Random Forests

In [62]:
#Untuned Random Forest with balanced class weights, trained on the resampled training set.
rf1_clf = RandomForestClassifier(class_weight='balanced', random_state=23).fit(
    X_train_resampled, y_train_resampled
)
rf1_t_pred = rf1_clf.predict(X_test)
get_metrics(rf1_clf, rf1_t_pred)

Precision is :80.72196620583718
F1 Score is :81.25241592578276
ROC AUC is :0.84
Cross Validation Score is :0.875


### Analysis:

- This is an improvement over my baseline model. But it still isn't as good as my Optimized Decision Tree.
### Random Forests 2
### GridSearch CV

In [65]:
#Search space for tuning the Random Forest: number of trees, split criterion,
#tree depth, and the minimum sample counts for splits and leaves.
rf_param_grid = dict(
    n_estimators=[10, 30, 100],
    criterion=['gini', 'entropy'],
    max_depth=[None, 2, 6, 10],
    min_samples_split=[5, 10],
    min_samples_leaf=[3, 6],
)