In [1]:
import pandas as pd
df = pd.read_csv('MergedandCleaned1.csv')
print(df.head())

                review_id                 user_id             business_id  \
0  KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
1  BiTunyQ73aT9WBnpR9DZGw  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ   
2  AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
3  Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   
4  JrIxlS1TzJ-iCu79ul40cQ  eUta8W_HdHMXPzLBBZhL1A  04UD14gamNjLY0IDYVhHJg   

   review_stars  useful  funny  cool  \
0             3       0      0     0   
1             5       1      0     1   
2             5       1      0     1   
3             4       1      0     1   
4             1       1      2     1   

                                                text          review_date  \
0  If you decide to eat here, just be aware it is...  2018-07-07 22:09:11   
1  I've taken a lot of spin classes over the year...  2012-01-03 15:28:18   
2  Wow!  Yummy, different,  delicious.   Our favo...  2015-01-04 

In [3]:
print(df.columns)

Index(['review_id', 'user_id', 'business_id', 'review_stars', 'useful',
       'funny', 'cool', 'text', 'review_date', 'business_name', 'address',
       'city', 'state', 'postal_code', 'latitude', 'longitude',
       'business_stars', 'business_review_count', 'is_open',
       'business_attributes', 'business_categories', 'business_hours'],
      dtype='object')


In [7]:
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

address    106
dtype: int64


In [8]:
df_cleaned = df.drop(columns=['address'])
print(df_cleaned.columns)

Index(['review_id', 'user_id', 'business_id', 'review_stars', 'useful',
       'funny', 'cool', 'text', 'review_date', 'business_name', 'city',
       'state', 'postal_code', 'latitude', 'longitude', 'business_stars',
       'business_review_count', 'is_open', 'business_attributes',
       'business_categories', 'business_hours'],
      dtype='object')


In [9]:
missing_values = df_cleaned.isnull().sum()
print(missing_values)


review_id                0
user_id                  0
business_id              0
review_stars             0
useful                   0
funny                    0
cool                     0
text                     0
review_date              0
business_name            0
city                     0
state                    0
postal_code              0
latitude                 0
longitude                0
business_stars           0
business_review_count    0
is_open                  0
business_attributes      0
business_categories      0
business_hours           0
dtype: int64


In [10]:
missing_values = df.isnull().sum()
print(missing_values)


review_id                  0
user_id                    0
business_id                0
review_stars               0
useful                     0
funny                      0
cool                       0
text                       0
review_date                0
business_name              0
address                  106
city                       0
state                      0
postal_code                0
latitude                   0
longitude                  0
business_stars             0
business_review_count      0
is_open                    0
business_attributes        0
business_categories        0
business_hours             0
dtype: int64


In [11]:
df['address'].fillna('Unknown', inplace=True)


In [12]:
print(df_cleaned.dtypes)

review_id                 object
user_id                   object
business_id               object
review_stars               int64
useful                     int64
funny                      int64
cool                       int64
text                      object
review_date               object
business_name             object
city                      object
state                     object
postal_code               object
latitude                 float64
longitude                float64
business_stars           float64
business_review_count    float64
is_open                  float64
business_attributes       object
business_categories       object
business_hours            object
dtype: object


In [13]:
df_cleaned['is_open'] = df_cleaned['is_open'].astype(int)
df_cleaned['review_date'] = pd.to_datetime(df_cleaned['review_date'])
print(df_cleaned[['is_open', 'review_date']].dtypes)

is_open                 int32
review_date    datetime64[ns]
dtype: object


In [14]:
import ast


def extract_parking_feature(attributes, feature):
    if pd.isna(attributes):
        return 0 
    try:
        attributes_dict = ast.literal_eval(attributes)  
        if isinstance(attributes_dict, dict) and 'BusinessParking' in attributes_dict:
            parking_dict = ast.literal_eval(attributes_dict['BusinessParking'])
            if isinstance(parking_dict, dict):
                return parking_dict.get(feature, 0)
    except (ValueError, SyntaxError, TypeError):
        return 0 
    return 0

df_cleaned['garage'] = df_cleaned['business_attributes'].apply(lambda x: extract_parking_feature(x, 'garage'))
df_cleaned['street'] = df_cleaned['business_attributes'].apply(lambda x: extract_parking_feature(x, 'street'))
df_cleaned['lot'] = df_cleaned['business_attributes'].apply(lambda x: extract_parking_feature(x, 'lot'))
df_cleaned['valet'] = df_cleaned['business_attributes'].apply(lambda x: extract_parking_feature(x, 'valet'))

print(df_cleaned[['garage', 'street', 'lot', 'valet']].head())


  garage street    lot  valet
0  False  False   True  False
1  False   True  False  False
2  False  False   True  False
3  False   True  False  False
4  False   True  False  False


In [15]:
df_final = df_cleaned.drop(columns=['business_attributes'])
final_features = ['garage', 'street', 'lot', 'valet', 'is_open', 'business_review_count', 'latitude', 'longitude']
X = df_final[final_features]
print(X.head())

  garage street    lot  valet  is_open  business_review_count   latitude  \
0  False  False   True  False        1                  169.0  40.210196   
1  False   True  False  False        0                  144.0  39.952103   
2  False  False   True  False        1                  181.0  40.079848   
3  False   True  False  False        0                   32.0  29.962102   
4  False   True  False  False        0                  273.0  39.938013   

   longitude  
0 -75.223639  
1 -75.172753  
2 -75.025080  
3 -90.087958  
4 -75.148131  


In [21]:
print(df_cleaned['valet'].unique())

[False True]


In [22]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib

df_cleaned['valet'] = df_cleaned['valet'].astype(int)

X = df_cleaned[['latitude', 'longitude', 'business_review_count', 'is_open']] 
y = df_cleaned['valet'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

joblib.dump(xgb_model, 'parking_xgboost_model.pkl')
print("Model saved as 'parking_xgboost_model.pkl'")

Accuracy: 0.9974955277280859
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5220
           1       1.00      0.96      0.98       370

    accuracy                           1.00      5590
   macro avg       1.00      0.98      0.99      5590
weighted avg       1.00      1.00      1.00      5590

Model saved as 'parking_xgboost_model.pkl'


Parameters: { "use_label_encoder" } are not used.

