# Preliminary Analysis

#### To test different models using a preliminary dataset of the mock restaurant data

In [1]:
import numpy as np
import pandas as pd
from path import Path
import matplotlib.pyplot as plt
%matplotlib inline

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
data = Path('Resources/clean_britian_restaurants.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0.1,Unnamed: 0,awards,price_level,vegetarian_friendly,avg_rating,total_reviews_count,service
0,478379,1,2.0,Y,4.5,347.0,4.5
1,478384,0,2.0,N,3.5,32.0,3.5
2,478386,0,2.0,Y,4.0,101.0,4.5
3,478389,1,2.0,Y,4.5,104.0,4.5
4,478392,0,2.0,Y,3.5,34.0,3.0


In [3]:
df = df.drop(columns=['Unnamed: 0'])

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2 = df.copy()
df2['vegetarian_friendly'] = le.fit_transform(df2['vegetarian_friendly'])
df2.head()

Unnamed: 0,awards,price_level,vegetarian_friendly,avg_rating,total_reviews_count,service
0,1,2.0,1,4.5,347.0,4.5
1,0,2.0,0,3.5,32.0,3.5
2,0,2.0,1,4.0,101.0,4.5
3,1,2.0,1,4.5,104.0,4.5
4,0,2.0,1,3.5,34.0,3.0


In [5]:
df2.dtypes

awards                   int64
price_level            float64
vegetarian_friendly      int32
avg_rating             float64
total_reviews_count    float64
service                float64
dtype: object

In [6]:
df2.isnull().sum()

awards                   0
price_level              0
vegetarian_friendly      0
avg_rating             181
total_reviews_count    181
service                  0
dtype: int64

In [7]:
df2 = df2.dropna(how='any', subset=['avg_rating', 'total_reviews_count'])

In [8]:
df2.shape

(110123, 6)

## Logistic Regression

Logistic Regression is used when the dependent variable (target) is categorical.
Solves classification problems


In [9]:
# Separate the Features (X) from the Target (y)
y = df2["awards"]
X = df2.drop(columns="awards")

In [10]:
# Split our data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
X_train.shape

(77086, 5)

In [11]:
# Create a Logistic Regression Model
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(solver='lbfgs',
                                max_iter=500,
                                random_state=1)

In [12]:
# Train the model using the training data and make predictions
logmodel.fit(X_train, y_train)
ypredictions_log = logmodel.predict(X_test)
log_results = pd.DataFrame({"Prediction": ypredictions_log, "Actual": y_test}).reset_index(drop=True)
log_results.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,0,0
3,1,0
4,0,1
5,1,1
6,1,1
7,1,1
8,1,0
9,1,0


In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [14]:
print(accuracy_score(y_test, ypredictions_log))

0.7888730816962799


In [15]:
print(classification_report(y_test, ypredictions_log))

              precision    recall  f1-score   support

           0       0.80      0.79      0.79     16798
           1       0.78      0.79      0.79     16239

    accuracy                           0.79     33037
   macro avg       0.79      0.79      0.79     33037
weighted avg       0.79      0.79      0.79     33037



In [16]:
print(confusion_matrix(y_test, ypredictions_log))

[[13189  3609]
 [ 3366 12873]]


In [17]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
print(f"MSE: {mean_squared_error(y_test, ypredictions_log)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, ypredictions_log))}")
print(f"MAE: {mean_absolute_error(y_test, ypredictions_log)}")

MSE: 0.21112691830372007
RMSE: 0.4594854930285831
MAE: 0.21112691830372007


 ## Gradient Boosted Tree

In [18]:
# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
# Choose best learning rate
from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    GBT_model = GradientBoostingClassifier(n_estimators=100,
                                            learning_rate=learning_rate,
                                            max_features=5,
                                            max_depth=3,
                                            random_state=1)

    # Fit the model
    GBT_model.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        GBT_model.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        GBT_model.score(
            X_test_scaled,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.903
Accuracy score (validation): 0.901

Learning rate:  0.1
Accuracy score (training): 0.903
Accuracy score (validation): 0.901

Learning rate:  0.25
Accuracy score (training): 0.904
Accuracy score (validation): 0.901

Learning rate:  0.5
Accuracy score (training): 0.905
Accuracy score (validation): 0.900

Learning rate:  0.75
Accuracy score (training): 0.905
Accuracy score (validation): 0.901

Learning rate:  1
Accuracy score (training): 0.904
Accuracy score (validation): 0.900



In [20]:
# Choose a learning rate and create classifier
GBT_model = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.75,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=1)

# Fit the model
GBT_model.fit(X_train_scaled, y_train)

# Make Prediction
ypredictions_GBT = GBT_model.predict(X_test_scaled)
GBT_results = pd.DataFrame({"Prediction": ypredictions_GBT, "Actual": y_test})
GBT_results.head()

Unnamed: 0,Prediction,Actual
6553,1,1
31983,0,0
72248,0,0
17141,1,0
16561,1,1


In [21]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, ypredictions_GBT)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.9006568393013894


In [22]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, ypredictions_GBT)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,15120,1678
Actual 1,1604,14635


In [23]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, ypredictions_GBT))

Classification Report
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     16798
           1       0.90      0.90      0.90     16239

    accuracy                           0.90     33037
   macro avg       0.90      0.90      0.90     33037
weighted avg       0.90      0.90      0.90     33037



In [24]:
print(f"MSE: {mean_squared_error(y_test, ypredictions_GBT)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, ypredictions_GBT))}")
print(f"MAE: {mean_absolute_error(y_test, ypredictions_GBT)}")

MSE: 0.09934316069861066
RMSE: 0.3151875008603778
MAE: 0.09934316069861066


## Easy Ensemble AdaBoost Classifier

In [25]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 
eec_model = EasyEnsembleClassifier(n_estimators =100, random_state=1)
eec_model.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [26]:
# Make predictions and determine accuracy score
ypredictions_EEC = eec_model.predict(X_test)
accuracy_score(y_test, ypredictions_EEC)

0.9010806065926082

In [27]:
# Display the confusion matrix

cm2 = confusion_matrix(y_test, ypredictions_EEC)

# Create a DataFrame from the confusion matrix.
eec_cm_df = pd.DataFrame(
    cm2, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)
eec_cm_df.head()

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,15079,1719
Actual 1,1549,14690


In [28]:
# Print the  classification report
print(classification_report(y_test, ypredictions_EEC))

              precision    recall  f1-score   support

           0       0.91      0.90      0.90     16798
           1       0.90      0.90      0.90     16239

    accuracy                           0.90     33037
   macro avg       0.90      0.90      0.90     33037
weighted avg       0.90      0.90      0.90     33037



In [29]:
print(f"MSE: {mean_squared_error(y_test, ypredictions_EEC)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, ypredictions_EEC))}")
print(f"MAE: {mean_absolute_error(y_test, ypredictions_EEC)}")

MSE: 0.09891939340739171
RMSE: 0.3145145360828204
MAE: 0.09891939340739171
