# 1. Logistic Regression

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sqlalchemy import create_engine
from config import db_password
from sqlalchemy import inspect
import matplotlib.pyplot as plt
import pandas as pd

business_df = pd.read_csv("../Data/merged_datasets.csv")

# Categorizing restaurants based on stars ratings
business_df["Category"] = pd.cut(business_df["Stars_Rating"],bins=[0.9,2,3,4,5],labels=["Poor","Average","Good","Successful"])

# Since price can't be 0 and None, so replace it with a 1
def changeStatus(status):
    if status == "Poor":
        return 0
    elif status == "Average":
        return 1
    elif status ==  "Good":
        return 2
    else:
        return 3

business_df['Category_Encoded'] = business_df["Category"].apply(changeStatus)
business_df["Category_Encoded"] = pd.to_numeric(business_df["Category_Encoded"])


business_df["Median_Income(dollars)"] = pd.to_numeric(business_df["Median_Income(dollars)"], errors='coerce')
business_df["Mean_Income(dollars)"] = pd.to_numeric(business_df["Mean_Income(dollars)"], errors='coerce')

business_df = business_df.dropna(subset=['Median_Income(dollars)'])

In [2]:
business_df["Category"].value_counts()

Good          14026
Successful     4887
Average        4821
Poor           1015
Name: Category, dtype: int64

In [3]:
business_df["Stars_Rating"].describe()

count    24749.000000
mean         3.665845
std          0.694539
min          1.000000
25%          3.500000
50%          4.000000
75%          4.000000
max          5.000000
Name: Stars_Rating, dtype: float64

In [4]:
business_df.columns

Index(['Unnamed: 0', 'Restaurant_ID', 'Restaurants_Name', 'Address', 'City',
       'State', 'Postal_Code', 'Latitude', 'Longitude', 'Stars_Rating',
       'Review_Count', 'Restaurants_Delivery', 'Outdoor_Seating',
       'Accepts_CreditCards', 'Price_Range', 'Alcohol', 'Good_For_Kids',
       'Reservations', 'Restaurants_TakeOut', 'WiFi', 'Good_For_Groups',
       'Wheelchair_Accessible', 'Happy_Hour', 'Noise_Level',
       'Dietary_Restrictions', 'Total_Estimate_Households_per_Zip',
       'Total_Estimate_Married-couple_Family_households',
       'Total_Estimate_Nonfamily_households', 'Percentage_EH<$10,000',
       'Percentage_EH_$10,000-$14,999', 'Percentage_EH_$15,000-$24,999',
       'Percentage_EH_$25,000-$34,999', 'Percentage_EH_$35,000-$49,999',
       'Percentage_EH_$50,000-$74,999', 'Percentage_EH_$75,000-$99,999',
       'Percentage_EH_$100,000-$149,999', 'Percentage_EH_$150,000-$199,999',
       'Percentage_EH_$200,000<', 'Median_Income(dollars)',
       'Mean_Income(dolla

In [5]:
# Define features set
X = business_df[['Review_Count', 'Restaurants_Delivery', 'Outdoor_Seating',
       'Accepts_CreditCards', 'Price_Range', 'Alcohol', 'Good_For_Kids',
       'Reservations', 'Restaurants_TakeOut', 'WiFi', 'Good_For_Groups',
       'Wheelchair_Accessible', 'Happy_Hour', 'Noise_Level',
       'Dietary_Restrictions',
                'Total_Estimate_Married-couple_Family_households',
       'Total_Estimate_Nonfamily_households',
                'Median_Income(dollars)', 'Population']]

In [6]:
# Define the target
y = business_df["Category"]
y

0              Good
1        Successful
2              Good
3        Successful
4           Average
            ...    
24753       Average
24754          Good
24755    Successful
24756          Good
24757    Successful
Name: Category, Length: 24749, dtype: category
Categories (4, object): ['Poor' < 'Average' < 'Good' < 'Successful']

In [7]:
# Split the model into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y, 
                                                   random_state=1, 
                                                    stratify=y)

In [8]:
# Instantiate a Logistic Regression Model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver="lbfgs", random_state=1)
model

LogisticRegression(random_state=1)

In [9]:
# Train the model
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [10]:
# Make predictions
y_pred = model.predict(X_test)

In [11]:
# Evaluate the Model
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.5667420814479638

In [12]:
# Calculate the confusion matrix 
from sklearn.metrics import confusion_matrix, classification_report

matrix=confusion_matrix(y_test, y_pred)
print(matrix)

[[   0 1205    0    0]
 [   0 3507    0    0]
 [   0  254    0    0]
 [   0 1222    0    0]]


In [13]:
report=classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

     Average       0.00      0.00      0.00      1205
        Good       0.57      1.00      0.72      3507
        Poor       0.00      0.00      0.00       254
  Successful       0.00      0.00      0.00      1222

    accuracy                           0.57      6188
   macro avg       0.14      0.25      0.18      6188
weighted avg       0.32      0.57      0.41      6188



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Logistic Regression with Oversampling

In [14]:
from collections import Counter
Counter(y)

Counter({'Good': 14026, 'Successful': 4887, 'Average': 4821, 'Poor': 1015})

It seems like the classes are unbalanced, which might bias the results toward the majority classes. 

# Naive Random Oversampling


In random oversampling, instances of the minority class are randomly selected and added to the training set until the majority and minority classes are balanced.  

In [15]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)

X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Average': 10519, 'Good': 10519, 'Successful': 10519, 'Poor': 10519})

In [16]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [17]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.3448593592243359

In [18]:
y_pred_train = model.predict(X_train)
balanced_accuracy_score(y_train, y_pred_train)

0.346572798641994

In [19]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[420, 173, 451, 161],
       [913, 720, 876, 998],
       [ 88,  19, 133,  14],
       [329, 222, 302, 369]])

In [20]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

    Average       0.24      0.35      0.73      0.28      0.51      0.25      1205
       Good       0.63      0.21      0.85      0.31      0.42      0.16      3507
       Poor       0.08      0.52      0.73      0.13      0.62      0.37       254
 Successful       0.24      0.30      0.76      0.27      0.48      0.22      1222

avg / total       0.46      0.27      0.80      0.29      0.45      0.20      6188



# SMOTE Oversampling

In SMOTE, like naive random oversampling, the size of the minority is increased. The key difference with random oversampling is how the minority class is increased in size. 

However, in SMOTE, new instances are interpolated. That is, for an instance from the minority class, new values are generated based on its distance from its neighbors.

**Random oversampling draws from existing observations, whereas SMOTE generates synthetic observations**.

In [21]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy="auto").fit_resample(X_train, y_train)

from collections import Counter

Counter(y_resampled)

Counter({'Average': 10519, 'Good': 10519, 'Successful': 10519, 'Poor': 10519})

In [22]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [23]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.3440471603402944

In [24]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 384,  212,  459,  150],
       [ 853, 1121,  889,  644],
       [  86,   17,  134,   17],
       [ 299,  361,  305,  257]])

In [25]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

    Average       0.24      0.32      0.75      0.27      0.49      0.23      1205
       Good       0.66      0.32      0.78      0.43      0.50      0.24      3507
       Poor       0.07      0.53      0.72      0.13      0.62      0.37       254
 Successful       0.24      0.21      0.84      0.22      0.42      0.16      1222

avg / total       0.47      0.31      0.78      0.35      0.49      0.23      6188



# Cluster Centroids Undersampling

Akin to SMOTE, the algorithm identifies clusters of the majority class, then generates synthetic data points, called centroids, that are representative of the clusters. The majority class is then undersampled down to the size of the minority class. 

In [26]:
# Resample the data using the ClusterCentroids resampler

from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

from collections import Counter

Counter(y_resampled)

Counter({'Average': 761, 'Good': 761, 'Poor': 761, 'Successful': 761})

In [27]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [28]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [29]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.3234260360020228

In [30]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 379,  106,  658,   62],
       [ 868,  499, 1504,  636],
       [  72,    8,  171,    3],
       [ 304,  156,  562,  200]])

In [31]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

    Average       0.23      0.31      0.75      0.27      0.49      0.23      1205
       Good       0.65      0.14      0.90      0.23      0.36      0.12      3507
       Poor       0.06      0.67      0.54      0.11      0.60      0.37       254
 Successful       0.22      0.16      0.86      0.19      0.37      0.13      1222

avg / total       0.46      0.20      0.85      0.23      0.40      0.15      6188



# Combination of Over and Under-Sampling

A downside of oversampling with SMOTE is its reliance on the immediate neighbors of a data point. Because the algorithm doesn't see the overall distribution of data, the new data it creates can be heavily influenced by outliers. 

A downside of undersampling is that it involves loss of data and is not an option when the dataset is small. 

SMOTEEN combines the SMOTE and Edited Nearest Neighbors (ENN) algorithms. It includes the following steps: 
1. Oversample the minority class with SMOTE.
2. Clean the resulting data with an undersampling strategy. If the two of the nearest neighbors of a data point belong to different classes, the data point is dropped.

In [32]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

sm = SMOTEENN(random_state=1)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'Average': 3310, 'Good': 1184, 'Poor': 6738, 'Successful': 2781})

In [33]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [34]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.28698953796394144

In [35]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[   3,    4, 1127,   71],
       [  31,   35, 2881,  560],
       [   0,    0,  251,    3],
       [   6,   13, 1023,  180]])

In [36]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

    Average       0.07      0.00      0.99      0.00      0.05      0.00      1205
       Good       0.67      0.01      0.99      0.02      0.10      0.01      3507
       Poor       0.05      0.99      0.15      0.09      0.39      0.16       254
 Successful       0.22      0.15      0.87      0.18      0.36      0.12      1222

avg / total       0.44      0.08      0.93      0.05      0.15      0.04      6188

