# 1. Logistic Regression

In [79]:
import pandas as pd

business_df = pd.read_csv("../../Data/02_Clean_Business_Data_Add_Attrs.csv")

# Categorizing restaurants based on stars ratings
business_df["Category"] = pd.cut(business_df["Stars_Rating"],bins=[0.9,2,3,4,5],
                                 labels=["Poor","Average","Good","Successful"])

def changeStatus(status):
    if status == "Poor":
        return 0
    elif status == "Average":
        return 1
    elif status ==  "Good":
        return 2
    else:
        return 3

business_df['Category_Encoded'] = business_df["Category"].apply(changeStatus)
business_df["Category_Encoded"] = pd.to_numeric(business_df["Category_Encoded"])

In [80]:
business_df["Category"].value_counts()

Good          15369
Successful     5373
Average        5318
Poor           1147
Name: Category, dtype: int64

In [81]:
business_df["Stars_Rating"].describe()

count    27207.000000
mean         3.661631
std          0.698279
min          1.000000
25%          3.500000
50%          4.000000
75%          4.000000
max          5.000000
Name: Stars_Rating, dtype: float64

In [82]:
business_df.columns

Index(['Unnamed: 0', 'Restaurant_ID', 'Restaurants_Name', 'Address', 'City',
       'State', 'Postal_Code', 'Latitude', 'Longitude', 'Stars_Rating',
       'Review_Count', 'Restaurants_Delivery', 'Outdoor_Seating',
       'Accepts_CreditCards', 'Price_Range', 'Alcohol', 'Good_For_Kids',
       'Reservations', 'Restaurants_TakeOut', 'WiFi', 'Good_For_Groups',
       'Wheelchair_Accessible', 'Happy_Hour', 'Noise_Level',
       'Dietary_Restrictions', 'Category', 'Category_Encoded'],
      dtype='object')

In [83]:
# Define features set
X = business_df[['Review_Count', 'Restaurants_Delivery', 'Outdoor_Seating',
       'Accepts_CreditCards', 'Price_Range', 'Alcohol', 'Good_For_Kids',
       'Reservations', 'Restaurants_TakeOut', 'WiFi', 'Good_For_Groups',
       'Wheelchair_Accessible', 'Happy_Hour', 'Noise_Level',
       'Dietary_Restrictions']]

In [84]:
# Define the target
y = business_df["Category_Encoded"]
y

0        2
1        3
2        2
3        3
4        1
        ..
27202    1
27203    2
27204    3
27205    2
27206    3
Name: Category_Encoded, Length: 27207, dtype: int64

In [86]:
# Split the model into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y, 
                                                   random_state=1, 
                                                    stratify=y)

In [24]:
# Instantiate a Logistic Regression Model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver="lbfgs", random_state=1)
model

LogisticRegression(random_state=1)

In [25]:
# Train the model
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [26]:
# Make predictions
y_pred = model.predict(X_test)

In [33]:
# Evaluate the Model
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.5754189944134078

In [34]:
# Training balanced accuracy
y_pred_train = model.predict(X_train)
accuracy_score(y_train, y_pred_train)

0.571134525851507

In [35]:
# Calculate the confusion matrix 
from sklearn.metrics import confusion_matrix, classification_report

matrix=confusion_matrix(y_test, y_pred)
print(matrix)

[[   0   12  263   12]
 [   0   32 1248   50]
 [   0   16 3635  191]
 [   0    1 1095  247]]


In [36]:
report=classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       287
           1       0.52      0.02      0.05      1330
           2       0.58      0.95      0.72      3842
           3       0.49      0.18      0.27      1343

    accuracy                           0.58      6802
   macro avg       0.40      0.29      0.26      6802
weighted avg       0.53      0.58      0.47      6802



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Logistic Regression with Oversampling

In [37]:
from collections import Counter
Counter(y)

Counter({2: 15369, 3: 5373, 1: 5318, 0: 1147})

It seems like the classes are unbalanced, which might bias the results toward the majority classes. 

# Naive Random Oversampling


In random oversampling, instances of the minority class are randomly selected and added to the training set until the majority and minority classes are balanced.  

In [48]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)

X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({3: 11527, 1: 11527, 2: 11527, 0: 11527})

In [49]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [50]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.47136385198086184

In [51]:
# Training Accuracy
y_pred_train = model.predict(X_train)
balanced_accuracy_score(y_train, y_pred_train)

0.468030374069019

In [52]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[ 208,   46,   12,   21],
       [ 445,  490,  197,  198],
       [ 687, 1141, 1010, 1004],
       [ 183,  207,  242,  711]])

In [53]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.14      0.72      0.80      0.23      0.76      0.57       287
          1       0.26      0.37      0.75      0.30      0.52      0.26      1330
          2       0.69      0.26      0.85      0.38      0.47      0.21      3842
          3       0.37      0.53      0.78      0.43      0.64      0.40      1343

avg / total       0.52      0.36      0.81      0.37      0.53      0.27      6802



# SMOTE Oversampling

In SMOTE, like naive random oversampling, the size of the minority is increased. The key difference with random oversampling is how the minority class is increased in size. 

However, in SMOTE, new instances are interpolated. That is, for an instance from the minority class, new values are generated based on its distance from its neighbors.

**Random oversampling draws from existing observations, whereas SMOTE generates synthetic observations**.

In [54]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy="auto").fit_resample(X_train, y_train)

from collections import Counter

Counter(y_resampled)

Counter({3: 11527, 1: 11527, 2: 11527, 0: 11527})

In [55]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [56]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.4623746831469825

In [57]:
# Training Accuracy
y_pred_train = model.predict(X_train)
balanced_accuracy_score(y_train, y_pred_train)

0.45561298965939667

In [58]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 188,   54,   21,   24],
       [ 336,  488,  328,  178],
       [ 530, 1067, 1420,  825],
       [ 129,  214,  385,  615]])

In [59]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.16      0.66      0.85      0.26      0.74      0.54       287
          1       0.27      0.37      0.76      0.31      0.53      0.27      1330
          2       0.66      0.37      0.75      0.47      0.53      0.27      3842
          3       0.37      0.46      0.81      0.41      0.61      0.36      1343

avg / total       0.51      0.40      0.77      0.42      0.55      0.30      6802



# Cluster Centroids Undersampling

Akin to SMOTE, the algorithm identifies clusters of the majority class, then generates synthetic data points, called centroids, that are representative of the clusters. The majority class is then undersampled down to the size of the minority class. 

In [60]:
# Resample the data using the ClusterCentroids resampler

from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

from collections import Counter

Counter(y_resampled)

Counter({0: 860, 1: 860, 2: 860, 3: 860})

In [61]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [62]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [63]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.3467085388784826

In [64]:
y_pred_train = model.predict(X_train)
balanced_accuracy_score(y_train, y_pred_train)

0.3528954290187072

In [65]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 241,   36,    5,    5],
       [ 809,  467,   18,   36],
       [1631, 1771,   95,  345],
       [ 567,  487,   59,  230]])

In [66]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.07      0.84      0.54      0.14      0.67      0.47       287
          1       0.17      0.35      0.58      0.23      0.45      0.20      1330
          2       0.54      0.02      0.97      0.05      0.16      0.02      3842
          3       0.37      0.17      0.93      0.23      0.40      0.15      1343

avg / total       0.41      0.15      0.87      0.12      0.28      0.10      6802



# Combination of Over and Under-Sampling

A downside of oversampling with SMOTE is its reliance on the immediate neighbors of a data point. Because the algorithm doesn't see the overall distribution of data, the new data it creates can be heavily influenced by outliers. 

A downside of undersampling is that it involves loss of data and is not an option when the dataset is small. 

SMOTEEN combines the SMOTE and Edited Nearest Neighbors (ENN) algorithms. It includes the following steps: 
1. Oversample the minority class with SMOTE.
2. Clean the resulting data with an undersampling strategy. If the two of the nearest neighbors of a data point belong to different classes, the data point is dropped.

In [87]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

sm = SMOTEENN(random_state=1)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 6672, 1: 4262, 2: 2127, 3: 5123})

In [88]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [89]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.46552439101063103

In [90]:
y_pred_train = model.predict(X_train)
balanced_accuracy_score(y_train, y_pred_train)

0.46685252437552915

In [91]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 206,   51,   12,   18],
       [ 418,  488,  221,  203],
       [ 717, 1126,  990, 1009],
       [ 203,  190,  252,  698]])

In [92]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.13      0.72      0.79      0.23      0.76      0.57       287
          1       0.26      0.37      0.75      0.31      0.52      0.26      1330
          2       0.67      0.26      0.84      0.37      0.46      0.20      3842
          3       0.36      0.52      0.77      0.43      0.63      0.39      1343

avg / total       0.51      0.35      0.81      0.36      0.52      0.27      6802

