In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the survey data. This is the raw CSV, not an already-prepared file:
# clean_and_prepare_data (defined in a later cell) reads the same file and has
# to drop 'CustomerID' and one-hot encode its categorical columns, so the same
# minimum preparation is applied here before fitting.
prepared_data = pd.read_csv('restaurant_customer_satisfaction.csv').dropna()

# Separate features and target variable
y = prepared_data['HighSatisfaction']
X = prepared_data.drop(columns=['HighSatisfaction', 'CustomerID'], errors='ignore')

# One-hot encode any non-numeric columns; this is a no-op when every column is
# already numeric, so it is safe for a pre-encoded file as well.
X = pd.get_dummies(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


def clean_and_prepare_data(csv_path='restaurant_customer_satisfaction.csv', test_size=0.2, random_state=42):
    """Load the satisfaction survey, encode it, and return a scaled train/test split.

    Steps: read the CSV, drop the 'CustomerID' column, drop rows with missing
    values, one-hot encode the categorical columns, split into train and test
    sets, then standardize the numerical columns.

    The scaler is fitted on the training rows only and then applied to the test
    rows, so no test-set statistics leak into the features the models train on.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    test_size : float
        Fraction of rows held out for testing.
    random_state : int
        Seed passed to train_test_split.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames and 'HighSatisfaction' target series for each split.
    """
    categorical_columns = ['Gender', 'VisitFrequency', 'PreferredCuisine', 'TimeOfVisit', 'DiningOccasion', 'MealType', 'OnlineReservation', 'DeliveryOrder', 'LoyaltyProgramMember']
    numerical_columns = ['Age', 'Income', 'AverageSpend', 'GroupSize', 'WaitTime', 'ServiceRating', 'FoodRating', 'AmbianceRating']

    # Load, drop the identifier, and discard incomplete rows
    raw = pd.read_csv(csv_path)
    cleaned = raw.drop(columns=['CustomerID']).dropna()

    # One-hot encode categorical variables
    encoded = pd.get_dummies(cleaned, columns=categorical_columns)

    # Separate features and target variable
    X = encoded.drop(columns=['HighSatisfaction'])
    y = encoded['HighSatisfaction']

    # Split BEFORE scaling so the scaler never sees the test rows
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Work on explicit copies so the column assignments below cannot touch
    # (or warn about) the frame the split was taken from.
    X_train = X_train.copy()
    X_test = X_test.copy()

    scaler = StandardScaler()
    X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
    X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

    print("Data cleaning and preparation complete.")

    return X_train, X_test, y_train, y_test

def random_forest():
    """Fit a 100-tree random forest on the prepared split and report test metrics.

    Calls clean_and_prepare_data() for the split, prints the test accuracy and
    the classification report, and returns the fitted classifier.
    """
    features_train, features_test, labels_train, labels_test = clean_and_prepare_data()

    # Same hyper-parameters and seed on every call
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(features_train, labels_train)

    predictions = forest.predict(features_test)

    # Compute both metrics before printing anything
    test_accuracy = accuracy_score(labels_test, predictions)
    test_report = classification_report(labels_test, predictions)

    print(f"Accuracy: {test_accuracy}")
    print("Classification Report:")
    print(test_report)
    return forest

def tensorflow():
    """Train a small dense neural network on the prepared split and report test metrics.

    The features returned by clean_and_prepare_data() are standardized again
    here (all columns, fitted on the training split), then fed to a
    32-16-1 fully connected network trained for 50 epochs with batch size 10.
    The test split is also passed as validation data during fitting.

    Prints the test accuracy and classification report and returns the fitted
    Keras model.
    """
    train_x, test_x, train_y, test_y = clean_and_prepare_data()

    # Standardize every feature column; statistics come from the training split
    feature_scaler = StandardScaler()
    train_x = feature_scaler.fit_transform(train_x)
    test_x = feature_scaler.transform(test_x)

    # Two ReLU hidden layers and a sigmoid output for the binary target
    network = Sequential([
        Dense(32, input_dim=train_x.shape[1], activation='relu'),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    network.fit(
        train_x,
        train_y,
        epochs=50,
        batch_size=10,
        validation_data=(test_x, test_y),
        verbose=1,
    )

    # Evaluate, then threshold the predicted probabilities at 0.5
    _loss, accuracy = network.evaluate(test_x, test_y, verbose=0)
    predicted_labels = (network.predict(test_x) > 0.5).astype("int32")

    print(f"Neural Network Accuracy: {accuracy}")
    print("Neural Network Classification Report:")
    print(classification_report(test_y, predicted_labels))
    return network
    
    
def logistic_reg():
    """Fit a logistic regression on the prepared split and report test metrics.

    Calls clean_and_prepare_data() for the split, prints the test accuracy and
    the classification report, and returns the fitted classifier.
    """
    features_train, features_test, labels_train, labels_test = clean_and_prepare_data()

    # max_iter raised to 1000 iterations for the solver
    classifier = LogisticRegression(max_iter=1000, random_state=42)
    classifier.fit(features_train, labels_train)

    predictions = classifier.predict(features_test)

    # Compute both metrics before printing anything
    test_accuracy = accuracy_score(labels_test, predictions)
    test_report = classification_report(labels_test, predictions)

    print(f"Logistic Regression Accuracy: {test_accuracy}")
    print("Logistic Regression Classification Report:")
    print(test_report)
    return classifier

2024-07-23 23:38:41.659426: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
def model(name):
    """Train and return the model selected by ``name``.

    Recognised names are 'random_forest', 'logistic_regression' and
    'tensorflow'. Any other value prints 'no model found' and returns None.
    """
    if name == 'random_forest':
        return random_forest()
    if name == 'logistic_regression':
        return logistic_reg()
    if name == 'tensorflow':
        return tensorflow()

    # Fallback: nothing matched
    print('no model found')

In [3]:
# Train and report the random forest and logistic regression baselines.
# Each call loads and splits the CSV again through clean_and_prepare_data().
random_forest()
logistic_reg()

Data cleaning and preparation complete.
Accuracy: 0.8766666666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.93       259
           1       1.00      0.10      0.18        41

    accuracy                           0.88       300
   macro avg       0.94      0.55      0.56       300
weighted avg       0.89      0.88      0.83       300

Data cleaning and preparation complete.
Logistic Regression Accuracy: 0.91
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       259
           1       0.75      0.51      0.61        41

    accuracy                           0.91       300
   macro avg       0.84      0.74      0.78       300
weighted avg       0.90      0.91      0.90       300



In [None]:
import lime
import lime.lime_tabular
import shap


def lime(x_test, x_train, model, instance_index=0, num_features=10):
    """Explain one test-set prediction with LIME and return its ranked feature weights.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Test features; the row at ``instance_index`` is explained.
    x_train : pandas.DataFrame
        Training features used to initialise the explainer.
    model : object
        Fitted classifier exposing ``predict_proba``.
    instance_index : int
        Positional index of the test row to explain (default 0).
    num_features : int
        Number of features to include in the explanation (default 10).

    Returns
    -------
    list of (str, float)
        The (feature description, weight) pairs from ``Explanation.as_list()``.
    """
    # This function is itself called `lime`, so once this def has run the global
    # name `lime` is the function, not the package, and `lime.lime_tabular`
    # would raise AttributeError. Import the explainer class locally instead.
    from lime.lime_tabular import LimeTabularExplainer

    # Initialize LIME explainer
    explainer = LimeTabularExplainer(
        x_train.values,
        feature_names=list(x_train.columns),
        class_names=['Low Satisfaction', 'High Satisfaction'],
        mode='classification',
    )

    # Explain a prediction
    exp = explainer.explain_instance(
        x_test.iloc[instance_index].values,
        model.predict_proba,
        num_features=num_features,
    )
    exp.show_in_notebook()

    # Get ranked feature importance
    importance = exp.as_list()
    print("Ranked feature importance (LIME):")
    for feature, weight in importance:
        print(f"{feature}: {weight}")
    return importance
    
def shap(x_test, model):
    """Explain a tree model with SHAP and return the first test row's ranked contributions.

    Draws a force plot for the first test row and a summary plot over all rows,
    both for the positive class, then prints and returns the first row's SHAP
    values sorted from largest to smallest.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Test features to explain.
    model : object
        Fitted tree-based model accepted by ``shap.TreeExplainer``.

    Returns
    -------
    pandas.Series
        SHAP values of the first test row for the positive class, indexed by
        feature name and sorted in descending order.
    """
    # This function is itself called `shap`, so once this def has run the global
    # name `shap` is the function, not the package, and `shap.TreeExplainer`
    # would raise AttributeError. Import the package locally under another name.
    import numpy as np
    import shap as shap_lib

    # Initialize SHAP explainer
    explainer = shap_lib.TreeExplainer(model)

    # Explain predictions for the test set
    shap_values = explainer.shap_values(x_test)

    # Pick out the positive-class values. A list holds one
    # (n_samples, n_features) array per class; a 3-D array is assumed to be
    # (n_samples, n_features, n_classes) -- NOTE(review): confirm against the
    # installed SHAP version. A 2-D array is used as-is.
    if isinstance(shap_values, list):
        positive_values = np.asarray(shap_values[1])
    else:
        stacked = np.asarray(shap_values)
        positive_values = stacked[..., 1] if stacked.ndim == 3 else stacked

    # expected_value may be a scalar or hold one entry per class
    base_values = np.ravel(explainer.expected_value)
    positive_base = base_values[1] if base_values.size > 1 else base_values[0]

    # Plot SHAP values for a single prediction.
    # NOTE(review): the object returned by force_plot is discarded here, so it
    # may need an explicit display() call to show up in the notebook.
    shap_lib.initjs()
    shap_lib.force_plot(positive_base, positive_values[0], x_test.iloc[0])

    # Summary plot for all predictions
    shap_lib.summary_plot(positive_values, x_test)

    # SHAP values for the first test instance, positive class
    single_shap_values = positive_values[0]
    importance = pd.Series(single_shap_values, index=x_test.columns).sort_values(ascending=False)
    print("Ranked feature importance (SHAP):")
    print(importance)
    return importance


In [None]:
import math
import numpy as np
from scipy.stats import pearsonr
import random
from scipy.stats import kendalltau


def generate_lists(n):
    """Return a random ranking of 1..n and a lightly perturbed copy of it.

    The first list is a random permutation of the integers 1..n. The second
    starts as a copy and then has round(0.2 * n) random pairwise swaps applied.
    Each swap exchanges two positions, so at most twice that many positions
    differ between the two lists.
    """
    population = list(range(1, n + 1))
    size = len(population)

    # Random permutation plus an independent copy to perturb
    shuffled = random.sample(population, size)
    perturbed = list(shuffled)

    # Number of pairwise swaps applied to the copy
    swap_count = round(size * 0.2)

    positions = range(size)
    for _ in range(swap_count):
        first, second = random.sample(positions, 2)
        perturbed[first], perturbed[second] = perturbed[second], perturbed[first]

    return shuffled, perturbed


def weightage_calculator(p,d):
    """Return the weight that the first ``d`` ranks carry in an RBO score.

    Evaluates
        1 - p**(d-1) + ((1-p)/p) * d * (ln(1/(1-p)) - sum_{i=1}^{d-1} p**i / i)
    for persistence parameter ``p`` and prefix depth ``d``. The result is a
    fraction, not a percentage.
    """
    # Partial series sum_{i=1}^{d-1} p^i / i (range stops at d-1)
    partial_series = 0
    for i in range(1, d):
        partial_series += math.pow(p, i) / i

    # Remainder of the series for ln(1/(1-p)) beyond the first d-1 terms
    series_tail = np.log(1/(1-p)) - partial_series

    return 1 - math.pow(p, d-1) + (((1-p)/p) * d * series_tail)



def rbo(S,T, p= 0.9):
    """ Takes two lists S and T of any lengths and gives out the RBO Score
    Parameters
    ----------
    S, T : Lists (str, integers)
        Rankings to compare; items must be hashable.
    p : Weight parameter, giving the influence of the first d
        elements on the final score. 0 < p < 1. Default 0.9 gives the top 10
        elements 86% of the contribution in the final score.
        p = 0 raises ZeroDivisionError.

    Returns
    -------
    Float of RBO score. Two empty lists are treated as identical rankings and
    score 1.0; one empty and one non-empty list score 0.

    Notes
    -----
    When the lists differ in length, the shorter one simply stops contributing
    new items past its end. NOTE(review): this is not the uneven-list
    extrapolation from the RBO paper; confirm it is the intended treatment.
    """

    # Fixed Terms
    k = max(len(S), len(T))
    if k == 0:
        # Nothing to compare; avoids dividing by k == 0 below
        return 1.0

    len_s, len_t = len(S), len(T)

    # Items seen so far in each list, and how many are common to both.
    # Updating these one rank at a time gives the same overlap as rebuilding
    # set(S[:d]) & set(T[:d]) at every depth, without the quadratic cost.
    seen_s, seen_t = set(), set()
    overlap = 0

    summation_term = 0

    # Loop for summation
    # k+1 for the loop to reach the last element (at k) in the bigger list
    for d in range(1, k + 1):
        if d <= len_s:
            item = S[d - 1]
            if item not in seen_s:
                seen_s.add(item)
                if item in seen_t:
                    overlap += 1
        if d <= len_t:
            item = T[d - 1]
            if item not in seen_t:
                seen_t.add(item)
                if item in seen_s:
                    overlap += 1

        # Agreement at depth d
        a_d = overlap / d

        # Summation
        summation_term = summation_term + math.pow(p, d) * a_d

    # After the last depth, overlap equals the size of set(S) & set(T)
    x_k = overlap

    # Rank Biased Overlap - extrapolated
    rbo_ext = (x_k/k) * math.pow(p, k) + ((1-p)/p * summation_term)

    return rbo_ext
# weightage_calculator returns a fraction, so scale it by 100 before printing
# it with a percent sign.
top10_weight = weightage_calculator(0.9, 10)
print(f"Using P = 0.9 and D = 10, the top 10 items in the list attribute {100 * top10_weight:.3g}% of the total ranking score")

# Seed the generator so a fresh Restart & Run All reproduces the printed means.
random.seed(42)

# Number of random list pairs drawn per experiment
N_TRIALS = 10000

# Items 1..100 to be ranked
numbers = list(range(1, 101))


# --- Experiment 1: two independent random permutations of the same 100 items ---
# Every experiment starts from fresh result lists so that each printed mean
# describes that experiment only.
results = []
results10 = []
kendal = []
kendalp = []
kendalT = []
kendal10 = []
kendal10T = []
kendal10p = []

print("Generating 10,000 Random List-Pairs..")
for j in range(1, N_TRIALS + 1):
    list1 = random.sample(numbers, len(numbers))
    list2 = random.sample(numbers, len(numbers))

    tau, p_value = kendalltau(list1, list2)
    kendal.append(tau)
    kendalT.append((tau + 1) / 2)  # rescale tau from [-1, 1] to [0, 1]
    kendalp.append(p_value)

    tau, p_value = kendalltau(list1[:10], list2[:10])
    kendal10.append(tau)
    kendal10T.append((tau + 1) / 2)
    kendal10p.append(p_value)

    results.append(rbo(list1,list2))
    results10.append(rbo(list1[:10],list2[:10]))

print("\n")

print(f"WRBO Randomly Generated Results (Mean): {np.mean(results)}")
print(f"Kendal Randomly Generated Results (Mean): {np.mean(kendal)}")
print(f"Scaled Kendal Randomly Generated Results (Mean): {np.mean(kendalT)}")

print("\n")

print(f"Randomly Generated Results (Mean) Top 10 Only: {np.mean(results10)}")
print(f"Kendal Randomly Generated Results (Mean) Top 10: {np.mean(kendal10)}")
print(f"Scaled Kendal Randomly Generated Results (Mean) Top 10: {np.mean(kendal10T)}")

print("\n")


# --- Experiment 2: uneven list sizes (100 items vs. 90 items), RBO only ---
print("Generating 10,000 Random Uneven Sized List-Pairs (100/90)..")

results = []
results10 = []

for j in range(1, N_TRIALS + 1):
    list1 = random.sample(numbers, len(numbers))
    list2 = random.sample(numbers, len(numbers)-10)

    results.append(rbo(list1,list2))
    results10.append(rbo(list1[:10],list2[:10]))

print("\n")

print(f"WRBO Randomly Generated Results (Mean): {np.mean(results)}")
print(f"Randomly Generated Results (Mean) Top 10 Only: {np.mean(results10)}")


# --- Experiment 3: 20-item list pairs from generate_lists (a list and a copy
# with a few random pairwise swaps) ---
print("\nNow testing with at least 20% overlap...")

results = []
results10 = []
kendal = []
kendalp = []
kendalT = []
kendal10 = []
kendal10T = []
kendal10p = []

print("Generating 10,000 Random List-Pairs..")
for j in range(1, N_TRIALS + 1):
    list1, list2 = generate_lists(20)

    tau, p_value = kendalltau(list1, list2)
    kendal.append(tau)
    kendalT.append((tau + 1) / 2)
    kendalp.append(p_value)

    tau, p_value = kendalltau(list1[:10], list2[:10])
    kendal10.append(tau)
    kendal10T.append((tau + 1) / 2)
    kendal10p.append(p_value)

    results.append(rbo(list1,list2))
    results10.append(rbo(list1[:10],list2[:10]))

print("\n")

print(f"WRBO Randomly Generated Results (Mean): {np.mean(results)}")
print(f"Kendal Randomly Generated Results (Mean): {np.mean(kendal)}")
print(f"Scaled Kendal Randomly Generated Results (Mean): {np.mean(kendalT)}")

print("\n")

print(f"Randomly Generated Results (Mean) Top 10 Only: {np.mean(results10)}")
print(f"Kendal Randomly Generated Results (Mean) Top 10: {np.mean(kendal10)}")
print(f"Scaled Kendal Randomly Generated Results (Mean) Top 10: {np.mean(kendal10T)}")