In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
from google.colab import drive

drive.mount("/content/drive")

def recommend_coffees(user_ranks):
    # Define columns
    numerical_cols = ['rating', 'acidity_structure', 'aftertaste', 'aroma', 'body', 'flavor', 'with_milk']
    categorical_cols = ['roast_level', 'price_category']

    # Load the datasets
    file_path = '/content/drive/MyDrive/Cleaned_Data_onehot.csv'
    df_cleaned_onehot = pd.read_csv(file_path)
    file_path = '/content/drive/MyDrive/Cleaned_Data.csv'
    df_cleaned = pd.read_csv(file_path)

    # Preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(), categorical_cols)
        ]
    )
    X_transformed = preprocessor.fit_transform(df_cleaned_onehot)
    one_hot_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
    all_columns = numerical_cols + list(one_hot_columns)
    df_transformed = pd.DataFrame(X_transformed, columns=all_columns)
    df_transformed_1 = pd.concat([df_transformed, df_cleaned['title']], axis=1)

    # PCA for dimensionality reduction
    pca = PCA(n_components=0.95)
    df_reduced = pca.fit_transform(df_transformed)

    # Determine the optimal number of clusters
    range_n_clusters = [3, 4, 5, 7, 9]
    best_n_clusters = 0
    best_silhouette = -1

    for n_clusters in range_n_clusters:
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
        clusters = kmeans.fit_predict(df_reduced)
        silhouette_avg = silhouette_score(df_reduced, clusters)

        if silhouette_avg > best_silhouette:
            best_silhouette = silhouette_avg
            best_n_clusters = n_clusters

    # Perform K-Means with the best number of clusters
    kmeans = KMeans(n_clusters=best_n_clusters, init='k-means++', random_state=112)
    best_clusters = kmeans.fit_predict(df_reduced)
    df_transformed_1['best_cluster'] = best_clusters

    # User Preferences
    user_conditions = get_user_conditions(user_ranks, numerical_cols, categorical_cols)
    user_preferences = get_user_preferences_from_conditions(user_conditions, numerical_cols, categorical_cols)
    user_df = pd.DataFrame([user_preferences])

    # Start by ensuring all numerical columns are present
    for col in numerical_cols:
        if col not in user_df.columns:
            user_df[col] = 0

    # Apply one-hot encoding to categorical columns if they were one-hot encoded in the training data
    user_df_encoded = pd.get_dummies(user_df, columns=categorical_cols)

    # Add missing one-hot encoded columns with default values (0)
    for col in df_transformed.columns:
        if col not in user_df_encoded.columns:
            user_df_encoded[col] = 0

    # Ensure the order of columns in user_df_encoded matches that of df_transformed
    user_df_encoded = user_df_encoded[df_transformed.columns]

    # Only apply StandardScaler to numerical columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols)
        ], remainder='passthrough'  # Include this to keep other columns unchanged
    )

    preprocessor.fit(df_transformed)
    user_transformed = preprocessor.transform(user_df_encoded)

    # Apply PCA
    new_user_pca = pca.transform(user_transformed)

    # Predict the cluster
    user_cluster = kmeans.predict(new_user_pca)
    selected_cluster = user_cluster[0]

    # Filter coffees from the selected cluster
    vendor_coffees = df_transformed_1[df_transformed_1['best_cluster'] == selected_cluster]

    # Transform the vendor coffees using the same preprocessor
    vendor_coffees_transformed = preprocessor.transform(vendor_coffees.iloc[:, :-1])  # Exclude 'title' column

    # Calculate similarity and rank coffees
    distances = cdist(user_transformed, vendor_coffees_transformed, metric='euclidean')
    vendor_coffees['similarity'] = distances.flatten()

    # Get the top 15 recommendations
    top_recommendations = vendor_coffees.nsmallest(15, 'similarity')

    return top_recommendations[['title', 'similarity']]

def get_user_conditions(user_ranks, numerical_cols, categorical_cols):
    rank_to_greater_than = {
        1: 2,
        2: 1,
        3: -1,
        4: -2,
        5: -3
    }

    conditions = {}
    for feature in numerical_cols:
        rank = user_ranks.get(feature, 0)
        conditions[feature] = {'greater_than': rank_to_greater_than.get(rank, 0)}

    for feature in categorical_cols:
        conditions[feature] = 'Medium-Dark' if feature == 'roast_level' else 'Low Price'

    return conditions

def get_user_preferences_from_conditions(conditions, numerical_cols, categorical_cols):
    user_preferences = {}

    for col, condition in conditions.items():
        if 'greater_than' in condition:
            user_preferences[col] = condition['greater_than']
        elif 'equal_to' in condition:
            user_preferences[col] = condition['equal_to']

    for col in categorical_cols:
        if col in conditions:
            user_preferences[col] = conditions[col]

    return user_preferences

# Example usage
user_ranks = {'rating': 1, 'acidity_structure': 2, 'aftertaste': 3, 'aroma': 4, 'body': 5}
top_15_recommendations = recommend_coffees(user_ranks)
print(top_15_recommendations)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




                               title  similarity
7013                    Vienna Roast    2.563450
5103    Kenyan Highland Cooperatives    2.672484
6558            Ethiopia Yirgacheffe    2.870113
6994                   Owner Reserve    2.876138
7004  Bayview Farms Kona Extra Fancy    2.894531
7011   Guatemala Finca Bella Carmona    2.913281
6243                       Guatemala    3.096443
6777       India Jumboor Estate MNEB    3.096443
6835                        Kenya AA    3.096443
6270          Timor Maubesse Organic    3.113271
6338              Deluxe House Blend    3.113271
6932       Yuletide Mocha-Java Decaf    3.120438
6355             Windsor Court Blend    3.153536
6814            Ethiopia Yirgacheffe    3.153536
7006               Kenya Nyaithee AA    3.167644


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vendor_coffees['similarity'] = distances.flatten()
