K-Means Clustering

In [None]:
# Import necessary libraries
import pandas as pd
import joblib
from google.colab import drive
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import silhouette_score
import csv
import os
import numpy as np

# Attach Google Drive so the datasets and the saved model under /content/gdrive are reachable
drive.mount('/content/gdrive')

# Module-level slots for the fitted TF-IDF vectorizers; train_model() assigns both
title_tfidf = category_tfidf = None

# Vectorize a single (title, category) pair with already-fitted TF-IDF transformers
def transform_input(title, category, title_tfidf, category_tfidf):
    """Return one sparse feature row: title TF-IDF columns followed by category TF-IDF columns.

    Each vectorizer is given a one-element list, so the result has a single row.
    """
    blocks = [
        vectorizer.transform([text])
        for vectorizer, text in ((title_tfidf, title), (category_tfidf, category))
    ]
    return hstack(blocks)

# Train the KMeans clustering model
def train_model(train_data, val_data, num_clusters=10, random_state=None,
                model_path='/content/gdrive/My Drive/Recommendation/Datasets/k_means_model.joblib'):
    """Fit TF-IDF vectorizers and a KMeans pipeline on the train + validation rows.

    Side effects: assigns the module-level ``title_tfidf`` / ``category_tfidf``,
    writes ``(model, title_tfidf, category_tfidf)`` to ``model_path`` with joblib,
    and prints the silhouette score of the fitted clustering.

    Args:
        train_data: DataFrame with 'title' and 'categoryName' columns.
        val_data: DataFrame with the same columns; it is appended after
            ``train_data`` before fitting, so it is part of the fitted data.
        num_clusters: Number of KMeans clusters.
        random_state: Passed to KMeans. The default (None) keeps the previous
            unseeded behaviour; pass an int for repeatable cluster labels.
        model_path: Where the fitted objects are saved.
    """
    global title_tfidf, category_tfidf  # published for the code that builds feature matrices later

    # Row i of this frame is row i of the feature matrix and label i of the model
    combined_data = pd.concat([train_data, val_data], axis=0, ignore_index=True)

    # One vocabulary per text column
    title_tfidf = TfidfVectorizer().fit(combined_data['title'])
    category_tfidf = TfidfVectorizer().fit(combined_data['categoryName'])

    # Title columns first, category columns second; CSR so rows can be sliced
    X_combined_transformed = hstack([
        title_tfidf.transform(combined_data['title']),
        category_tfidf.transform(combined_data['categoryName']),
    ]).tocsr()

    # Kept as a one-step pipeline: the saved model is accessed via named_steps['kmeans']
    model = make_pipeline(KMeans(n_clusters=num_clusters, random_state=random_state))
    model.fit(X_combined_transformed)

    joblib.dump((model, title_tfidf, category_tfidf), model_path)

    # The silhouette score ranges from -1 to 1; higher means rows sit closer to their
    # own cluster than to neighbouring ones. It is computed over every fitted row
    # (train + validation), using the labels assigned during fit.
    silhouette_avg = silhouette_score(X_combined_transformed, model.named_steps['kmeans'].labels_)
    print(f"Silhouette Score: {silhouette_avg}")


# Recommend products based on the input
def recommend_product(title, category, model_path, train_data, X_combined_transformed, num_recommendations=1):
    """Return the titles most similar to the input among rows in its KMeans cluster.

    Args:
        title: Product title to find matches for.
        category: Category name of that product.
        model_path: joblib file holding ``(model, title_tfidf, category_tfidf)``.
        train_data: DataFrame whose row order matches the rows the model was
            fitted on; titles are read from its 'title' column by position.
        X_combined_transformed: Sparse TF-IDF feature matrix with one row per
            row of ``train_data``.
        num_recommendations: Maximum number of titles to return.

    Returns:
        A list of titles ordered by descending dot-product similarity; rows with
        equal scores keep their original relative order.
    """
    # NOTE: joblib.load unpickles the file, so only load model files you trust.
    # The file is re-read on every call because the signature takes a path.
    model, title_tfidf, category_tfidf = joblib.load(model_path)
    kmeans = model.named_steps['kmeans']

    input_features = transform_input(title, category, title_tfidf, category_tfidf)

    # Rows that were assigned to the same cluster the input falls into
    cluster = kmeans.predict(input_features)[0]
    cluster_indices = np.where(kmeans.labels_ == cluster)[0]

    # CSR guarantees row slicing works whatever format hstack returned
    feature_matrix = X_combined_transformed.tocsr()

    # One sparse product scores every cluster member at once
    scores = (feature_matrix[cluster_indices] @ input_features.T).toarray().ravel()

    # Stable sort on the negated scores: descending, ties in original order
    order = np.argsort(-scores, kind='stable')[:num_recommendations]
    return [train_data.iloc[index]['title'] for index in cluster_indices[order]]

# Make recommendations updating the CSV file
def get_recommendations(model_path, train_data, recommendation_column, X_combined_transformed=None):
    """Write the top KMeans recommendation for every row of recommendations.csv.

    The CSV is rewritten through a temporary file which then replaces the
    original, so a failure part-way through leaves the original untouched.

    Args:
        model_path: joblib file holding ``(model, title_tfidf, category_tfidf)``.
        train_data: DataFrame of the rows the model was fitted on, in the same order.
        recommendation_column: Name of the CSV column to fill (added if missing).
        X_combined_transformed: Optional sparse feature matrix for ``train_data``.
            When omitted it is rebuilt from the vectorizers stored in
            ``model_path`` instead of being looked up as a module-level global.
    """
    file_path = "/content/gdrive/My Drive/Recommendation/Datasets/recommendations.csv"

    # Create a temporary file to write the updated data
    temp_file_path = "/content/gdrive/My Drive/Recommendation/Datasets/temp_recommendations.csv"

    if X_combined_transformed is None:
        # Use the vectorizers saved with the model so the columns match what it was fitted on
        _, saved_title_tfidf, saved_category_tfidf = joblib.load(model_path)
        X_combined_transformed = hstack([
            saved_title_tfidf.transform(train_data['title']),
            saved_category_tfidf.transform(train_data['categoryName']),
        ]).tocsr()

    # newline='' on both handles lets the csv module handle line endings itself
    with open(file_path, 'r', newline='') as infile, open(temp_file_path, 'w', newline='') as outfile:
        csv_reader = csv.DictReader(infile)

        # Copy the header (empty list for an empty file) and add the target column if needed
        fieldnames = list(csv_reader.fieldnames or [])
        if recommendation_column not in fieldnames:
            fieldnames.append(recommendation_column)

        csv_writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        csv_writer.writeheader()

        for row in csv_reader:
            recommended_titles = recommend_product(
                row['title'], row['categoryName'], model_path, train_data, X_combined_transformed, 1
            )

            # Leave the cell blank rather than crash when nothing is recommended
            row[recommendation_column] = recommended_titles[0] if recommended_titles else ''
            csv_writer.writerow(row)

    # Replace the original file with the temporary file
    os.replace(temp_file_path, file_path)

# Load the training and validation splits from Google Drive
train_data = pd.read_csv('/content/gdrive/My Drive/Recommendation/Datasets/train_set.csv')
validation_data = pd.read_csv('/content/gdrive/My Drive/Recommendation/Datasets/validation_set.csv')

# Fit the vectorizers and KMeans on both splits; this also assigns the module-level
# title_tfidf / category_tfidf used just below and saves k_means_model.joblib
train_model(train_data, validation_data)

# Rebuild the train + validation frame in the same order train_model concatenates it,
# so row i here lines up with cluster label i of the fitted model
train_data_combined = pd.concat([train_data, validation_data], ignore_index=True)
# Feature matrix for every row of train_data_combined (title columns, then category columns);
# the recommendation step indexes it by the same row positions as train_data_combined
X_combined_transformed = hstack([
    title_tfidf.transform(train_data_combined['title']),
    category_tfidf.transform(train_data_combined['categoryName'])
])

# Fill the 'kMeansRecommendation' column of recommendations.csv using the saved model
get_recommendations('/content/gdrive/My Drive/Recommendation/Datasets/k_means_model.joblib', train_data_combined, 'kMeansRecommendation')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).




Silhouette Score: 0.12953924791119417


Content-Based Filtering (CBF)

Content-Based Filtering doesn't involve a separate training phase like collaborative filtering or some machine learning models. Instead, it relies on creating representations of items (in this case, product titles and categories) and making recommendations based on the similarity of these representations.

In the code below, the only fitting step is building the TF-IDF vocabularies for product titles and categories with TfidfVectorizer. The fitted vectorizers (title_tfidf and category_tfidf) are then used at recommendation time to turn both the query product and the training products into feature vectors.

In these functions, the TF-IDF vectorizers (title_tfidf and category_tfidf) are used to transform input titles, categories, and titles/categories in the training data into numerical feature vectors. The recommend_product_cbf function then calculates the similarity scores using these feature vectors for recommendation.

In [None]:
import pandas as pd
import joblib
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import csv
import os
import numpy as np

# Mount Google Drive at /content/gdrive so the dataset CSVs under "My Drive" can be read and rewritten
drive.mount('/content/gdrive')

# Vectorize one product with the fitted TF-IDF transformers
# transform_input: takes a product title, a category, and the fitted TF-IDF vectorizers (title_tfidf and category_tfidf) and returns the dense feature row used for similarity scoring.
def transform_input(title, category, title_tfidf, category_tfidf):
    """Return a dense single-row array: title TF-IDF columns followed by category TF-IDF columns."""
    dense_parts = []
    for vectorizer, text in ((title_tfidf, title), (category_tfidf, category)):
        # Each vectorizer gets a one-element list, so every part has exactly one row
        dense_parts.append(vectorizer.transform([text]).toarray())
    return np.concatenate(dense_parts, axis=1)

# Fit the TF-IDF vectorizers
def train_tfidf_transformers(train_data):
    """Fit one TfidfVectorizer per text column and return them as (title_tfidf, category_tfidf)."""
    fitted_title = TfidfVectorizer()
    fitted_title.fit(train_data['title'])

    fitted_category = TfidfVectorizer()
    fitted_category.fit(train_data['categoryName'])

    return fitted_title, fitted_category

# Stack the TF-IDF features of every training row into one sparse matrix
def _build_cbf_matrix(train_data, title_tfidf, category_tfidf):
    """Return a CSR matrix with one row per row of train_data (title columns, then category columns)."""
    # Local import: this cell does not import scipy.sparse at the top
    from scipy.sparse import hstack as sparse_hstack

    return sparse_hstack([
        title_tfidf.transform(train_data['title']),
        category_tfidf.transform(train_data['categoryName']),
    ], format='csr')


# Recommend products based on the input using Content-Based Filtering
def recommend_product_cbf(title, category, title_tfidf, category_tfidf, train_data,
                          num_recommendations=1, train_data_transformed=None):
    """Return the training titles most similar to the given product.

    Similarity is the linear kernel (dot product) between the TF-IDF features of
    the input and of every row in ``train_data``.

    Args:
        title: Product title to find matches for.
        category: Category name of that product.
        title_tfidf: Fitted vectorizer for titles.
        category_tfidf: Fitted vectorizer for category names.
        train_data: DataFrame with 'title' and 'categoryName' columns.
        num_recommendations: Maximum number of titles to return.
        train_data_transformed: Optional precomputed feature matrix for
            ``train_data``. Pass it when calling repeatedly so the training
            set is not re-vectorized on every call; built here when omitted.

    Returns:
        A list of titles ordered by descending similarity.
    """
    # Transform input data using the TF-IDF transformers
    input_features = transform_input(title, category, title_tfidf, category_tfidf)

    if train_data_transformed is None:
        # Kept sparse: densifying a whole TF-IDF matrix is the expensive part
        train_data_transformed = _build_cbf_matrix(train_data, title_tfidf, category_tfidf)

    # Score the input against every training row
    similarity_scores = linear_kernel(input_features.reshape(1, -1), train_data_transformed).flatten()

    # Highest score first, then keep only the requested number of rows
    sorted_products = np.argsort(similarity_scores)[::-1][:num_recommendations]

    return [train_data.iloc[index]['title'] for index in sorted_products]


# Make recommendations updating the CSV file using Content-Based Filtering
def get_cbf_recommendations(train_data, recommendation_column):
    """Write the top content-based recommendation for every row of recommendations.csv.

    The vectorizers are fitted and the training feature matrix is built once,
    then reused for every CSV row. The CSV is rewritten through a temporary
    file which then replaces the original.

    Args:
        train_data: DataFrame with 'title' and 'categoryName' columns to recommend from.
        recommendation_column: Name of the CSV column to fill (added if missing).
    """
    file_path = "/content/gdrive/My Drive/Recommendation/Datasets/recommendations.csv"

    # Create a temporary file to write the updated data
    temp_file_path = "/content/gdrive/My Drive/Recommendation/Datasets/temp_recommendations.csv"

    # Fit the TF-IDF transformers and vectorize the training set a single time
    title_tfidf, category_tfidf = train_tfidf_transformers(train_data)
    train_matrix = _build_cbf_matrix(train_data, title_tfidf, category_tfidf)

    # newline='' on both handles lets the csv module handle line endings itself
    with open(file_path, 'r', newline='') as infile, open(temp_file_path, 'w', newline='') as outfile:
        csv_reader = csv.DictReader(infile)

        # Copy the header (empty list for an empty file) and add the target column if needed
        fieldnames = list(csv_reader.fieldnames or [])
        if recommendation_column not in fieldnames:
            fieldnames.append(recommendation_column)

        csv_writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        csv_writer.writeheader()

        for row in csv_reader:
            recommended_titles = recommend_product_cbf(
                row['title'], row['categoryName'], title_tfidf, category_tfidf, train_data, 1,
                train_data_transformed=train_matrix,
            )

            # Leave the cell blank rather than crash when nothing is recommended
            row[recommendation_column] = recommended_titles[0] if recommended_titles else ''
            csv_writer.writerow(row)

    # Replace the original file with the temporary file
    os.replace(temp_file_path, file_path)

# Load the training and validation splits from Google Drive
# (validation_data is read here but not used anywhere in this cell)
train_data = pd.read_csv('/content/gdrive/My Drive/Recommendation/Datasets/train_set.csv')
validation_data = pd.read_csv('/content/gdrive/My Drive/Recommendation/Datasets/validation_set.csv')

# Fill the 'cbfRecommendation' column of recommendations.csv, recommending from the training split only
get_cbf_recommendations(train_data, 'cbfRecommendation')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Hierarchical Clustering

In [2]:
import pandas as pd
import joblib
from google.colab import drive
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import csv
import os
import numpy as np

# Make the Drive folder with the datasets and saved models available at /content/gdrive
drive.mount('/content/gdrive')

# Placeholders for the fitted TF-IDF vectorizers; train_model() assigns them and
# recommend_product() reads them as module-level names
title_tfidf = category_tfidf = None

# Build the feature row for one product from fitted TF-IDF vectorizers
def transform_input(title, category, title_tfidf, category_tfidf):
    """Vectorize ``title`` and ``category`` separately and join them side by side.

    Returns a sparse single-row matrix whose columns are the title features
    followed by the category features.
    """
    title_row, category_row = (
        title_tfidf.transform([title]),
        category_tfidf.transform([category]),
    )
    return hstack([title_row, category_row])

# Train the Hierarchical Clustering model
def train_model(train_data, val_data, num_clusters=2,
                model_path='/content/gdrive/My Drive/Recommendation/Datasets/hierarchical_model.joblib'):
    """Fit TF-IDF vectorizers and an AgglomerativeClustering model on train + validation rows.

    Side effects: assigns the module-level ``title_tfidf`` / ``category_tfidf``,
    writes ``(model, title_tfidf, category_tfidf)`` to ``model_path`` with joblib,
    and prints the silhouette score of the fitted clustering.

    Args:
        train_data: DataFrame with 'title' and 'categoryName' columns.
        val_data: DataFrame with the same columns; it is appended after
            ``train_data`` before fitting, so it is part of the fitted data.
        num_clusters: Number of clusters to form; must be at least 2.
        model_path: Where the fitted objects are saved.

    Raises:
        ValueError: If ``num_clusters`` is smaller than 2.
    """
    global title_tfidf, category_tfidf  # read later by recommend_product as module-level names

    if num_clusters < 2:
        # A silhouette score cannot be computed for a single cluster
        raise ValueError(f"num_clusters must be at least 2, got {num_clusters}")

    # Row i of this frame is row i of the feature matrix and label i of the model
    combined_data = pd.concat([train_data, val_data], axis=0, ignore_index=True)

    # One vocabulary per text column
    title_tfidf = TfidfVectorizer().fit(combined_data['title'])
    category_tfidf = TfidfVectorizer().fit(combined_data['categoryName'])

    X_combined_transformed = hstack([
        title_tfidf.transform(combined_data['title']),
        category_tfidf.transform(combined_data['categoryName']),
    ])

    # Densify once; the same dense copy is used for fitting and for scoring
    X_dense = X_combined_transformed.toarray()

    model = AgglomerativeClustering(n_clusters=num_clusters)
    model.fit(X_dense)

    joblib.dump((model, title_tfidf, category_tfidf), model_path)

    # Scored on the labels assigned during fit, i.e. over every train + validation row
    silhouette_avg = silhouette_score(X_dense, model.labels_)
    print(f"Silhouette Score: {silhouette_avg}")

# Recommend products based on the input
def recommend_product(title, category, model, train_data_combined, X_combined_transformed, num_recommendations=1):
    """Return the titles most similar to the input within the cluster it belongs to.

    The fitted model only exposes ``labels_`` for the rows it was fitted on, so
    the input is assigned the cluster of its most similar fitted row. Previously
    the cluster of row 0 was used for every input.

    The vectorizers are read from the module-level ``title_tfidf`` and
    ``category_tfidf``, which must already be fitted.

    Args:
        title: Product title to find matches for.
        category: Category name of that product.
        model: Fitted clustering model exposing ``labels_``.
        train_data_combined: DataFrame whose row order matches ``model.labels_``.
        X_combined_transformed: Sparse TF-IDF feature matrix with one row per
            row of ``train_data_combined``.
        num_recommendations: Maximum number of titles to return.

    Returns:
        A list of titles ordered by descending dot-product similarity; rows with
        equal scores keep their original relative order.
    """
    # Transform input data using the TF-IDF transformers
    input_features = transform_input(title, category, title_tfidf, category_tfidf)

    labels = np.asarray(model.labels_)
    if labels.size == 0:
        return []

    # One sparse product scores the input against every fitted row
    all_scores = (X_combined_transformed.tocsr() @ input_features.T).toarray().ravel()

    # Adopt the cluster of the best-matching row. If nothing overlaps (all scores
    # are zero) argmax returns 0, which falls back to the cluster of row 0.
    query_cluster = labels[np.argmax(all_scores)]
    cluster_indices = np.where(labels == query_cluster)[0]

    # Stable sort on the negated scores: descending, ties in original order
    order = np.argsort(-all_scores[cluster_indices], kind='stable')[:num_recommendations]
    return [train_data_combined.iloc[index]['title'] for index in cluster_indices[order]]

# Make recommendations updating the CSV file
def get_recommendations(model_path, train_data_combined, recommendation_column, X_combined_transformed):
    """Write the top hierarchical-clustering recommendation for every row of recommendations.csv.

    Also prints a sample recommendation for the first row of
    ``train_data_combined``. The CSV is rewritten through a temporary file
    which then replaces the original.

    Args:
        model_path: joblib file holding ``(model, title_tfidf, category_tfidf)``.
        train_data_combined: DataFrame of the rows the model was fitted on, in the same order.
        recommendation_column: Name of the CSV column to fill (added if missing).
        X_combined_transformed: Sparse feature matrix for ``train_data_combined``.
    """
    # recommend_product reads the vectorizers as module-level names, so publish the
    # ones saved with the model; this also works in a session where train_model was not run.
    global title_tfidf, category_tfidf

    # NOTE: joblib.load unpickles the file, so only load model files you trust.
    model, title_tfidf, category_tfidf = joblib.load(model_path)

    # Sample run on the first row (positional, so it does not depend on the index labels)
    recommended_titles = recommend_product(
        train_data_combined['title'].iloc[0],
        train_data_combined['categoryName'].iloc[0],
        model,
        train_data_combined,
        X_combined_transformed,
        1
    )

    print(f'Recommended titles: {recommended_titles}')

    file_path = "/content/gdrive/My Drive/Recommendation/Datasets/recommendations.csv"

    # Create a temporary file to write the updated data
    temp_file_path = "/content/gdrive/My Drive/Recommendation/Datasets/temp_recommendations.csv"

    # newline='' on both handles lets the csv module handle line endings itself
    with open(file_path, 'r', newline='') as infile, open(temp_file_path, 'w', newline='') as outfile:
        csv_reader = csv.DictReader(infile)

        # Copy the header (empty list for an empty file) and add the target column if needed
        fieldnames = list(csv_reader.fieldnames or [])
        if recommendation_column not in fieldnames:
            fieldnames.append(recommendation_column)

        csv_writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        csv_writer.writeheader()

        for row in csv_reader:
            recommended_titles = recommend_product(
                row['title'], row['categoryName'], model, train_data_combined, X_combined_transformed, 1
            )

            # Leave the cell blank rather than crash when nothing is recommended
            row[recommendation_column] = recommended_titles[0] if recommended_titles else ''
            csv_writer.writerow(row)

    # Replace the original file with the temporary file
    os.replace(temp_file_path, file_path)

# Load the training and validation splits from Google Drive
train_data = pd.read_csv('/content/gdrive/My Drive/Recommendation/Datasets/train_set.csv')
validation_data = pd.read_csv('/content/gdrive/My Drive/Recommendation/Datasets/validation_set.csv')

# Fit the vectorizers and the clustering model on both splits; this also assigns the
# module-level title_tfidf / category_tfidf used just below and saves hierarchical_model.joblib
train_model(train_data, validation_data)

# Rebuild the train + validation frame in the same order train_model concatenates it,
# so row i here lines up with cluster label i of the fitted model
train_data_combined = pd.concat([train_data, validation_data], ignore_index=True)
# Feature matrix for every row of train_data_combined (title columns, then category columns)
X_combined_transformed = hstack([
    title_tfidf.transform(train_data_combined['title']),
    category_tfidf.transform(train_data_combined['categoryName'])
])

# Fill the 'hierarchicalRecommendation' column of recommendations.csv using the saved model
get_recommendations('/content/gdrive/My Drive/Recommendation/Datasets/hierarchical_model.joblib', train_data_combined, 'hierarchicalRecommendation', X_combined_transformed)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Silhouette Score: 0.11334528177046156
Recommended titles: ['HYZUO Laptop Backpack with USB Charging Port Anti-Theft Water Resistant Slim Stylish College School Backpack Business Travel Bag Fits Up to 15.6 Inch Laptop for Men and Women, Light Grey']
