In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# ...
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [2]:
%cd /content/drive/Shared drives/Machine Learning/Recommendation Project/Guest User/

/content/drive/Shared drives/Machine Learning/Recommendation Project/Guest User


### Load and Review the Products Data

In [17]:
import pandas as pd

# List of encodings to try
encodings_to_try = ['utf-8', 'ISO-8859-1', 'latin1']

# Try reading the CSV file with different encodings
for encoding in encodings_to_try:
    try:
        df = pd.read_csv('Labeled_Products_2018.csv', encoding=encoding)
        # If reading is successful, break out of the loop
        break
    except UnicodeDecodeError:
        continue

# Display the first few rows of the DataFrame to inspect the data.
print(df.columns)
# Display counts of each column
column_counts = df.count()
print("\nCounts of Each Column:")
print(column_counts)
df.head

Index(['asin', 'title', 'description', 'price', 'first_imageURLHighRes',
       'Summary', 'year', 'Category'],
      dtype='object')

Counts of Each Column:
asin                     4530
title                    4530
description              4530
price                    4530
first_imageURLHighRes    4530
Summary                  4530
year                     4530
Category                 4530
dtype: int64


<bound method NDFrame.head of             asin                                              title  \
0     B00004U9V2  Crabtree &amp; Evelyn - Gardener's Ultra-Moist...   
1     B00005A77F  Crabtree &amp; Evelyn Hand Soap, Gardeners, 10...   
2     B00005NDTD                                 Soy Milk Hand Crme   
3     B00005V50C                     Supersmile Powdered Mouthrinse   
4     B00005V50B  Supersmile Professional Teeth Whitening Toothp...   
...          ...                                                ...   
4525  B01HBS87ZS  COSMEDIX Simply Brilliant 24/7 Brightening Tre...   
4526  B01HBS7WW2  COSMEDIX Phytoharmony, Balancing Moisturizer, ...   
4527  B01HBS7XP8  COSMEDIX Benefit Balance Antioxidant Infused T...   
4528  B01HGSJPWM  ELEMIS Frangipani Monoi Hand Cream and Nail Cr...   
4529  B01HIIO7Q4  Klorane Conditioner with Pomegranate - Color-T...   

                                            description    price  \
0     ['After a long day of handling thorny situa

In [4]:
from prettytable import PrettyTable

# Assuming 'df' is your DataFrame

# Selecting the first 5 rows and relevant columns
subset_df = df[['asin', 'description']].head(1)

# Creating a PrettyTable object
table = PrettyTable()

# Adding columns to the table
table.field_names = ["ASIN", "Raw Description"]

# Setting up word wrapping for the 'Summary' column
table.align['description'] = 'l'

# Populating the table with data
for _, row in subset_df.iterrows():
    # Wrapping the 'Summary' text
    wrapped_summary = "\n".join([row['description'][i:i + 200] for i in range(0, len(row['description']), 200)])
    table.add_row([row['asin'], wrapped_summary])

# Printing the table
print(table)


+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|    ASIN    |                                                                                             Raw Description                                                                                              |
+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| B00004U9V2 | ['After a long day of handling thorny situations, our new hand therapy pump is just the help you need. It contains shea butter as well as extracts of yarrow, clover and calendula to help soothe and co |
|            | ndition work-roughened hands.', 'By Crabtree & Evelyn', 'The aromatic benefits of herbs are varied and far-reachi

### Explore the List of Categories

In [5]:
# Display categories and counts in alphabetical order
category_counts = df['Category'].value_counts()
category_counts_sorted = category_counts.sort_index()

print("Categories and Counts Sorted by First Letter:")
for letter in sorted(set(category[0] for category in category_counts_sorted.index)):
    categories_starting_with_letter = category_counts_sorted[category_counts_sorted.index.str.startswith(letter)]
    print(f"\nStarting with letter '{letter}':")
    print(categories_starting_with_letter)

# Display total count of all categories
total_count = category_counts_sorted.sum()
print(f"\nTotal Count of All Categories: {total_count}")

# Display the number of unique categories
unique_category_count = len(category_counts_sorted)
print(f"\nNumber of Unique Categories: {unique_category_count}")


Categories and Counts Sorted by First Letter:

Starting with letter 'B':
Baby Care: Cleansers, Creams & Lotions         29
Beard & Mustache Care: Oils, Serums & Gels     17
Body Care: Butters & Oils                      25
Body Care: Cleansers                           96
Body Care: Exfoliants & Scrubs                 19
Body Care: Lotions & Mists                    162
Body Care: Mini Kits                           10
Body Care: Neck Creams & Ointments             14
Body Care: Tools & Accessories                 36
Name: Category, dtype: int64

Starting with letter 'E':
Eye Care: Brow Gels, Serums & Pencils      23
Eye Care: Brow Gels, Serums & Pencils      12
Eye Care: Eye Shadows                      47
Eye Care: Eyeliners                        49
Eye Care: Mascara                          37
Eye Care: Serums & Creams                 107
Eye Care: Tools & Accessories              12
Name: Category, dtype: int64

Starting with letter 'F':
Face Care: Cleansers                  145
F

### Implement the NLP Tasks

## 1. NLP Task: Text Cleaning

In [18]:
# Remove duplicates
df.drop_duplicates(inplace=True)

# Remove rows with missing values in relevant columns
df.dropna(subset=['Summary', 'Category'], inplace=True)

# Remove any special characters from the 'summary' column
df['summary_cleaned'] = df['Summary'].str.replace('[^a-zA-Z\s]', '', regex=True)
df['description_cleaned'] = df['description'].str.replace('[^a-zA-Z\s]', '', regex=True)
# Print the count of features
print("Count of Features After Data Cleaning:")
print(df.shape)
column_counts = df.count()
print("\nCounts of Each Column:")
print(column_counts)

Count of Features After Data Cleaning:
(4530, 10)

Counts of Each Column:
asin                     4530
title                    4530
description              4530
price                    4530
first_imageURLHighRes    4530
Summary                  4530
year                     4530
Category                 4530
summary_cleaned          4530
description_cleaned      4530
dtype: int64


In [7]:
from prettytable import PrettyTable

# Assuming 'df' is your DataFrame

# Selecting the first 5 rows and relevant columns
subset_df = df[['asin', 'description_cleaned']].head(1)

# Creating a PrettyTable object
table = PrettyTable()

# Adding columns to the table
table.field_names = ["ASIN", "Cleaned Product Description"]

# Setting up word wrapping for the 'Summary' column
table.align['description_cleaned'] = 'l'

# Populating the table with data
for _, row in subset_df.iterrows():
    # Wrapping the 'Summary' text
    wrapped_summary = "\n".join([row['description_cleaned'][i:i + 200] for i in range(0, len(row['description_cleaned']), 200)])
    table.add_row([row['asin'], wrapped_summary])

# Printing the table
print(table)

+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|    ASIN    |                                                                                       Cleaned Product Description                                                                                        |
+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| B00004U9V2 | After a long day of handling thorny situations our new hand therapy pump is just the help you need It contains shea butter as well as extracts of yarrow clover and calendula to help soothe and conditi |
|            | on workroughened hands By Crabtree  Evelyn The aromatic benefits of herbs are varied and farreaching so we combin

## 2. NLP Task: Implementing Tokenization, Stop Words Removal and Lemmatization


In [19]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the nltk data (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Lemmatization function
def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

# Tokenization function
def tokenize_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]

    return tokens

# Apply tokenization to the 'Summary' column
df['summary_tokens'] = df['summary_cleaned'].apply(tokenize_text)

# Apply lemmatization to the tokenized column
df['summary_lemmatized'] = df['summary_tokens'].apply(lemmatize_text)

# Display the DataFrame with the tokenized and lemmatized columns
print(df[['summary_cleaned', 'summary_tokens', 'summary_lemmatized']])


print(df.columns)



df.to_csv("Cleaned_Products_Data.csv")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                        summary_cleaned  \
0     Crabtree amp Evelyn  Gardeners UltraMoisturisi...   
1     Crabtree amp Evelyn Hand Soap Gardeners  fl oz...   
2     Soy Milk Hand Crme Remember why you love your ...   
3     Supersmile Powdered Mouthrinse PSTRONGPlease n...   
4     Supersmile Professional Teeth Whitening Toothp...   
...                                                 ...   
4525  COSMEDIX Simply Brilliant  Brightening Treatme...   
4526  COSMEDIX Phytoharmony Balancing Moisturizer  O...   
4527  COSMEDIX Benefit Balance Antioxidant Infused T...   
4528  ELEMIS Frangipani Monoi Hand Cream and Nail Cr...   
4529  Klorane Conditioner with Pomegranate  ColorTre...   

                                         summary_tokens  \
0     [crabtree, amp, evelyn, gardeners, ultramoistu...   
1     [crabtree, amp, evelyn, hand, soap, gardeners,...   
2     [soy, milk, hand, crme, remember, love, favori...   
3     [supersmile, powdered, mouthrinse, pstrongplea...

In [9]:
from prettytable import PrettyTable
from textwrap import wrap

# Assuming 'df' is your DataFrame
selected_asin = 'B00004U9V2'  # Replace with the specific ASIN you want to analyze

# Create a PrettyTable object
nlp_table = PrettyTable()

# Adding columns to the table
nlp_table.field_names = ["NLP Technique", "Before NLP"]

# Select the row for the specific ASIN
selected_row = df[df['asin'] == selected_asin].iloc[0]

# Tokenization
before_tokenization = selected_row['summary_tokens']

# Lemmatization
before_lemmatization = selected_row['summary_lemmatized']

# Function to wrap text
def wrap_text(text, width=30):
    if isinstance(text, list):
        text = ' '.join(text)
    return "\n".join(wrap(text, width=width))

# Populate the table with wrapped text
nlp_table.add_row(["Tokenization", wrap_text(before_tokenization)])
nlp_table.add_row(["Lemmatization", wrap_text(before_lemmatization)])

# Print the table
print(f"Before NLP for ASIN '{selected_asin}':\n{nlp_table}")

print(f"Before NLP for ASIN '{selected_asin}':\n{nlp_table}")


Before NLP for ASIN 'B00004U9V2':
+---------------+--------------------------------+
| NLP Technique |           Before NLP           |
+---------------+--------------------------------+
|  Tokenization | crabtree amp evelyn gardeners  |
|               | ultramoisturising hand therapy |
|               |  pump g oz long day handling   |
|               |   thorny situations new hand   |
|               |     therapy pump help need     |
|               |   contains shea butter well    |
|               |     extracts yarrow clover     |
|               |     calendula help soothe      |
|               | condition workroughened hands  |
|               |    crabtree evelyn aromatic    |
|               |     benefits herbs varied      |
|               |   farreaching combined whole   |
|               |    bunch one restoratively     |
|               |    fragrant lineup straight    |
|               |     garden weve formulated     |
|               |  gardeners hand therapy myrrh 

In [10]:
from prettytable import PrettyTable
from textwrap import wrap

# Assuming 'df' is your DataFrame
selected_asin = 'B00004U9V2'  # Replace with the specific ASIN you want to analyze

# Create a PrettyTable object
nlp_table = PrettyTable()

# Adding columns to the table
nlp_table.field_names = ["NLP Technique", "Transformed Product Description"]

# Select the row for the specific ASIN
selected_row = df[df['asin'] == selected_asin].iloc[0]

# Tokenization
summary_tokens = selected_row['summary_tokens']
wrapped_summary_tokens = "\n".join(wrap(', '.join(summary_tokens), width=200))
nlp_table.add_row(["Tokenization", wrapped_summary_tokens])

# Lemmatization
summary_lemmatized = selected_row['summary_lemmatized']
wrapped_summary_lemmatized = "\n".join(wrap(', '.join(summary_lemmatized), width=200))
nlp_table.add_row(["Lemmatization", wrapped_summary_lemmatized])

# Set column alignment to 'l' for left-aligned text
nlp_table.align["NLP Technique"] = "l"
nlp_table.align["Output"] = "l"

# Print the table
print(f"NLP Output for ASIN '{selected_asin}':\n{nlp_table}")





NLP Output for ASIN 'B00004U9V2':
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| NLP Technique |                                                                                     Transformed Product Description                                                                                      |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Tokenization  | crabtree, amp, evelyn, gardeners, ultramoisturising, hand, therapy, pump, g, oz, long, day, handling, thorny, situations, new, hand, therapy, pump, help, need, contains, shea, butter, well, extracts,  |
|               |    yarrow, clover, calendula, help, soothe, condition, workrough

# 3. NLP Task: Implementing Text Vectorization

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Apply lemmatization to the tokenized column
df['summary_lemmatized'] = df['summary_tokens'].apply(lemmatize_text)

# Convert the lemmatized tokens back to text
df['summary_lemmatized_text'] = df['summary_lemmatized'].apply(' '.join)

# Apply TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['summary_lemmatized_text'])

# Create a DataFrame from the TF-IDF matrix
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Display the DataFrame with TF-IDF values
print(tfidf_df)



      aaqua   ab  abaca  abba  abc  abcderm  abdomen  abdominal   ability  \
0       0.0  0.0    0.0   0.0  0.0      0.0      0.0        0.0  0.000000   
1       0.0  0.0    0.0   0.0  0.0      0.0      0.0        0.0  0.000000   
2       0.0  0.0    0.0   0.0  0.0      0.0      0.0        0.0  0.000000   
3       0.0  0.0    0.0   0.0  0.0      0.0      0.0        0.0  0.000000   
4       0.0  0.0    0.0   0.0  0.0      0.0      0.0        0.0  0.017476   
...     ...  ...    ...   ...  ...      ...      ...        ...       ...   
4525    0.0  0.0    0.0   0.0  0.0      0.0      0.0        0.0  0.000000   
4526    0.0  0.0    0.0   0.0  0.0      0.0      0.0        0.0  0.000000   
4527    0.0  0.0    0.0   0.0  0.0      0.0      0.0        0.0  0.000000   
4528    0.0  0.0    0.0   0.0  0.0      0.0      0.0        0.0  0.000000   
4529    0.0  0.0    0.0   0.0  0.0      0.0      0.0        0.0  0.000000   

      ablative  ...  zoma  zone  zooey  zoya  zoyas   zp  zptoni  zuza  \
0

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from prettytable import PrettyTable

# Define the text
text = """
After a long day of handling thorny situations our new hand therapy pump is
just the help you need It contains shea butter as well as extracts of yarrow clover
and calendula to help soothe and condition workroughened hands """
# Create a DataFrame with the text
df = pd.DataFrame({'text': [text]})

# Apply TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['text'])

# Create a DataFrame from the TF-IDF matrix
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Create a PrettyTable object
table = PrettyTable()

# Define the column names
table.field_names = ["TF-IDF Vector"]

# Select the TF-IDF vector for the provided text
tfidf_vector = tfidf_df.iloc[0]

# Convert the TF-IDF vector to a string for display
formatted_vector = ", ".join([f"{word}: {value:.2f}" for word, value in zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vector)])

# Add the TF-IDF vector as a row in the table
table.add_row([formatted_vector])

# Set the table width to allow wrapping
table.max_width = 80  # Adjust the width as needed

# Print the table
print(table)


+----------------------------------------------------------------------------------+
|                                  TF-IDF Vector                                   |
+----------------------------------------------------------------------------------+
|  after: 0.15, and: 0.29, as: 0.29, butter: 0.15, calendula: 0.15, clover: 0.15,  |
|     condition: 0.15, contains: 0.15, day: 0.15, extracts: 0.15, hand: 0.15,      |
|  handling: 0.15, hands: 0.15, help: 0.29, is: 0.15, it: 0.15, just: 0.15, long:  |
|    0.15, need: 0.15, new: 0.15, of: 0.29, our: 0.15, pump: 0.15, shea: 0.15,     |
|   situations: 0.15, soothe: 0.15, the: 0.15, therapy: 0.15, thorny: 0.15, to:    |
|          0.15, well: 0.15, workroughened: 0.15, yarrow: 0.15, you: 0.15          |
+----------------------------------------------------------------------------------+


In [15]:
df.head()

Unnamed: 0,text
0,\nAfter a long day of handling thorny situatio...


### Train the Classification Model

Model 1 Multinomial Naive Bayes

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

# Split the data
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Calculate sample weights
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Train the classifier with sample weights
clf_nb = MultinomialNB()
clf_nb.fit(X_train, y_train, sample_weight=sample_weights)

# Evaluate the classifier
y_pred = clf_nb.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, clf_nb.predict_proba(X_test), multi_class='ovr')
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("ROC-AUC:", roc_auc)
print("Confusion Matrix:")
print(confusion)


KeyError: ignored

### Save the Trained model and vectoizer for future use

In [None]:
import joblib

# Save the re-trained model
joblib.dump(clf_nb, 'balanced_naive_bayes_model.pkl')

# Save the TF-IDF vectorizer if you've created a new one or made changes
joblib.dump(tfidf_vectorizer, 'balanced_tfidf_vectorizer.pkl')

['balanced_tfidf_vectorizer.pkl']

## Test the Trained Naive Bayes Model


Model 1: Multinomial Naive Bayes

In [None]:
# Assuming vectorizer and classification model 'clf' are already fitted and trained

# Take a sample query
query = "Comb"

# Debugging
print(f"Query before prediction: {query}")

# Transform the query using the same TF-IDF vectorizer used for training the model
query_tfidf = tfidf_vectorizer.transform([query])


print(f"TF-IDF for query: {query_tfidf.toarray()}")


# Predict the category of the query
predicted_category = clf_nb.predict(query_tfidf)

print(f"The query belongs to the category: {predicted_category[0]}")


Query before prediction: Comb
TF-IDF for query: [[0. 0. 0. ... 0. 0. 0.]]
The query belongs to the category: Hair Care: Brushes


Model 2: SVM

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon
from sklearn.svm import SVC

# Define the target variable
y = df['Category']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SVM Classifier with Randomized Search
svm_clf = SVC(class_weight='balanced')
random_search = RandomizedSearchCV(svm_clf, {
    'C': expon(scale=10),
    'gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train, y_train)
best_clf_svm = random_search.best_estimator_
y_pred_svm = best_clf_svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_svm)
precision = precision_score(y_test, y_pred_svm, average='weighted')
recall = recall_score(y_test, y_pred_svm, average='weighted')
f1 = f1_score(y_test, y_pred_svm, average='weighted')
# Calculate ROC-AUC using predict_proba

confusion = confusion_matrix(y_test, y_pred_svm)

print("Accuracy (SVM Randomized):", accuracy)
print("Precision (SVM Randomized):", precision)
print("Recall (SVM Randomized):", recall)
print("F1-Score (SVM Randomized):", f1)

print("Confusion Matrix (SVM Randomized):")
print(confusion)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AttributeError: ignored

In [None]:
pip install tabulate



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon
from sklearn.svm import SVC

# Define the target variable
y = df['Category']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SVM Classifier with Randomized Search
svm_clf = SVC(class_weight='balanced')
random_search = RandomizedSearchCV(svm_clf, {
    'C': expon(scale=10),
    'gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train, y_train)
best_clf_svm = random_search.best_estimator_
y_pred_svm = best_clf_svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_svm)
precision = precision_score(y_test, y_pred_svm, average='weighted')
recall = recall_score(y_test, y_pred_svm, average='weighted')
f1 = f1_score(y_test, y_pred_svm, average='weighted')
# Calculate ROC-AUC using predict_proba

confusion = confusion_matrix(y_test, y_pred_svm)
print("Accuracy (SVM Randomized):", accuracy)
print("Precision (SVM Randomized):", precision)
print("Recall (SVM Randomized):", recall)
print("F1-Score (SVM Randomized):", f1)

# Print Confusion Matrix
print("Confusion Matrix (SVM Randomized):")
print(confusion)

# Generate and print the Classification Report
class_report = classification_report(y_test, y_pred_svm, target_names=df['Category'].unique(), output_dict=True)
class_report_df = pd.DataFrame(class_report).transpose()

print("\nClassification Report (SVM Randomized):\n")
print(tabulate(class_report_df, headers='keys', tablefmt='pretty'))

In [None]:
from sklearn.metrics import classification_report

# ... (previous code remains the same)

# Assuming 'best_clf_svm.pkl' is the file where you saved the model using joblib
best_clf_svm = joblib.load('best_clf_svm.pkl')

# Predict using the loaded model
y_pred_svm = best_clf_svm.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_svm)
precision = precision_score(y_test, y_pred_svm, average='weighted')
recall = recall_score(y_test, y_pred_svm, average='weighted')
f1 = f1_score(y_test, y_pred_svm, average='weighted')
confusion = confusion_matrix(y_test, y_pred_svm)

# Print the results
print("Accuracy (SVM Randomized):", accuracy)
print("Precision (SVM Randomized):", precision)
print("Recall (SVM Randomized):", recall)
print("F1-Score (SVM Randomized):", f1)

print("Confusion Matrix (SVM Randomized):")
print(confusion)

# Generate and print the classification report
class_report = classification_report(y_test, y_pred_svm)
print("Classification Report (SVM Randomized):")
print(class_report)


Accuracy (SVM Randomized): 0.9293598233995585
Precision (SVM Randomized): 0.9352445301117922
Recall (SVM Randomized): 0.9293598233995585
F1-Score (SVM Randomized): 0.929772172546837
Confusion Matrix (SVM Randomized):
[[ 6  0  0 ...  0  0  0]
 [ 0  3  0 ...  0  0  0]
 [ 0  0  6 ...  0  0  0]
 ...
 [ 0  0  0 ...  1  0  0]
 [ 0  0  0 ...  0 30  0]
 [ 0  0  0 ...  0  0  2]]
Classification Report (SVM Randomized):
                                                 precision    recall  f1-score   support

         Baby Care: Cleansers, Creams & Lotions       0.86      1.00      0.92         6
     Beard & Mustache Care: Oils, Serums & Gels       1.00      1.00      1.00         3
                      Body Care: Butters & Oils       0.86      0.75      0.80         8
                           Body Care: Cleansers       1.00      0.90      0.95        20
                 Body Care: Exfoliants & Scrubs       1.00      1.00      1.00         3
                     Body Care: Lotions & Mists     

In [None]:
best_clf_svm = random_search.best_estimator_
y_pred_svm = best_clf_svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_svm)
precision = precision_score(y_test, y_pred_svm, average='weighted')
recall = recall_score(y_test, y_pred_svm, average='weighted')
f1 = f1_score(y_test, y_pred_svm, average='weighted')
# Calculate ROC-AUC using predict_proba

confusion = confusion_matrix(y_test, y_pred_svm)

print("Accuracy (SVM Randomized):", accuracy)
print("Precision (SVM Randomized):", precision)
print("Recall (SVM Randomized):", recall)
print("F1-Score (SVM Randomized):", f1)

print("Confusion Matrix (SVM Randomized):")
print(confusion)

Accuracy (SVM Randomized): 0.7284768211920529
Precision (SVM Randomized): 0.7468374722067923
Recall (SVM Randomized): 0.7284768211920529
F1-Score (SVM Randomized): 0.7266401090950444
Confusion Matrix (SVM Randomized):
[[ 6  0  0 ...  0  0  0]
 [ 0  3  0 ...  0  0  0]
 [ 0  0  1 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model 3: Multinomial Logistic Regression

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

# Split the data
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Calculate sample weights
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Train the classifier with sample weights
clf_lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
clf_lr.fit(X_train, y_train, sample_weight=sample_weights)

# Evaluate the classifier
y_pred = clf_lr.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Confusion Matrix:")
print(confusion)



KeyError: ignored

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

# Split the data
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate sample weights
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Train the classifier with sample weights
clf_lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
clf_lr.fit(X_train, y_train, sample_weight=sample_weights)

# Evaluate the classifier
y_prob_lr = clf_lr.predict_proba(X_test)

# Choose the positive class probability (for multi-class problems)
# Assuming your target variable is encoded with integers starting from 0
positive_class_index = 1  # Change this according to your problem

# Calculate ROC-AUC score
roc_auc = roc_auc_score(y_test == positive_class_index, y_prob_lr[:, positive_class_index], multi_class='ovr', average='weighted')

# Calculate metrics
y_pred_lr = clf_lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_lr)
precision = precision_score(y_test, y_pred_lr, average='weighted')
recall = recall_score(y_test, y_pred_lr, average='weighted')
f1 = f1_score(y_test, y_pred_lr, average='weighted')
confusion = confusion_matrix(y_test, y_pred_lr)

print("Accuracy (Logistic Regression):", accuracy)
print("Precision (Logistic Regression):", precision)
print("Recall (Logistic Regression):", recall)
print("F1-Score (Logistic Regression):", f1)
print("ROC-AUC Score (Logistic Regression):", roc_auc)

print("Confusion Matrix (Logistic Regression):")
print(confusion)


In [None]:
import joblib

# Save the re-trained model
joblib.dump(clf_lr, 'multinomial_logistic_regression_model.pkl')

# Save the TF-IDF vectorizer if you've created a new one or made changes
joblib.dump(tfidf_vectorizer, 'multinomial_logistic_regression_model.pkl')

['multinomial_logistic_regression_model.pkl']

In [None]:
# Assuming vectorizer and classification model 'clf' are already fitted and trained

# Take a sample query
query = "Comb"

# Debugging
print(f"Query before prediction: {query}")

# Transform the query using the same TF-IDF vectorizer used for training the model
query_tfidf = tfidf_vectorizer.transform([query])


print(f"TF-IDF for query: {query_tfidf.toarray()}")


# Predict the category of the query
predicted_category = clf_lr.predict(query_tfidf)

print(f"The query belongs to the category: {predicted_category[0]}")

Query before prediction: Comb
TF-IDF for query: [[0. 0. 0. ... 0. 0. 0.]]
The query belongs to the category: Hair Care: Brushes


XGBoost Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_sample_weight
import xgboost as xgb
import joblib

# Assuming X and df are defined earlier in your code

# Convert class labels to integers
y = df['Category']
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y)
# Save the LabelEncoder to a file
joblib.dump(label_encoder, 'label_encoder.pkl')
# Split the data
X_train, X_test, y_train_encoded, y_test = train_test_split(X, y_train_encoded, test_size=0.2, random_state=42)

# Calculate sample weights
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train_encoded)

# Train the XGBoost classifier with sample weights
clf_xg = xgb.XGBClassifier()
clf_xg.fit(X_train, y_train_encoded, sample_weight=sample_weights)

# Evaluate the classifier
y_pred = clf_xg.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.7516556291390728


In [None]:
import joblib

# Save the re-trained model
joblib.dump(clf_xg, 'xgb_model.pkl')



['xgb_model.pkl']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import joblib

# Assuming X and y are defined and hold your features and labels

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the Decision Tree Classifier
# You can manually set parameters if you wish, or use default settings
clf_dt = DecisionTreeClassifier(class_weight='balanced')

# Fit the classifier on the training data
clf_dt.fit(X_train, y_train)

# Save the trained model
joblib.dump(clf_dt, 'decision_tree_model.pkl')

# Make predictions on the test set
y_pred = clf_dt.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6225165562913907


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


y = df['Category']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the RandomForestClassifier
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_rf.fit(X_train, y_train)

# Make predictions
y_pred = clf_rf.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate Precision, Recall, and F1-Score
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Calculate Confusion Matrix
confusion = confusion_matrix(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("Confusion Matrix:")
print(confusion)


Accuracy: 0.68
Precision: 0.69
Recall: 0.68
F1-Score: 0.65
Confusion Matrix:
[[ 5  0  0 ...  0  0  0]
 [ 0  2  0 ...  0  0  0]
 [ 0  0  2 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0 28  0]
 [ 0  0  0 ...  0  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Split the data
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the RandomForestClassifier
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_rf.fit(X_train, y_train)

# Make predictions and probability estimates
y_pred_rf = clf_rf.predict(X_test)
y_prob_rf = clf_rf.predict_proba(X_test)

# Choose the positive class probability (for multi-class problems)
# Assuming your target variable is encoded with integers starting from 0
positive_class_index = 1  # Change this according to your problem

# Calculate ROC-AUC score
roc_auc = roc_auc_score(y_test == positive_class_index, y_prob_rf[:, positive_class_index], multi_class='ovr', average='weighted')

# Calculate Accuracy, Precision, Recall, and F1-Score
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf, average='weighted')
recall = recall_score(y_test, y_pred_rf, average='weighted')
f1 = f1_score(y_test, y_pred_rf, average='weighted')

# Calculate Confusion Matrix
confusion = confusion_matrix(y_test, y_pred_rf)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"ROC-AUC Score: {roc_auc:.2f}")

print("Confusion Matrix:")
print(confusion)


Model Ensembling by Voting mechanism

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Split the data
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create an ensemble of classifiers using soft voting
ensemble_clf = VotingClassifier(estimators=[
    ('xg', clf_xg),
    ('lr', clf_lr)
   # ('svm', best_clf_svm),
    #('rf', clf_rf)
    #('dt', clf_dt)
], voting='soft')

# Train the ensemble
ensemble_clf.fit(X_train, y_train)

# Evaluate the ensemble
y_pred = ensemble_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

AttributeError: ignored

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Split the data
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create individual classifiers
clf_xg = xgb.XGBClassifier()
clf_lr = LogisticRegression()
best_clf_svm = SVC(class_weight='balanced', probability=True)  # Set probability=True

# Create an ensemble of classifiers using soft voting
ensemble_clf = VotingClassifier(estimators=[
    ('xg', clf_xg),
    ('lr', clf_lr),
    ('svm', best_clf_svm)
], voting='soft')

# Train the ensemble
ensemble_clf.fit(X_train, y_train)

# Evaluate the ensemble
y_pred = ensemble_clf.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate Precision, Recall, and F1-Score
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Calculate Confusion Matrix
confusion = confusion_matrix(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("Confusion Matrix:")
print(confusion)


Accuracy: 0.74
Precision: 0.75
Recall: 0.74
F1-Score: 0.72
Confusion Matrix:
[[ 3  0  0 ...  0  0  0]
 [ 0  1  0 ...  0  0  0]
 [ 0  0  1 ...  0  1  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0 35  0]
 [ 0  0  0 ...  0  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Split the data
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create individual classifiers
clf_xg = xgb.XGBClassifier()
clf_lr = LogisticRegression()
best_clf_svm = SVC(class_weight='balanced', probability=True)  # Set probability=True

# Create an ensemble of classifiers using soft voting
ensemble_clf = VotingClassifier(estimators=[
    ('xg', clf_xg),
    ('lr', clf_lr),
    ('svm', best_clf_svm)
], voting='soft')

# Train the ensemble
ensemble_clf.fit(X_train, y_train)

# Evaluate the ensemble
y_pred = ensemble_clf.predict(X_test)
y_prob = ensemble_clf.predict_proba(X_test)

# Choose the positive class probability (for multi-class problems)
# Assuming your target variable is encoded with integers starting from 0
positive_class_index = 1  # Change this according to your problem

# Calculate ROC-AUC score
roc_auc = roc_auc_score(y_test == positive_class_index, y_prob[:, positive_class_index], multi_class='ovr', average='weighted')

# Calculate Accuracy, Precision, Recall, and F1-Score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Calculate Confusion Matrix
confusion = confusion_matrix(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"ROC-AUC Score: {roc_auc:.2f}")

print("Confusion Matrix:")
print(confusion)


NameError: ignored

### Load the saved models

In [None]:
from google.colab import drive
import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# 1. Load Saved Models
classification_model = joblib.load('xgb_model.pkl')
text_vectorizer = joblib.load('balanced_tfidf_vectorizer.pkl')
# Load the LabelEncoder
label_encoder = joblib.load('label_encoder.pkl')


### Implement the Recommendation Logic and Test the Real time query

In [None]:

from sklearn.preprocessing import LabelEncoder


label_encoder = LabelEncoder()
import joblib



# 2. User Query
query = input("Enter your search query: ")

# 3. Predict Category
query_tfidf = text_vectorizer.transform([query])
#predicted_category = classification_model.predict(query_tfidf)[0]
#print(f"The query belongs to the category: {predicted_category}")

# 4. Load the DataFrame
df = pd.read_csv('Cleaned_Products_Data.csv')


label_encoder = LabelEncoder()
label_encoder.fit(df['Category'])
# Predict the category of the query
predicted_category_encoded = classification_model.predict(query_tfidf)

# Decode the predicted category back to original category name
predicted_category = label_encoder.inverse_transform(predicted_category_encoded)

print(f"The query belongs to the category: {predicted_category[0]}")



# Generate TF-IDF matrix for the loaded data
X = text_vectorizer.transform(df['summary_lemmatized'])

# 5. Filter Products by Predicted Category
filtered_df = df[df['Category'] == predicted_category[0]]


# 6. Compute Similarity
# Filter your TF-IDF matrix to only include vectors in the filtered_df
#target_tfidf_matrix = X[df['Category'] == predicted_category]
target_tfidf_matrix = X[df['Category'] == predicted_category[0]]
# Compute the cosine similarity scores
query_similarity_scores = cosine_similarity(query_tfidf, target_tfidf_matrix)

# 7. Top-N Recommendations
N = 5  # Change N as needed
top_N_indices = np.argsort(query_similarity_scores[0])[-N:][::-1]
top_N_scores = query_similarity_scores[0][top_N_indices]

top_N_recommendations = filtered_df.iloc[top_N_indices][['title', 'price']]

# Show the top N recommendations
top_N_recommendations['similarity_score'] = top_N_scores

from prettytable import PrettyTable

# ... [rest of your code for computing top_N_recommendations] ...

# Create a PrettyTable object
table = PrettyTable()

# Define the column names
table.field_names = ["Title", "Price", "Category", "Similarity Score"]

# Add rows to the table
for _, row in top_N_recommendations.iterrows():
    table.add_row([row['title'], row['price'], predicted_category, round(row['similarity_score'], 4)])
print("Top N recommendations with similarity scores:")
print(table)
print("Top N recommendations:")
print(top_N_recommendations)
top_N_recommendations.to_csv('reco_data.csv', index=False)




Enter your search query: Glo Minerals Moist Hydration Mist 2.0 oz
The query belongs to the category: Face Care: Serums, Mists & Toners
Top N recommendations with similarity scores:
+--------------------------------------------------------------------------------------------------------+---------+---------------------------------------+------------------+
|                                                 Title                                                  |  Price  |                Category               | Similarity Score |
+--------------------------------------------------------------------------------------------------------+---------+---------------------------------------+------------------+
|                                Glo Minerals Moist Hydration Mist 2.0 oz                                | $22.00  | ['Face Care: Serums, Mists & Toners'] |      0.5482      |
|                             glo Minerals Revive Hydration Mist, 2 fl. oz.                              | $20.00  

In [None]:
from sklearn.preprocessing import LabelEncoder
import joblib

# ... [rest of your imports and setup] ...

# 2. User Query
query = input("Enter your search query: ")

# 3. Predict Category
query_tfidf = text_vectorizer.transform([query])

# 4. Load the DataFrame
df = pd.read_csv('Cleaned_Products_Data.csv')

# Initialize and fit LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(df['Category'])

# Predict the category of the query
predicted_category_encoded = classification_model.predict(query_tfidf)

# Decode the predicted category back to original category name
predicted_category = label_encoder.inverse_transform(predicted_category_encoded)
print(f"The query belongs to the category: {predicted_category[0]}")

# Generate TF-IDF matrix for the loaded data
X = text_vectorizer.transform(df['summary_lemmatized'])

# 5. Filter Products by Predicted Category
filtered_df = df[df['Category'] == predicted_category[0]]

# 6. Compute Similarity
# Filter your TF-IDF matrix to only include vectors in the filtered_df
target_tfidf_matrix = X[df['Category'] == predicted_category[0]]

# Compute the cosine similarity scores
query_similarity_scores = cosine_similarity(query_tfidf, target_tfidf_matrix)

# 7. Top-N Recommendations
N = 6  # Change N as needed
top_N_indices = np.argsort(query_similarity_scores[0])[-N:][::-1]
top_N_scores = query_similarity_scores[0][top_N_indices]

# Exclude the user query from recommendations
# Assuming 'title' column contains product titles
top_N_recommendations = filtered_df.iloc[top_N_indices]
top_N_recommendations = top_N_recommendations[top_N_recommendations['title'].str.lower() != query.lower()][['title', 'price']]

# Show the top N recommendations
top_N_recommendations['similarity_score'] = top_N_scores[:len(top_N_recommendations)]

from prettytable import PrettyTable

# Create a PrettyTable object
table = PrettyTable()

# Define the column names
table.field_names = ["Title", "Price", "Category", "Similarity Score"]

# Add rows to the table
for _, row in top_N_recommendations.iterrows():
    table.add_row([row['title'], row['price'], predicted_category[0], round(row['similarity_score'], 4)])
print("Top N recommendations with similarity scores:")
print(table)
print("Top N recommendations:")
print(top_N_recommendations)
top_N_recommendations.to_csv('reco_data.csv', index=False)


Enter your search query: Glo Minerals Moist Hydration Mist 2.0 oz
The query belongs to the category: Face Care: Serums, Mists & Toners
Top N recommendations with similarity scores:
+--------------------------------------------------------------------------------------------------------+---------+-----------------------------------+------------------+
|                                                 Title                                                  |  Price  |              Category             | Similarity Score |
+--------------------------------------------------------------------------------------------------------+---------+-----------------------------------+------------------+
|                             glo Minerals Revive Hydration Mist, 2 fl. oz.                              | $20.00  | Face Care: Serums, Mists & Toners |      0.5482      |
|                               NEUMA neuSmooth Illuminating Mist, 2.5 oz.                               | $22.00  | Face Care: Ser

In [None]:
import joblib
from sklearn.metrics import classification_report

# Load the saved model
ensemble_model = joblib.load('ensemble_model.pkl')

# Assuming you have your test data and labels (X_test, y_test)
# Replace X_test and y_test with your actual test data and labels

# Make predictions
y_pred = ensemble_model.predict(X_test)

# Get the classification report
class_report = classification_report(y_test, y_pred)

# Print the classification report
print(class_report)


                                                 precision    recall  f1-score   support

         Baby Care: Cleansers, Creams & Lotions       1.00      0.57      0.73         7
     Beard & Mustache Care: Oils, Serums & Gels       0.00      0.00      0.00         7
                      Body Care: Butters & Oils       0.00      0.00      0.00         5
                           Body Care: Cleansers       0.00      0.00      0.00        18
                 Body Care: Exfoliants & Scrubs       0.00      0.00      0.00         6
                     Body Care: Lotions & Mists       0.80      0.15      0.26        26
                           Body Care: Mini Kits       0.00      0.00      0.00         1
             Body Care: Neck Creams & Ointments       0.00      0.00      0.00         1
                 Body Care: Tools & Accessories       0.00      0.00      0.00         5
          Eye Care: Brow Gels, Serums & Pencils       0.00      0.00      0.00         2
         Eye Care: B

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
