In [1]:
import torch
import os
import requests
from PIL import Image, UnidentifiedImageError
from io import BytesIO
import pandas as pd
import pickle
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torchvision import models, transforms

nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv("A2_Data.csv")

# Create a new DataFrame to hold the expanded rows
expanded_rows = []

# Iterate through each row in the original DataFrame
for index, row in df.iterrows():
    # Extract the ID and Review Text from the current row
    id_value = row['ID']
    review_text = row['Review Text']

    # Split the Image links into separate rows
    for image_link in eval(row['Image']):  # Use eval to convert string representation of list to an actual list
        expanded_row = {
            'ID': id_value,
            'Image': image_link,
            'Review Text': review_text
        }
        expanded_rows.append(expanded_row)

# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)

# Define image preprocessing
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),  # Adjust to ResNet input size (224x224)
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load a pre-trained ResNet model
resnet = models.resnet50(pretrained=True)
resnet.eval()  # Set the model to evaluation mode

# Function to extract features from an image using ResNet
def extract_image_features_resnet(url):
    try:
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        img_t = image_transforms(img)
        img_t = img_t.unsqueeze(0)  # Add batch dimension
        with torch.no_grad():
            features = resnet(img_t)
        return features.cpu().numpy().flatten()
    except requests.exceptions.RequestException as e:
        print(f"RequestException for URL {url}: {e}")
    except UnidentifiedImageError:
        print(f"UnidentifiedImageError: cannot identify image file from URL {url}. Removing entry.")
        return None  # Return None to indicate the image could not be processed
    except Exception as e:
        print(f"Unexpected error for URL {url}: {e}")
    return None

# List to store indices of rows to be removed
rows_to_remove = []

# Extract features using ResNet
image_features_resnet = []

for index, row in expanded_df.iterrows():
    # Check if the 'Image' column has a valid URL
    if pd.notna(row['Image']):
        image_feature = extract_image_features_resnet(row['Image'])
        if image_feature is not None:
            image_features_resnet.append(image_feature)
        else:
            # If the image could not be processed, mark the row for removal
            rows_to_remove.append(index)

# Normalize the extracted features using ResNet
#image_features_resnet = np.array(image_features_resnet)
#mean_resnet = np.mean(image_features_resnet, axis=0)
#std_resnet = np.std(image_features_resnet, axis=0)
#normalized_features_resnet = (image_features_resnet - mean_resnet) / std_resnet

# Save the normalized features
#with open('normalized_features_resnet.pkl', 'wb') as file:
    #pickle.dump(normalized_features_resnet, file)

# Remove rows marked for removal
#expanded_df = expanded_df.drop(rows_to_remove, axis=0).reset_index(drop=True)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/71F3npeHUDL._SY88.jpg. Removing entry.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/71wHUWncMGL._SY88.jpg. Removing entry.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/71B8OOE5N8L._SY88.jpg. Removing entry.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/81SX3oAWbNL._SY88.jpg. Removing entry.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/718niQ1GEwL._SY88.jpg. Removing entry.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/61OboZT-kcL._SY88.jpg. Removing entry.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/710

In [2]:
# Remove rows marked for removal
cleaned_df = expanded_df.drop(rows_to_remove, inplace=False).reset_index(drop=True)

# Save the cleaned DataFrame
#cleaned_df.to_csv("cleaned_A2_Data.csv", index=False)

# Save the results using ResNet
with open('normalized_features_resnet.pkl', 'wb') as f:
    pickle.dump(cleaned_df, f)


In [3]:
print(len(image_features_resnet))




1640


In [4]:
# Assuming cleaned_df and image_features_resnet have the same length
cleaned_df["Image features"] = image_features_resnet

# Display the DataFrame
print(cleaned_df)

        ID                                              Image  \
0     3452  https://images-na.ssl-images-amazon.com/images...   
1     1205  https://images-na.ssl-images-amazon.com/images...   
2     1205  https://images-na.ssl-images-amazon.com/images...   
3     1205  https://images-na.ssl-images-amazon.com/images...   
4     1708  https://images-na.ssl-images-amazon.com/images...   
...    ...                                                ...   
1635  1882  https://images-na.ssl-images-amazon.com/images...   
1636  1547  https://images-na.ssl-images-amazon.com/images...   
1637  1547  https://images-na.ssl-images-amazon.com/images...   
1638  1004  https://images-na.ssl-images-amazon.com/images...   
1639  1306  https://images-na.ssl-images-amazon.com/images...   

                                            Review Text  \
0     Loving these vintage springs on my vintage str...   
1     Works great as a guitar bench mat. Not rugged ...   
2     Works great as a guitar bench mat. N

Text Feature Extraction

In [9]:
import pandas as pd
import numpy as np
import math
import pickle
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import re
from collections import Counter

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function for text preprocessing
def preprocess_text(text):
    # Lowercase conversion
    text = text.lower()
    # Remove URLs, hashtags, and mentions
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove punctuation and non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha()]
    # Stopwords removal, stemming, and lemmatization
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens if word not in stop_words]
    return tokens

# Initialize NLP tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Assuming cleaned_df is your DataFrame
text_data = cleaned_df['Review Text'].fillna('').tolist()

# Preprocess text data
tokenized_texts = [preprocess_text(text) for text in text_data]

# Manual TF-IDF Calculation
def compute_tf_idf(tokenized_docs):
    # Count term frequencies using Counter
    tf = [{word: count / len(doc) for word, count in Counter(doc).items()} for doc in tokenized_docs]

    # Create a set of all unique words in all documents
    all_words = set(word for doc in tokenized_docs for word in doc)

    # Calculate document frequency (DF) using set operations
    df = {word: sum(1 for doc in tokenized_docs if word in doc) for word in all_words}

    # Calculate IDF (inverse document frequency)
    idf = {word: math.log(len(tokenized_docs) / freq) for word, freq in df.items()}

    # Calculate TF-IDF
    tf_idf = [{word: freq * idf[word] for word, freq in doc.items()} for doc in tf]
    return tf_idf

tf_idf_scores_text = compute_tf_idf(tokenized_texts)

# Specify paths for saving tokenized texts and TF-IDF scores
tokenized_texts_path = 'tokenized_texts_text.pkl'
tf_idf_scores_path = 'tf_idf_scores_manual_text.pkl'

# Save tokenized texts
with open(tokenized_texts_path, 'wb') as f:
    pickle.dump(tokenized_texts, f)

# Save TF-IDF scores
with open(tf_idf_scores_path, 'wb') as f:
    pickle.dump(tf_idf_scores_text, f)

# Load tokenized texts
with open('tokenized_texts_text.pkl', 'rb') as f:
    tokenized_texts = pickle.load(f)

# Load TF-IDF scores
with open('tf_idf_scores_manual_text.pkl', 'rb') as f:
    tf_idf_scores = pickle.load(f)

# Create a DataFrame from TF-IDF scores
tf_idf_pd = pd.DataFrame(tf_idf_scores)
tf_idf_pd.fillna(0, inplace=True)

# Display the DataFrame
print("TF-IDF DataFrame:")
print(tf_idf_pd)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


TF-IDF DataFrame:
          love    vintag    spring     strat      good   tension     great  \
0     0.129524  0.520793  0.727539  0.200533  0.100905  0.327836  0.076447   
1     0.000000  0.000000  0.000000  0.000000  0.065808  0.000000  0.049857   
2     0.000000  0.000000  0.000000  0.000000  0.065808  0.000000  0.049857   
3     0.000000  0.000000  0.000000  0.000000  0.065808  0.000000  0.049857   
4     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
...        ...       ...       ...       ...       ...       ...       ...   
1635  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.156368   
1636  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1637  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1638  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.099713   
1639  0.000000  0.000000  0.000000  0.000000  0.151357  0.000000  0.000000   

        stabil     float     bridg  ...  yngw

In [7]:
tf_idf_pd

Unnamed: 0,love,vintag,spring,strat,good,tension,great,stabil,float,bridg,...,yngwie,neoclass,john,mayer,importantli,toneprint,biggi,accord,screenshot,piti
0,0.129524,0.520793,0.727539,0.200533,0.100905,0.327836,0.076447,0.333637,0.339991,0.223960,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.065808,0.000000,0.049857,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.065808,0.000000,0.049857,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.065808,0.000000,0.049857,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1635,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.156368,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.336475,0.000000,0.000000,0.000000,0.000000
1636,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.131741,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.131555,0.000000,0.000000,0.000000
1637,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.131741,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.131555,0.000000,0.000000,0.000000
1638,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.099713,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.321846,0.321846,0.321846


In [27]:
len(tokenized_texts)

1640

In [8]:
# Example input image URLs
import numpy as np

def cosine_similarity(v1, v2):
    dot_product = np.dot(v1, v2)
    magnitude_v1 = np.linalg.norm(v1)
    magnitude_v2 = np.linalg.norm(v2)

    # Avoid division by zero
    if magnitude_v1 == 0 or magnitude_v2 == 0:
        return 0

    cosine_sim = dot_product / (magnitude_v1 * magnitude_v2)
    return cosine_sim
image_urls = []
url_input = input("Enter image URL (or press Enter to finish): ").strip()
review_input = input("REVIEW: ").strip()
doc1 = preprocess_text(review_input)
tf = [{word: doc1.count(word) / len(doc1) for word in doc1}]
df = {}
for doc in tokenized_texts:
    for word in set(doc):
        df[word] = df.get(word, 0) + 1

# Calculate IDF (inverse document frequency)
idf = {word: math.log(len(tokenized_texts) / freq) for word, freq in df.items()}

# Calculate TF-IDF
tf_idf_doc1 = [{word: freq * idf[word] for word, freq in df.items()}]

if url_input.startswith("[") and url_input.endswith("]"):
    # Extract URLs from within square brackets
    url_input = url_input[1:-1]
    image_urls.extend([url.strip() for url in url_input.split(",")])
elif url_input:
    # If a single URL is provided without square brackets
    image_urls.append(url_input)


# Extract features from input images
query_image_vectors = [extract_image_features_resnet(url) for url in image_urls]
query_review_vector = pd.DataFrame(tf_idf_doc1)

# Calculate similarities for images
image_similarities = []
for i, feature in enumerate(image_features_resnet):
    similarities = []
    for query_image_vector in query_image_vectors:
        feature = feature.flatten()
        cosine_sim_im = cosine_similarity(query_image_vector, feature)
        similarities.append(cosine_sim_im)

    cosine_sim_rv = cosine_similarity(query_review_vector.iloc[0], tf_idf_pd.iloc[i])
    average_similarity = sum(similarities) / len(similarities)
    composite_similarity=(cosine_sim_rv+average_similarity)/2
    image_similarities.append((i, average_similarity, cosine_sim_rv, composite_similarity))

# Sort the list of similar images based on cosine similarity in descending order
image_similarities.sort(key=lambda x: x[1], reverse=True)
review_similarities = image_similarities.copy()
composite_similarities= review_similarities.copy()

review_similarities.sort(key=lambda x: x[2], reverse=True)
composite_similarities.sort(key=lambda x: x[3], reverse=True)

# Get top 3 similar images
top_3_similar_images = image_similarities[:3]
top_3_similar_reviews = review_similarities[:3]
top_3_similar_composites = composite_similarities[:3]

# Save top 3 similar images
with open('top_3_images.pkl', 'wb') as f:
    pickle.dump(top_3_similar_images, f)

with open('top_3_txt.pkl', 'wb') as f:
    pickle.dump(top_3_similar_reviews, f)

with open('top_3_composite.pkl', 'wb') as f:
    pickle.dump(top_3_similar_composites, f)

print("----------------------------------------------------")
print("Top 3 similar images:")
c1 = 1
for i, img_similarity, txt_similarity, composite_similarity in top_3_similar_images:
    print(f"{c1}. Image URL: {cleaned_df['Image'][i]}")
    print(f"Review: {cleaned_df['Review Text'][i]}")
    print(f"Cosine score image: {img_similarity}")
    print(f"Cosine score text: {txt_similarity}")
    print(f"Cosine score composite: {composite_similarity}")
    print("\n")
    c1 += 1

print("----------------------------------------------------")
print("Top 3 similar reviews:")
c2 = 1
for i, img_similarity, txt_similarity ,composite_similarity in top_3_similar_reviews:
    print(f"{c2}. Image URL: {cleaned_df['Image'][i]}")
    print(f"Review: {cleaned_df['Review Text'][i]}")
    print(f"Cosine score image: {img_similarity}")
    print(f"Cosine score text: {txt_similarity}")
    print(f"Cosine score composite: {composite_similarity}")
    print("\n")
    c2 += 1

print("----------------------------------------------------")
print("Top 3 similar composites:")
c3 = 1
for i, img_similarity, txt_similarity, composite_similarity in top_3_similar_composites:
    print(f"{c3}. Image URL: {cleaned_df['Image'][i]}")
    print(f"Review: {cleaned_df['Review Text'][i]}")
    print(f"Cosine score image: {img_similarity}")
    print(f"Cosine score text: {txt_similarity}")
    print(f"Cosine score composite: {composite_similarity}")
    print("\n")
    c3 += 1


Enter image URL (or press Enter to finish): https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg
REVIEW: I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.
----------------------------------------------------
Top 3 similar images:
1. Image URL: https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg
Review: I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.
Cosine score image: 1.0000001192092896
Cosine score text: 0.07903665389491232
Cosine score composite: 0.5395183865521009


2. Image URL: https://images-na.ssl-images-amazon.com/images/I/719-SDMiOoL._SY88.jpg
Review: These locking tuners look great and keep tune.  Good quality materials and construction.  Excellent upgrade to any guitar.  I had to drill 