# Image Feature Extraction

In [6]:
import requests
from io import BytesIO
from PIL import Image

def download_image(url, timeout=10):
    """Download `url` and return it as a fully decoded PIL image.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the whole extraction loop.

    Returns
    -------
    PIL.Image.Image or None
        The decoded image, or None when the download or decoding fails
        (the error is printed, matching the original best-effort behaviour).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx instead of handing an HTML error page to PIL.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        # Image.open is lazy; decode now so truncated/corrupt files are
        # reported here rather than crashing later during preprocessing.
        img.load()
        return img
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return None


In [7]:
from PIL import ImageOps, ImageEnhance
import numpy as np

def preprocess_image(img, target_size=(224, 224), contrast=1.5, brightness=1.2):
    """Resize and augment a PIL image before feature extraction.

    Parameters
    ----------
    img : PIL.Image.Image
        Source image.
    target_size : tuple of int, optional
        Size passed to `Image.resize`.
    contrast : float, optional
        Factor given to `ImageEnhance.Contrast(...).enhance`.
    brightness : float, optional
        Factor given to `ImageEnhance.Brightness(...).enhance`.

    Returns
    -------
    PIL.Image.Image
        The processed RGB image.

    Notes
    -----
    The horizontal flip is random (`np.random.rand()`), so repeated runs can
    yield different images unless NumPy's global seed is fixed beforehand.
    """
    # Normalise the colour mode first: grayscale, palette or RGBA inputs would
    # otherwise turn into arrays with 1 or 4 channels further down the pipeline.
    img = img.convert("RGB")
    img = img.resize(target_size)

    if np.random.rand() > 0.5:
        img = ImageOps.mirror(img)

    img = ImageEnhance.Contrast(img).enhance(contrast)
    img = ImageEnhance.Brightness(img).enhance(brightness)

    return img


In [8]:
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.models import Model

def extract_features(img, model):
    """Run `model` on a single image and return its prediction.

    The image is converted with `img_to_array`, given a leading axis of size 1
    with `np.expand_dims`, passed through `preprocess_input`, and finally
    handed to `model.predict`.
    """
    batch = np.expand_dims(img_to_array(img), axis=0)
    # NOTE(review): `preprocess_input` is imported from the VGG16 module while
    # the notebook builds a ResNet50 model — confirm the two are interchangeable.
    return model.predict(preprocess_input(batch))


In [9]:
from sklearn.preprocessing import normalize

def normalize_features(features):
    """Return `features` after passing it through sklearn's `normalize` with default settings."""
    return normalize(features)


In [10]:
from keras.applications import ResNet50

import pandas as pd
import pickle

import ast  # used below to parse the stringified URL lists without eval

extracted_features = {}  # review id -> list of flattened feature vectors (one per image)
url_mapping = {}         # review id -> URLs that were processed successfully
url_ind_map = {}         # review id -> position of the review among the stored reviews
img_ind_map = {}         # image URL -> position of its vector in img_list

file_path = 'A2_Data.csv'
data = pd.read_csv(file_path)

base_model = ResNet50(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)

row_num = -1
ind_num = 0
img_num = 0
img_list = []
faulty_urls = []

for index, row in data.iterrows():
    # The 'Image' cell holds a Python list literal; literal_eval parses it
    # without executing arbitrary code that might be present in the CSV.
    image_urls = ast.literal_eval(row['Image'])
    url_list = []
    url_feature_list = []

    row_num += 1
    row_key = row.iloc[0]  # first column of the current row

    for url in image_urls:
        img = download_image(url)
        if img is None:
            faulty_urls.append(url)
            continue

        processed_img = preprocess_image(img)
        features = extract_features(processed_img, model)
        normalized_features = normalize_features(features)
        feature_vector = normalized_features.flatten()

        url_list.append(url)
        url_feature_list.append(feature_vector)

        img_ind_map[url] = img_num
        img_num += 1
        img_list.append(feature_vector)

    if url_list:
        url_mapping[row_key] = url_list
        extracted_features[row_key] = url_feature_list
        url_ind_map[row_key] = ind_num
        # Advance only for reviews that were actually stored, so the index
        # matches the position of the review when iterating extracted_features
        # (rows whose images all failed must not leave gaps in the numbering).
        ind_num += 1

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5
Error downloading https://images-na.ssl-images-amazon.com/images/I/71F3npeHUDL._SY88.jpg: cannot identify image file <_io.BytesIO object at 0x7d40b87327a0>
Error downloading https://images-na.ssl-images-amazon.com/images/I/71wHUWncMGL._SY88.jpg: cannot identify image file <_io.BytesIO object at 0x7d40c0f5b100>
Error downloading https://images-na.ssl-images-amazon.com/images/I/71B8OOE5N8L._SY88.jpg: cannot identify image file <_io.BytesIO object at 0x7d406b9f6a70>
Error downloading https://images-na.ssl-images-amazon.com/images/I/81SX3oAWbNL._SY88.jpg: cannot identify image file <_io.BytesIO object at 0x7d4054145e90>
Error downloading https://images-na.ssl-images-amazon.com/images/I/718niQ1GEwL._SY88.jpg: cannot identify image file <_io.BytesIO object at 0x7d404cf91df0>
Error downloading https://images-na.ssl-images-amazon.com/images/I/61OboZT-kcL._SY8

In [11]:
# Persist the per-review image features to disk.
with open('extracted_features.pkl', 'wb') as out_file:
    pickle.dump(extracted_features, out_file)

In [12]:
# Reload the image features saved above (only unpickle files you created yourself).
with open('extracted_features.pkl', 'rb') as in_file:
    extracted_features = pickle.load(in_file)

In [13]:
# Persist the review-id -> index mapping built during image extraction.
with open('image_index.pkl', 'wb') as out_file:
    pickle.dump(url_ind_map, out_file)

In [14]:
# Reload the review-id -> index mapping from disk.
with open('image_index.pkl', 'rb') as in_file:
    url_ind_map = pickle.load(in_file)

In [15]:
# Persist the image-URL -> index mapping.
with open('img_ind_map.pkl', 'wb') as out_file:
    pickle.dump(img_ind_map, out_file)

In [16]:
# Reload the image-URL -> index mapping from disk.
with open('img_ind_map.pkl', 'rb') as in_file:
    img_ind_map = pickle.load(in_file)

In [17]:
# Show the URLs for which download_image returned None during extraction.
faulty_urls

['https://images-na.ssl-images-amazon.com/images/I/71F3npeHUDL._SY88.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/71wHUWncMGL._SY88.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/71B8OOE5N8L._SY88.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/81SX3oAWbNL._SY88.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/718niQ1GEwL._SY88.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/61OboZT-kcL._SY88.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/710a2Pyh5lL._SY88.jpg',
 'https://images-na.ssl-images-amazon.com/images/I/816NMd0LexL._SY88.jpg']

# Text Feature Extraction

In [117]:
pip install nltk scikit-learn



In [118]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import string

# Download required NLTK data
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

from functools import lru_cache


@lru_cache(maxsize=1)
def _text_resources():
    """Build the stop-word set, lemmatizer and stemmer once and reuse them on later calls."""
    return set(stopwords.words('english')), WordNetLemmatizer(), PorterStemmer()


def preprocess_text(text):
    """Normalise a review into a space-joined string of stemmed tokens.

    Steps: lower-case, tokenize with `word_tokenize`, drop tokens made up
    solely of punctuation characters, drop English stop words, then
    lemmatize and stem each remaining token.

    Parameters
    ----------
    text : object
        The raw review. Anything that is not a string (e.g. a missing value)
        yields an empty string.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer, stemmer = _text_resources()
    punctuation_chars = set(string.punctuation)

    processed = []
    for word in word_tokenize(text.lower()):
        # Per-character test: a substring test against string.punctuation
        # would let multi-character tokens such as "..." or "''" through.
        if all(ch in punctuation_chars for ch in word):
            continue
        if word in stop_words:
            continue
        processed.append(stemmer.stem(lemmatizer.lemmatize(word)))

    return ' '.join(processed)

# Read the dataset and start with empty per-review lookup tables.
file_path = 'A2_Data.csv'
data = pd.read_csv(file_path)

review_mapping, review_ind_map = {}, {}

# row_number= 0
# for index, row in data.iterrows():
#     review= row['Review Text']
#     # print(row_number)
#     # print(review, type(review))
#     if type(review)!=str:
#       review= ""
#     processed_review= preprocess_text(review)
#     extracted_text[row_number]= processed_review
#     row_number+= 1

# Add a column holding the preprocessed form of every review.
data['Processed_Review'] = data['Review Text'].map(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [119]:
# row_number= 0
# for i in data['Processed_Review']:
#     if row_number not in extracted_features.keys():
#         data['Processed_Review'].drop(row_number)
#     row_number+= 1

In [120]:
from collections import Counter
import math

def compute_tf(text):
    """Return the relative frequency of every token in `text`.

    Parameters
    ----------
    text : sequence of hashable
        Tokens of one document.

    Returns
    -------
    collections.Counter
        Maps each distinct token to count / len(text). Empty input gives an
        empty Counter (no division takes place).
    """
    tf_text = Counter(text)
    total = float(len(text))  # hoisted: the same for every token
    for word in tf_text:
        tf_text[word] = tf_text[word] / total
    return tf_text

def compute_idf(doc_list):
    """Compute log(N / document frequency) for every word in `doc_list`.

    Parameters
    ----------
    doc_list : list of dict
        One mapping of word -> weight per document. A word counts as present
        in a document when its weight is greater than 0.

    Returns
    -------
    dict
        word -> math.log(N / df), where N is len(doc_list) and df is the
        number of documents containing the word. Words that never occur with
        a positive weight get 0.0 instead of raising ZeroDivisionError.
        An empty `doc_list` returns an empty dict.

    Notes
    -----
    The vocabulary is the union of the keys of all documents (first-seen
    order), so the first document no longer has to list every word.
    """
    N = len(doc_list)

    doc_freq = {}
    for doc in doc_list:
        for word, val in doc.items():
            doc_freq[word] = doc_freq.get(word, 0) + (1 if val > 0 else 0)

    return {
        word: (math.log(N / float(df)) if df else 0.0)
        for word, df in doc_freq.items()
    }

# Term frequencies for every processed review (whitespace-split tokens).
tf_bow = list(map(compute_tf, (doc.split() for doc in data['Processed_Review'])))

# Vocabulary across all processed reviews.
word_set = {word for doc in data['Processed_Review'] for word in doc.split()}

# The first entry is a pseudo-document mapping every vocabulary word to 0;
# it is part of the list whose length compute_idf uses as N.
idfs = compute_idf([dict.fromkeys(word_set, 0)] + tf_bow)

def compute_tf_idf(tf_bow, idfs):
    """Multiply each term frequency in `tf_bow` by the matching entry of `idfs`.

    Returns a new dict with the same keys as `tf_bow`; a word missing from
    `idfs` raises KeyError.
    """
    return {word: val * idfs[word] for word, val in tf_bow.items()}

tfidf_bow = [compute_tf_idf(doc, idfs) for doc in tf_bow]

# Keep only the reviews that also have image features and number them
# consecutively in DataFrame row order. The first column is read once with
# `data.iloc[:, 0]`; the former `data.iloc[i][0]` relied on the deprecated
# positional fallback of Series[int] and built a row Series per iteration.
ind_num = 0
for review_id, review_tfidf in zip(data.iloc[:, 0], tfidf_bow):
    if review_id in extracted_features:
        review_mapping[review_id] = review_tfidf
        review_ind_map[review_id] = ind_num
        ind_num += 1

In [122]:
# Persist the per-review tf-idf dictionaries.
with open('review_mapping.pkl', 'wb') as out_file:
    pickle.dump(review_mapping, out_file)

In [123]:
# Reload the per-review tf-idf dictionaries from disk.
with open('review_mapping.pkl', 'rb') as in_file:
    review_mapping = pickle.load(in_file)

In [124]:
# Persist the review-id -> text-matrix index mapping.
with open('review_index.pkl', 'wb') as out_file:
    pickle.dump(review_ind_map, out_file)

In [125]:
# Reload the review-id -> text-matrix index mapping from disk.
with open('review_index.pkl', 'rb') as in_file:
    review_ind_map = pickle.load(in_file)

In [126]:
# Spot-check: display the tf-idf weights stored for review id 2078.
review_mapping[2078]

{'great': 0.09503614862678735,
 'price': 0.15384664552402114,
 'good': 0.13173988421046995,
 'qualiti': 0.15817662376824704,
 "n't": 0.09072265408023823,
 'quit': 0.2869182397096245,
 'match': 0.30107649277590764,
 'radiu': 0.48417854088725926,
 'sound': 0.12291939798008174,
 'hole': 0.23880862595672253,
 'close': 0.29512157911072884,
 'enough': 0.21160891057068323}

# Image Retrieval and Text Retrieval

In [127]:
import numpy as np
from numpy.linalg import norm

def cosine_similarity_image(va, vb):
    """Cosine similarity of two vectors; 0 when either has zero norm."""
    norm_a, norm_b = norm(va), norm(vb)
    if norm_a == 0 or norm_b == 0:
        return 0
    return np.dot(va, vb) / (norm_a * norm_b)

def cosine_similarity_text(text1, text2):
    """Cosine similarity of two sparse word -> weight dictionaries.

    The numerator sums the products of weights for words present in both
    dictionaries; the denominator is the product of the two values returned
    by `text_cosine_helper`.
    """
    shared_dot = sum(
        weight * text2[term] for term, weight in text1.items() if term in text2
    )
    length_1 = text_cosine_helper(text1)
    length_2 = text_cosine_helper(text2)
    return shared_dot / (length_1 * length_2)

def text_cosine_helper(text):
    """Return the square root of the sum of squared values of `text`.

    When that sum is 0 (including an empty dict) the function returns 1, so
    callers can divide by the result safely.
    """
    squared_sum = sum(weight * weight for weight in text.values())
    return 1 if squared_sum == 0 else squared_sum ** 0.5

In [128]:
# Pairwise text similarity between all reviews in review_mapping (rounded to 3 decimals).
text_cosine_scores = [
    [
        round(cosine_similarity_text(query_vec, other_vec), 3)
        for other_vec in review_mapping.values()
    ]
    for query_vec in review_mapping.values()
]

In [129]:
# Persist the text similarity matrix.
with open('text_cosine_scores.pkl', 'wb') as out_file:
    pickle.dump(text_cosine_scores, out_file)

In [130]:
# Reload the text similarity matrix from disk.
with open('text_cosine_scores.pkl', 'rb') as in_file:
    text_cosine_scores = pickle.load(in_file)

In [131]:
# One row per image in img_list, one column per review in extracted_features:
# the mean cosine similarity between that image and the review's images (4 decimals).
image_cosine_scores = [
    [
        round(
            sum(cosine_similarity_image(image_vec, other_vec) for other_vec in review_vecs)
            / len(review_vecs),
            4,
        )
        for review_vecs in extracted_features.values()
    ]
    for image_vec in img_list
]

In [132]:
# Persist the image similarity matrix.
with open('image_cosine_scores.pkl', 'wb') as out_file:
    pickle.dump(image_cosine_scores, out_file)

In [133]:
# Reload the image similarity matrix from disk.
with open('image_cosine_scores.pkl', 'rb') as in_file:
    image_cosine_scores = pickle.load(in_file)

In [134]:
# Preview: print the first six rows of the text similarity matrix, values separated by spaces.
for score_row in text_cosine_scores[:6]:
    print(''.join(str(score) + ' ' for score in score_row))

1.0 0.011 0.0 0.022 0.012 0.0 0.0 0.01 0.004 0.0 0.0 0.0 0.028 0.0 0.0 0.015 0.0 0.003 0.009 0.0 0.007 0.0 0.041 0.014 0.08 0.006 0.0 0.007 0.009 0.007 0.011 0.0 0.004 0.0 0.0 0.008 0.009 0.005 0.007 0.006 0.0 0.014 0.038 0.019 0.0 0.025 0.007 0.008 0.016 0.0 0.044 0.0 0.0 0.0 0.006 0.0 0.019 0.013 0.0 0.0 0.04 0.006 0.0 0.011 0.005 0.018 0.008 0.025 0.012 0.01 0.012 0.013 0.01 0.0 0.0 0.016 0.013 0.055 0.0 0.0 0.0 0.009 0.001 0.019 0.013 0.0 0.0 0.013 0.065 0.016 0.0 0.0 0.0 0.0 0.006 0.0 0.014 0.002 0.006 0.002 0.006 0.013 0.008 0.001 0.011 0.0 0.024 0.005 0.012 0.0 0.0 0.029 0.04 0.083 0.033 0.004 0.018 0.003 0.09 0.015 0.0 0.015 0.0 0.0 0.003 0.0 0.007 0.0 0.013 0.004 0.007 0.003 0.0 0.038 0.015 0.007 0.077 0.017 0.006 0.0 0.049 0.026 0.0 0.011 0.014 0.008 0.03 0.068 0.0 0.023 0.0 0.005 0.004 0.01 0.027 0.003 0.009 0.01 0.0 0.01 0.034 0.023 0.0 0.004 0.047 0.019 0.0 0.0 0.021 0.012 0.0 0.005 0.011 0.008 0.008 0.015 0.01 0.0 0.0 0.0 0.005 0.008 0.0 0.0 0.007 0.003 0.012 0.005 0.01 0

In [135]:
# Preview: print the first six rows of the image similarity matrix, values separated by spaces.
for score_row in image_cosine_scores[:6]:
    print(''.join(str(score) + ' ' for score in score_row))

1.0 0.6005 0.5209 0.6699 0.4962 0.3463 0.4331 0.5725 0.4416 0.54 0.4689 0.3736 0.4288 0.3937 0.5338 0.4318 0.5221 0.4588 0.5853 0.3642 0.5735 0.349 0.5427 0.4468 0.6512 0.6464 0.529 0.5145 0.4398 0.4311 0.5608 0.5306 0.5598 0.6006 0.4981 0.2938 0.5474 0.555 0.4548 0.5182 0.4231 0.5925 0.3127 0.5266 0.4399 0.5677 0.5177 0.5503 0.4759 0.5095 0.3636 0.5197 0.65 0.5971 0.5348 0.4794 0.4774 0.628 0.5332 0.5622 0.5828 0.5915 0.7268 0.4907 0.5835 0.3955 0.5196 0.5283 0.4963 0.6053 0.5318 0.5624 0.1923 0.5426 0.5039 0.5684 0.3898 0.404 0.3591 0.5832 0.4479 0.4992 0.4952 0.5476 0.559 0.6671 0.4011 0.5017 0.4332 0.4945 0.5486 0.5777 0.4809 0.4817 0.4458 0.5281 0.4742 0.4984 0.4392 0.5003 0.3886 0.5192 0.5818 0.52 0.6422 0.5348 0.4849 0.4363 0.527 0.4806 0.5191 0.5195 0.383 0.4521 0.562 0.5149 0.5051 0.5183 0.5245 0.5833 0.3681 0.5703 0.587 0.5046 0.5919 0.3646 0.3578 0.5267 0.4281 0.5122 0.3827 0.3191 0.5164 0.4575 0.4131 0.5471 0.5601 0.6284 0.4291 0.5121 0.4651 0.4831 0.3758 0.4617 0.4729 0.51

In [136]:
# Number of rows and columns of the text similarity matrix.
print(len(text_cosine_scores), len(text_cosine_scores[0]))

994 994


In [137]:
# Number of rows (images) and columns (reviews) of the image similarity matrix.
print(len(image_cosine_scores), len(image_cosine_scores[0]))

1640 994


In [138]:
# Ask for the query image URL and its review text; a later cell matches both against the dataset rows.
image_url= input("Enter the url for image: ")
review_text= input("Enter the corresponding review:")

In [139]:
# # processed_img= preprocess_image(image)
# # features= extract_features(processed_img, model)
# # nf= normalize_features(features)

# # # TF calculation
# # preprocessed_text= preprocess_text(review)
# # tf_review= compute_tf(preprocessed_text.split())
# image_url= "https://images-na.ssl-images-amazon.com/images/I/71dVsYejzTL._SY88.jpg"

# review_text= """Works great as a guitar bench mat. Not rugged enough for abuse but if you take care of it, it will take care of you. Makes organization of workspace much easier because screws won't roll around. Color is good too."""

In [141]:
# Find the first dataset row whose image list (second column) contains the
# query URL and whose review (third column) equals the query text.
# review_id stays -1 when nothing matches.
# Columns are read with explicit positional `iloc[:, n]`; the former
# `data.iloc[i][n]` relied on the deprecated positional fallback of Series[int].
review_id = -1
for candidate_id, image_cell, review_cell in zip(
    data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2]
):
    # Skip non-string cells (e.g. missing values) instead of raising on `in`.
    if isinstance(image_cell, str) and image_url in image_cell and review_text == review_cell:
        review_id = candidate_id
        break

In [142]:
# Matched review id (-1 means the query was not found in the dataset).
print(review_id)

1205


In [143]:
# Translate the query into matrix positions: img_ind is later used as a row of
# image_cosine_scores and text_ind as a row of text_cosine_scores.
# Either lookup raises KeyError when the URL / review id is not in its map.
img_ind= img_ind_map[image_url]
text_ind= review_ind_map[review_id]

In [144]:
# Display the query image's index.
img_ind

2

In [145]:
# Display the query review's index.
text_ind

1

In [146]:
# Convert the nested score lists to NumPy arrays so rows can be ranked with np.argsort.
text_cosine_scores= np.array(text_cosine_scores)
image_cosine_scores= np.array(image_cosine_scores)

In [147]:
def top_indices_specific_row(matrix, row_index, k=4):
    """Return the column indices of the `k` largest values in one row.

    Parameters
    ----------
    matrix : array-like
        Indexable by `row_index`; the selected row is ranked with `np.argsort`.
    row_index : int
        Row to inspect.
    k : int, optional
        How many indices to return (default 4, the previous fixed value).
        Rows shorter than `k` return all their indices.

    Returns
    -------
    numpy.ndarray
        Indices ordered by ascending value, so the best match is last.
        Empty when `k` is not positive.
    """
    if k <= 0:
        # `[-0:]` would select the whole row, so handle this case explicitly.
        return np.array([], dtype=np.intp)
    row = matrix[row_index]
    return np.argsort(row)[-k:]

In [148]:
# Best-scoring column indices for the query image row and the query review row
# (ascending order, so the strongest match is the last element).
top_image= top_indices_specific_row(image_cosine_scores, img_ind)
top_texts= top_indices_specific_row(text_cosine_scores, text_ind)

In [149]:
# Display the top text-match indices.
top_texts

array([815, 421,  60,   1])

In [151]:
# Display the top image-match indices.
top_image

array([818, 849, 978,   1])

## Image Retrieval

In [152]:
def helper_print(rid, flag):
    """Return the value in column position `flag` for the first row whose first column equals `rid`.

    Parameters
    ----------
    rid : object
        Value to look for in the first column of the global `data` frame.
    flag : int
        Positional index of the column to return.

    Returns
    -------
    object or None
        The cell value, or None when no row matches.
    """
    # Vectorised comparison with explicit positional indexing; the former
    # row-by-row `data.iloc[i][0]` relied on a deprecated Series[int] fallback.
    matches = (data.iloc[:, 0] == rid).to_numpy()
    if not matches.any():
        return None
    first_pos = int(matches.argmax())  # position of the first True
    return data.iloc[first_pos, flag]

In [153]:
def rid_helper(map, val):
    """Reverse lookup: return the first key of `map` whose value equals `val`, or None.

    The parameter name `map` shadows the builtin; it is kept unchanged so
    existing callers are unaffected.
    """
    return next((key for key, stored in map.items() if stored == val), None)

In [154]:
def print_helper(sorted_rtr):
    """Print every retrieval result as labelled lines followed by a blank line.

    Each entry is indexed at positions 0-4: image URL(s), review text, image
    similarity, text similarity and composite score.
    """
    labels = (
        "Image URL: ",
        "Review: ",
        "Cosine similarity of images - ",
        "Cosine similarity of text - ",
        "Composite similarity score: ",
    )
    for entry in sorted_rtr:
        for position, label in enumerate(labels):
            print(label)
            print(entry[position])
        print()

In [155]:
img_rtr_data = []

# Walk the top image matches from most to least similar.
for t_image_ind in top_image[::-1]:
    # Columns of image_cosine_scores follow the insertion order of
    # extracted_features, and review_ind_map numbers those same reviews
    # consecutively in that order (assuming review ids are unique), so it is
    # the mapping that turns a column index back into its review id.
    rid = rid_helper(review_ind_map, t_image_ind)
    t_text_ind = review_ind_map[rid]

    img_score = image_cosine_scores[img_ind][t_image_ind]
    txt_score = text_cosine_scores[text_ind][t_text_ind]
    cmp_score = (img_score + txt_score) / 2

    # Entry layout expected by print_helper:
    # [image URLs, review text, image score, text score, composite score]
    img_rtr_data.append([
        helper_print(rid, 1),
        helper_print(rid, 2),
        img_score,
        txt_score,
        round(cmp_score, 4),
    ])

In [156]:
# Rank the image-based candidates by composite score (highest first) and print them.
sorted_img_rtr = list(img_rtr_data)
sorted_img_rtr.sort(key=lambda entry: entry[4], reverse=True)

print("USING IMAGE RETRIEVAL")
print()
print_helper(sorted_img_rtr)

USING IMAGE RETRIEVAL

Image URL: 
['https://images-na.ssl-images-amazon.com/images/I/71HSx4Y-5dL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71dVsYejzTL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71domStNfIL._SY88.jpg']
Review: 
Works great as a guitar bench mat. Not rugged enough for abuse but if you take care of it, it will take care of you. Makes organization of workspace much easier because screws won't roll around. Color is good too.
Cosine similarity of images - 
0.7669
Cosine similarity of text - 
1.0
Composite similarity score: 
0.8834

Image URL: 
['https://images-na.ssl-images-amazon.com/images/I/71vdSMvWXFL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/719frGS8o7L._SY88.jpg']
Review: 
This is great sounding at a great price! Is it perfect? No, but it is no less perfect than something costing a lot more and not sounding as good as this pedal. I read one review in which the guy was not satisfied because he said it sounded l

In [157]:
txt_rtr_data = []

# Walk the top text matches from most to least similar.
for t_text_ind in top_texts[::-1]:
    rid = rid_helper(review_ind_map, t_text_ind)
    # Columns of image_cosine_scores follow the insertion order of
    # extracted_features, which is the order review_ind_map numbers the same
    # reviews in (assuming review ids are unique); use it for the image
    # column as well so both scores refer to the same review.
    t_image_ind = review_ind_map[rid]

    img_score = image_cosine_scores[img_ind][t_image_ind]
    txt_score = text_cosine_scores[text_ind][t_text_ind]
    cmp_score = (img_score + txt_score) / 2

    # Entry layout expected by print_helper:
    # [image URLs, review text, image score, text score, composite score]
    txt_rtr_data.append([
        helper_print(rid, 1),
        helper_print(rid, 2),
        img_score,
        txt_score,
        round(cmp_score, 4),
    ])

In [158]:
# Rank the text-based candidates by composite score (highest first) and print them.
sorted_txt_rtr = list(txt_rtr_data)
sorted_txt_rtr.sort(key=lambda entry: entry[4], reverse=True)

print("USING TEXT RETRIEVAL")
print()
print_helper(sorted_txt_rtr)

USING TEXT RETRIEVAL

Image URL: 
['https://images-na.ssl-images-amazon.com/images/I/71HSx4Y-5dL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71dVsYejzTL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71domStNfIL._SY88.jpg']
Review: 
Works great as a guitar bench mat. Not rugged enough for abuse but if you take care of it, it will take care of you. Makes organization of workspace much easier because screws won't roll around. Color is good too.
Cosine similarity of images - 
0.7669
Cosine similarity of text - 
1.0
Composite similarity score: 
0.8834

Image URL: 
['https://images-na.ssl-images-amazon.com/images/I/71vT2-nW7-L._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71Wz9iild8L._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71Z5NiWe6TL._SY88.jpg']
Review: 
The picture doesn't do this bridge justice, I was skeptical of the quality when I ordered this Tremolo bridge but was pleasantly surprised when it arrived. I wasn't so 