In [2]:
!pip install -U scikit-learn
!pip install scikit-image
!pip install tqdm



In [3]:
import numpy as np
import pandas as pd
import cv2
from skimage import exposure
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from tqdm import tqdm
import urllib.request


In [4]:

# Step 1: Image Preprocessing
def preprocess_image(img):
    # Altering contrast
    img = exposure.adjust_gamma(img, gamma=1.5)
    # Resizing
    img = cv2.resize(img, (224, 224))
    # Random flips
    if np.random.rand() < 0.5:
        img = cv2.flip(img, 1)  # Horizontal flip
    # Brightness and exposure adjustments (optional)
    # img = cv2.convertScaleAbs(img, alpha=1.2, beta=10)  # Example of brightness adjustment
    return img

In [63]:
# Step 2: Selecting a Pre-trained CNN
base_model = models.resnet50(pretrained=True)
base_model = nn.Sequential(*list(base_model.children())[:-1])
base_model.eval()



Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [64]:
# Step 3: Feature Extraction
def extract_features(img_path, model):
    
    try:

        req = urllib.request.urlopen(img_path)
        arr = np.asarray(bytearray(req.read()), dtype=np.uint8)
        img = cv2.imdecode(arr, -1) # 'Load it as it is'

        # img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
        img = preprocess_image(img)
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        img = transform(img)
        img = img.unsqueeze(0)  # Add batch dimension
        with torch.no_grad():
            features = model(img)
        features = features.squeeze().numpy()
        return features
    
    except:
        return None

In [65]:
dataset = pd.read_csv('A2_Data.csv')


In [66]:
dataset

Unnamed: 0.1,Unnamed: 0,Image,Review Text
0,3452,['https://images-na.ssl-images-amazon.com/imag...,Loving these vintage springs on my vintage str...
1,1205,['https://images-na.ssl-images-amazon.com/imag...,Works great as a guitar bench mat. Not rugged ...
2,1708,['https://images-na.ssl-images-amazon.com/imag...,We use these for everything from our acoustic ...
3,2078,['https://images-na.ssl-images-amazon.com/imag...,Great price and good quality. It didn't quite...
4,801,['https://images-na.ssl-images-amazon.com/imag...,I bought this bass to split time as my primary...
...,...,...,...
995,1265,['https://images-na.ssl-images-amazon.com/imag...,Extremely impressed with this kit.
996,1882,['https://images-na.ssl-images-amazon.com/imag...,This is a great stereo reverb with plenty of c...
997,1547,['https://images-na.ssl-images-amazon.com/imag...,I really like the simplicity of this bridge. I...
998,1004,['https://images-na.ssl-images-amazon.com/imag...,"Great Product, but there is no warranty in the..."


In [67]:
index_to_number = dict()

for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
    image_index = row[0]
    index_to_number[index] = image_index


  image_index = row[0]
100%|██████████| 1000/1000 [00:00<00:00, 25246.51it/s]


In [68]:
index_to_image = dict()

image_index = 0
for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
    img_path_1 = row['Image']
    image_path_str = img_path_1.strip('[]')
    image_paths_list = image_path_str.split(', ')
    image_paths_list = [path.strip("'") for path in image_paths_list]

    index_to_image[image_index] = []
    for i in image_paths_list:
        index_to_image[image_index].append(i)

    image_index +=1
    


100%|██████████| 1000/1000 [00:00<00:00, 38508.82it/s]


In [69]:
index_to_image

{0: ['https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg'],
 1: ['https://images-na.ssl-images-amazon.com/images/I/71HSx4Y-5dL._SY88.jpg',
  'https://images-na.ssl-images-amazon.com/images/I/71dVsYejzTL._SY88.jpg',
  'https://images-na.ssl-images-amazon.com/images/I/71domStNfIL._SY88.jpg'],
 2: ['https://images-na.ssl-images-amazon.com/images/I/71Md5ihUFLL._SY88.jpg'],
 3: ['https://images-na.ssl-images-amazon.com/images/I/71Isri9SEaL._SY88.jpg'],
 4: ['https://images-na.ssl-images-amazon.com/images/I/71w8aOdrTuL._SY88.jpg'],
 5: ['https://images-na.ssl-images-amazon.com/images/I/81dxkALs4CL._SY88.jpg',
  'https://images-na.ssl-images-amazon.com/images/I/81VixyruzDL._SY88.jpg',
  'https://images-na.ssl-images-amazon.com/images/I/51IWGFH3IPL._SY88.jpg',
  'https://images-na.ssl-images-amazon.com/images/I/81p1k-2194L._SY88.jpg'],
 6: ['https://images-na.ssl-images-amazon.com/images/I/71cS64LddWL._SY88.jpg'],
 7: ['https://images-na.ssl-images-amazon.com/images/I/71z9b

In [70]:
index_to_review = dict()


for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
    image_index = row[2]
    index_to_review[index] = image_index


  image_index = row[2]
100%|██████████| 1000/1000 [00:00<00:00, 21923.66it/s]


In [71]:
index_to_review

{0: 'Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.',
 1: "Works great as a guitar bench mat. Not rugged enough for abuse but if you take care of it, it will take care of you. Makes organization of workspace much easier because screws won't roll around. Color is good too.",
 2: "We use these for everything from our acoustic bass down to our ukuleles. I know there is a smaller model available for ukes, violins, etc.; we haven't yet ordered those, but these will work on smaller instruments if one doesn't extend the feet to their maximum width. They're gentle on the instruments, and the grippy material keeps them secure.\n\nThe greatest benefit has been when writing music at the computer and needing to set a guitar down to use the keyboard/mouse - just easier for me than a hanging stand.\n\nWe have several and gave one to a friend for Christm

In [72]:
index_to_number

{0: 3452,
 1: 1205,
 2: 1708,
 3: 2078,
 4: 801,
 5: 126,
 6: 1329,
 7: 325,
 8: 245,
 9: 1714,
 10: 1743,
 11: 3710,
 12: 1664,
 13: 394,
 14: 1819,
 15: 672,
 16: 2740,
 17: 2836,
 18: 2453,
 19: 364,
 20: 818,
 21: 3705,
 22: 1890,
 23: 1572,
 24: 649,
 25: 3023,
 26: 527,
 27: 1039,
 28: 2543,
 29: 1191,
 30: 590,
 31: 2357,
 32: 668,
 33: 1648,
 34: 469,
 35: 2331,
 36: 464,
 37: 61,
 38: 2724,
 39: 1686,
 40: 1968,
 41: 526,
 42: 1211,
 43: 3370,
 44: 1713,
 45: 584,
 46: 3609,
 47: 529,
 48: 3255,
 49: 288,
 50: 262,
 51: 3056,
 52: 1953,
 53: 1347,
 54: 2894,
 55: 66,
 56: 2380,
 57: 1538,
 58: 140,
 59: 1108,
 60: 3828,
 61: 3689,
 62: 1606,
 63: 1734,
 64: 2112,
 65: 2401,
 66: 3691,
 67: 2235,
 68: 2493,
 69: 3290,
 70: 2008,
 71: 2829,
 72: 1626,
 73: 5,
 74: 1336,
 75: 1817,
 76: 3506,
 77: 7,
 78: 3044,
 79: 2572,
 80: 3593,
 81: 2497,
 82: 3801,
 83: 1813,
 84: 1540,
 85: 1206,
 86: 3156,
 87: 429,
 88: 2739,
 89: 3830,
 90: 1543,
 91: 744,
 92: 1402,
 93: 1011,
 94: 311

In [73]:

# Preprocess images and extract features
new_image_features = []
image_features = dict()
for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
    image_index = row[0]
    # print(image_index)
    img_path_1 = row['Image']
    image_path_str = img_path_1.strip('[]')
    image_paths_list = image_path_str.split(', ')
    image_paths_list = [path.strip("'") for path in image_paths_list]
    
    # print(image_paths_list)
    image_features[image_index] = []
    for img_path in image_paths_list:
        try:
            features = extract_features(img_path, base_model)
            if features.shape == (2048,):
                image_features[image_index].append(features)
                new_image_features.append(features)
                # print(image_features)
                # print(new_image_features)
        except Exception as e:
            continue




  image_index = row[0]
100%|██████████| 1000/1000 [17:01<00:00,  1.02s/it]


In [30]:
image_features

{3452: [array([0.27560937, 0.60194117, 0.754934  , ..., 0.16488846, 0.60872906,
         0.22635223], dtype=float32)],
 1205: [array([0.69648486, 0.4313286 , 0.68877006, ..., 0.08553925, 0.3681998 ,
         0.47034553], dtype=float32),
  array([0.75058746, 0.18958028, 0.12760516, ..., 0.10283512, 0.84645057,
         0.04751762], dtype=float32),
  array([0.5142772 , 0.51244044, 0.47707045, ..., 0.5209865 , 0.8395843 ,
         0.35326934], dtype=float32)],
 1708: [array([0.38012242, 0.02712757, 0.6850916 , ..., 0.11624838, 0.6173863 ,
         0.16000004], dtype=float32)],
 2078: [array([0.31159744, 0.4188953 , 0.5229862 , ..., 0.07281115, 0.8788534 ,
         0.39201817], dtype=float32)],
 801: [array([0.5459123 , 0.7595159 , 1.0664445 , ..., 0.18190739, 0.5383692 ,
         0.12744005], dtype=float32)],
 126: [array([1.7978957 , 0.32043925, 1.3665035 , ..., 0.14168923, 0.1333295 ,
         0.3691537 ], dtype=float32),
  array([1.5082341 , 0.5774796 , 0.7863694 , ..., 0.16180122, 0.0

In [31]:
new_image_features

[array([0.27560937, 0.60194117, 0.754934  , ..., 0.16488846, 0.60872906,
        0.22635223], dtype=float32),
 array([0.69648486, 0.4313286 , 0.68877006, ..., 0.08553925, 0.3681998 ,
        0.47034553], dtype=float32),
 array([0.75058746, 0.18958028, 0.12760516, ..., 0.10283512, 0.84645057,
        0.04751762], dtype=float32),
 array([0.5142772 , 0.51244044, 0.47707045, ..., 0.5209865 , 0.8395843 ,
        0.35326934], dtype=float32),
 array([0.38012242, 0.02712757, 0.6850916 , ..., 0.11624838, 0.6173863 ,
        0.16000004], dtype=float32),
 array([0.31159744, 0.4188953 , 0.5229862 , ..., 0.07281115, 0.8788534 ,
        0.39201817], dtype=float32),
 array([0.5459123 , 0.7595159 , 1.0664445 , ..., 0.18190739, 0.5383692 ,
        0.12744005], dtype=float32),
 array([1.7978957 , 0.32043925, 1.3665035 , ..., 0.14168923, 0.1333295 ,
        0.3691537 ], dtype=float32),
 array([1.5082341 , 0.5774796 , 0.7863694 , ..., 0.16180122, 0.02562088,
        0.0155558 ], dtype=float32),
 array([2.

In [32]:
extracted_features = np.array(new_image_features)
mean = np.mean(extracted_features, axis=0)
std = np.std(extracted_features, axis=0)
# normalized_features = (extracted_features - mean) / std

In [33]:
print(mean)

[0.4736042  0.39685947 0.3912752  ... 0.24563675 0.51769507 0.3323246 ]


In [34]:
print(std)

[0.32994372 0.3464858  0.29241243 ... 0.21378873 0.45305625 0.26721624]


In [37]:
normalized_features_dict = dict()
for i in image_features:
    normalized_features_dict[i] = []
    for j in image_features[i]:
        j = np.array(j)
        print(j)
        normalized_features_dict[i].append((j - mean) / std)

[0.27560937 0.60194117 0.754934   ... 0.16488846 0.60872906 0.22635223]
[0.69648486 0.4313286  0.68877006 ... 0.08553925 0.3681998  0.47034553]
[0.75058746 0.18958028 0.12760516 ... 0.10283512 0.84645057 0.04751762]
[0.5142772  0.51244044 0.47707045 ... 0.5209865  0.8395843  0.35326934]
[0.38012242 0.02712757 0.6850916  ... 0.11624838 0.6173863  0.16000004]
[0.31159744 0.4188953  0.5229862  ... 0.07281115 0.8788534  0.39201817]
[0.5459123  0.7595159  1.0664445  ... 0.18190739 0.5383692  0.12744005]
[1.7978957  0.32043925 1.3665035  ... 0.14168923 0.1333295  0.3691537 ]
[1.5082341  0.5774796  0.7863694  ... 0.16180122 0.02562088 0.0155558 ]
[2.0794737  0.55688345 0.78360575 ... 0.40662152 0.07597207 0.10914455]
[1.1646376  0.2654516  0.7812614  ... 0.00357867 0.06182662 0.01296882]
[0.7798044  0.01953565 0.82690346 ... 0.36494425 0.90894556 0.14576328]
[1.0878496  1.1471065  0.83133113 ... 0.3851972  0.5177554  0.3015761 ]
[0.6001371  1.3162329  0.37268904 ... 0.34675905 0.11200014 0.29

In [36]:
normalized_features_dict

{3452: [array([-0.6000867 ,  0.59189063,  1.2436503 , ..., -0.37770134,
          0.2009331 , -0.39657906], dtype=float32)],
 1205: [array([ 0.6755112 ,  0.09948208,  1.0173811 , ..., -0.74885845,
         -0.32997066,  0.516514  ], dtype=float32),
  array([ 0.8394864 , -0.59823287, -0.9017059 , ..., -0.66795677,
          0.72563946, -1.0658296 ], dtype=float32),
  array([0.12327259, 0.3335807 , 0.29340494, ..., 1.2879525 , 0.710484  ,
         0.07838126], dtype=float32)],
 1708: [array([-0.28332645, -1.0670911 ,  1.0048014 , ..., -0.605216  ,
          0.22004157, -0.64488804], dtype=float32)],
 2078: [array([-0.49101335,  0.0635981 ,  0.45042878, ..., -0.8083944 ,
          0.79715997,  0.22339052], dtype=float32)],
 801: [array([ 0.2191529 ,  1.0466703 ,  2.3089626 , ..., -0.29809505,
          0.04563254, -0.76673687], dtype=float32)],
 126: [array([ 4.013689  , -0.22055802,  3.3351123 , ..., -0.48621607,
         -0.8483837 ,  0.13782513], dtype=float32),
  array([ 3.135777  ,  

In [13]:
import pickle

In [39]:
# Save extracted features using pickle
with open('extracted_normalized_features.pkl', 'wb') as f:
    pickle.dump(normalized_features_dict, f)


In [14]:
# Load extracted features using pickle
with open('extracted_normalized_features.pkl', 'rb') as f:
    normalized_extracted_features_loaded = pickle.load(f)


In [15]:
normalized_extracted_features_loaded

{3452: [array([-0.6000867 ,  0.59189063,  1.2436503 , ..., -0.37770134,
          0.2009331 , -0.39657906], dtype=float32)],
 1205: [array([ 0.6755112 ,  0.09948208,  1.0173811 , ..., -0.74885845,
         -0.32997066,  0.516514  ], dtype=float32),
  array([ 0.8394864 , -0.59823287, -0.9017059 , ..., -0.66795677,
          0.72563946, -1.0658296 ], dtype=float32),
  array([0.12327259, 0.3335807 , 0.29340494, ..., 1.2879525 , 0.710484  ,
         0.07838126], dtype=float32)],
 1708: [array([-0.28332645, -1.0670911 ,  1.0048014 , ..., -0.605216  ,
          0.22004157, -0.64488804], dtype=float32)],
 2078: [array([-0.49101335,  0.0635981 ,  0.45042878, ..., -0.8083944 ,
          0.79715997,  0.22339052], dtype=float32)],
 801: [array([ 0.2191529 ,  1.0466703 ,  2.3089626 , ..., -0.29809505,
          0.04563254, -0.76673687], dtype=float32)],
 126: [array([ 4.013689  , -0.22055802,  3.3351123 , ..., -0.48621607,
         -0.8483837 ,  0.13782513], dtype=float32),
  array([ 3.135777  ,  

In [16]:
link_to_features = dict()


for i in index_to_image:
    if normalized_extracted_features_loaded[index_to_number[i]] != []:
        count=0
        for j in index_to_image[i]:
            print(index_to_number[i])
            print(normalized_extracted_features_loaded[index_to_number[i]][count])
            link_to_features[j] = normalized_extracted_features_loaded[index_to_number[i]][count]
            count+=1

        

3452
[-0.6000867   0.59189063  1.2436503  ... -0.37770134  0.2009331
 -0.39657906]
1205
[ 0.6755112   0.09948208  1.0173811  ... -0.74885845 -0.32997066
  0.516514  ]
1205
[ 0.8394864  -0.59823287 -0.9017059  ... -0.66795677  0.72563946
 -1.0658296 ]
1205
[0.12327259 0.3335807  0.29340494 ... 1.2879525  0.710484   0.07838126]
1708
[-0.28332645 -1.0670911   1.0048014  ... -0.605216    0.22004157
 -0.64488804]
2078
[-0.49101335  0.0635981   0.45042878 ... -0.8083944   0.79715997
  0.22339052]
801
[ 0.2191529   1.0466703   2.3089626  ... -0.29809505  0.04563254
 -0.76673687]
126
[ 4.013689   -0.22055802  3.3351123  ... -0.48621607 -0.8483837
  0.13782513]
126
[ 3.135777    0.5212916   1.3511539  ... -0.39214194 -1.0861217
 -1.18544   ]
126
[ 4.867101    0.46184862  1.3417027  ...  0.75300866 -0.9749849
 -0.8352039 ]
126
[ 2.0943978 -0.379259   1.3336854 ... -1.1322303 -1.0062072 -1.1951212]
1329
[ 0.9280377 -1.0890023  1.4897734 ...  0.5580626  0.8635804 -0.6981661]
325
[ 1.8616673e+00  2

In [17]:
link_to_features

{'https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg': array([-0.6000867 ,  0.59189063,  1.2436503 , ..., -0.37770134,
         0.2009331 , -0.39657906], dtype=float32),
 'https://images-na.ssl-images-amazon.com/images/I/71HSx4Y-5dL._SY88.jpg': array([ 0.6755112 ,  0.09948208,  1.0173811 , ..., -0.74885845,
        -0.32997066,  0.516514  ], dtype=float32),
 'https://images-na.ssl-images-amazon.com/images/I/71dVsYejzTL._SY88.jpg': array([ 0.8394864 , -0.59823287, -0.9017059 , ..., -0.66795677,
         0.72563946, -1.0658296 ], dtype=float32),
 'https://images-na.ssl-images-amazon.com/images/I/71domStNfIL._SY88.jpg': array([0.12327259, 0.3335807 , 0.29340494, ..., 1.2879525 , 0.710484  ,
        0.07838126], dtype=float32),
 'https://images-na.ssl-images-amazon.com/images/I/71Md5ihUFLL._SY88.jpg': array([-0.28332645, -1.0670911 ,  1.0048014 , ..., -0.605216  ,
         0.22004157, -0.64488804], dtype=float32),
 'https://images-na.ssl-images-amazon.com/images/I/71Isr

In [18]:
!pip install nltk



In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to C:\Users\Aishani
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Aishani
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Aishani
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Aishani
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [22]:
def preprocess_text(text):
    # Lowercase the text

    text = str(text).lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove punctuations
    tokens = [token for token in tokens if token not in string.punctuation]

    # Remove blank space tokens
    tokens = [token for token in tokens if token.strip()]

    #stemming and lemmatization
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed_tokens = [porter.stem(token) for token in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    
    return lemmatized_tokens

In [23]:
count = 1
tokens_in_doc = []
total_tokens = set()
for i in dataset['Review Text']:
    print(count)
    print(preprocess_text(i))
    for j in preprocess_text(i):
        total_tokens.add(j)
    tokens_in_doc.append(preprocess_text(i))
    count += 1

1
['love', 'vintag', 'spring', 'vintag', 'strat', 'good', 'tension', 'great', 'stabil', 'float', 'bridg', 'want', 'spring', 'way', 'go']
2
['work', 'great', 'guitar', 'bench', 'mat', 'rug', 'enough', 'abus', 'take', 'care', 'take', 'care', 'make', 'organ', 'workspac', 'much', 'easier', 'screw', 'wo', "n't", 'roll', 'around', 'color', 'good']
3
['use', 'everyth', 'acoust', 'bass', 'ukulel', 'know', 'smaller', 'model', 'avail', 'uke', 'violin', 'etc', "n't", 'yet', 'order', 'work', 'smaller', 'instrument', 'one', "n't", 'extend', 'foot', 'maximum', 'width', "'re", 'gentl', 'instrument', 'grippi', 'materi', 'keep', 'secur', 'greatest', 'benefit', 'write', 'music', 'comput', 'need', 'set', 'guitar', 'use', 'keyboard/mous', 'easier', 'hang', 'stand', 'sever', 'gave', 'one', 'friend', 'christma', 'well', "'ve", 'use', 'mine', 'stage', 'fold', 'small', 'enough', 'fit', 'right', 'gig', 'bag']
4
['great', 'price', 'good', 'qualiti', "n't", 'quit', 'match', 'radiu', 'sound', 'hole', 'close', 'en

In [24]:
tokens_in_doc

[['love',
  'vintag',
  'spring',
  'vintag',
  'strat',
  'good',
  'tension',
  'great',
  'stabil',
  'float',
  'bridg',
  'want',
  'spring',
  'way',
  'go'],
 ['work',
  'great',
  'guitar',
  'bench',
  'mat',
  'rug',
  'enough',
  'abus',
  'take',
  'care',
  'take',
  'care',
  'make',
  'organ',
  'workspac',
  'much',
  'easier',
  'screw',
  'wo',
  "n't",
  'roll',
  'around',
  'color',
  'good'],
 ['use',
  'everyth',
  'acoust',
  'bass',
  'ukulel',
  'know',
  'smaller',
  'model',
  'avail',
  'uke',
  'violin',
  'etc',
  "n't",
  'yet',
  'order',
  'work',
  'smaller',
  'instrument',
  'one',
  "n't",
  'extend',
  'foot',
  'maximum',
  'width',
  "'re",
  'gentl',
  'instrument',
  'grippi',
  'materi',
  'keep',
  'secur',
  'greatest',
  'benefit',
  'write',
  'music',
  'comput',
  'need',
  'set',
  'guitar',
  'use',
  'keyboard/mous',
  'easier',
  'hang',
  'stand',
  'sever',
  'gave',
  'one',
  'friend',
  'christma',
  'well',
  "'ve",
  'use',
 

In [26]:
def tf(tokens_in_doc, total_tokens):
    tf_dict = {}
    for i in tokens_in_doc:
        for j in i:
            if j in tf_dict:
                tf_dict[j] += 1
            else:
                tf_dict[j] = 1
    for key in tf_dict:
        tf_dict[key] = tf_dict[key] / len(total_tokens)
    return tf_dict

In [44]:
tf(tokens_in_doc, total_tokens)

{'love': 0.02799285288862418,
 'vintag': 0.004367679174111574,
 'spring': 0.0019853087155052612,
 'strat': 0.01171332142148104,
 'good': 0.047845940043676795,
 'tension': 0.001786777843954735,
 'great': 0.08278737343656939,
 'stabil': 0.0015882469724042088,
 'float': 0.0005955926146515784,
 'bridg': 0.008933889219773675,
 'want': 0.026801667659321026,
 'way': 0.015485407980941036,
 'go': 0.026801667659321026,
 'work': 0.0547945205479452,
 'guitar': 0.095096287472702,
 'bench': 0.0003970617431010522,
 'mat': 0.0005955926146515784,
 'rug': 0.002183839587055787,
 'enough': 0.01687512408179472,
 'abus': 0.0005955926146515784,
 'take': 0.014691284494738933,
 'care': 0.006750049632717888,
 'make': 0.029382568989477865,
 'organ': 0.0011911852293031567,
 'workspac': 0.0001985308715505261,
 'much': 0.026206075044669448,
 'easier': 0.0039706174310105225,
 'screw': 0.02243398848520945,
 'wo': 0.006750049632717888,
 "n't": 0.09747865793130832,
 'roll': 0.0015882469724042088,
 'around': 0.014889815

In [27]:
def idf(tokens_in_doc, total_tokens):

    no_of_docs = len(tokens_in_doc)
    idf_dict = {}
    for i in total_tokens:
        count = 0
        for j in tokens_in_doc:
            if i in j:
                count += 1
        idf_dict[i] = np.log(no_of_docs/count)
    return idf_dict

In [46]:
idf(tokens_in_doc, total_tokens)

{'usag': 6.907755278982137,
 'nylon': 4.961845129926823,
 'earthquak': 6.907755278982137,
 'cheep': 6.907755278982137,
 'nan': 6.907755278982137,
 'ask': 4.509860006183766,
 'back-up': 6.907755278982137,
 'effector': 6.907755278982137,
 'wise': 5.809142990314028,
 'emu': 6.907755278982137,
 'goop': 6.907755278982137,
 'rectangl': 6.907755278982137,
 'sigh': 6.907755278982137,
 'long-term': 6.907755278982137,
 'configur': 5.298317366548036,
 'fond': 6.907755278982137,
 'addit': 4.017383521085972,
 'stronger': 6.907755278982137,
 '7/19/17': 6.907755278982137,
 'strong': 4.017383521085972,
 '.10': 6.907755278982137,
 'pocket': 4.509860006183766,
 'lamb': 6.907755278982137,
 'hb': 6.907755278982137,
 'panel': 5.809142990314028,
 'jack': 4.017383521085972,
 'freez': 6.907755278982137,
 'pentagram': 6.907755278982137,
 'boost': 5.115995809754082,
 'st-12': 6.907755278982137,
 'ground/bar': 6.907755278982137,
 'game': 5.809142990314028,
 'muggi': 6.907755278982137,
 'flat': 3.912023005428146,

In [28]:
#2b - TF-IDF

def tf_idf(tokens_in_doc, total_tokens):
    tf_dict = tf(tokens_in_doc, total_tokens)
    idf_dict = idf(tokens_in_doc, total_tokens)
    tf_idf_dict = {}
    for i in total_tokens:
        tf_idf_dict[i] = tf_dict[i] * idf_dict[i]
    return tf_idf_dict

In [29]:
tf_idf = tf_idf(tokens_in_doc, total_tokens)

In [30]:
tf_idf

{'everyon': 0.008416671890969478,
 'textur': 0.002467583124249431,
 'dug': 0.002467583124249431,
 'meteor': 0.0013714026759940711,
 'lm2596': 0.0013714026759940711,
 'rode': 0.006577082689532158,
 '5': 0.0323988556228174,
 'area': 0.010536863917079541,
 'news': 0.002467583124249431,
 'moveabl': 0.0013714026759940711,
 'reverend': 0.0013714026759940711,
 'bbe': 0.0013714026759940711,
 'benefit': 0.004384721793021438,
 'blast': 0.0034598826624860205,
 'mean': 0.010744157251182291,
 '.13': 0.0013714026759940711,
 'hidden': 0.019317583847979155,
 'unlik': 0.002467583124249431,
 'fx': 0.0034598826624860205,
 'wand': 0.0013714026759940711,
 'strang': 0.0034598826624860205,
 'tha': 0.0013714026759940711,
 'pride': 0.002467583124249431,
 'luck': 0.002467583124249431,
 '8-space': 0.0013714026759940711,
 'thursday': 0.0013714026759940711,
 'super': 0.02689635793237694,
 'practic': 0.024827407322194097,
 'honestli': 0.008416671890969478,
 'w/': 0.006919765324972041,
 'tascam': 0.00370137468637414

In [31]:
# Save extracted features using pickle
with open('tf_idf.pkl', 'wb') as f:
    pickle.dump(tf_idf, f)

In [25]:
with open('tf_idf.pkl', 'rb') as f:
    tf_idf_loaded = pickle.load(f)

In [26]:
tf_idf_loaded

{'everyon': 0.008416671890969478,
 'textur': 0.002467583124249431,
 'dug': 0.002467583124249431,
 'meteor': 0.0013714026759940711,
 'lm2596': 0.0013714026759940711,
 'rode': 0.006577082689532158,
 '5': 0.0323988556228174,
 'area': 0.010536863917079541,
 'news': 0.002467583124249431,
 'moveabl': 0.0013714026759940711,
 'reverend': 0.0013714026759940711,
 'bbe': 0.0013714026759940711,
 'benefit': 0.004384721793021438,
 'blast': 0.0034598826624860205,
 'mean': 0.010744157251182291,
 '.13': 0.0013714026759940711,
 'hidden': 0.019317583847979155,
 'unlik': 0.002467583124249431,
 'fx': 0.0034598826624860205,
 'wand': 0.0013714026759940711,
 'strang': 0.0034598826624860205,
 'tha': 0.0013714026759940711,
 'pride': 0.002467583124249431,
 'luck': 0.002467583124249431,
 '8-space': 0.0013714026759940711,
 'thursday': 0.0013714026759940711,
 'super': 0.02689635793237694,
 'practic': 0.024827407322194097,
 'honestli': 0.008416671890969478,
 'w/': 0.006919765324972041,
 'tascam': 0.00370137468637414

In [27]:
def create_review_tfidf_matrices(reviews_token, tfidf_tokens):
    review_tfidf_matrices = []

    for review in reviews_token:
        tfidf_vector = np.zeros(len(tfidf_tokens))
        
        for token in review:
            if token in tfidf_tokens:
                token_index = list(tfidf_tokens.keys()).index(token)
                tfidf_vector[token_index] = tfidf_tokens[token]

        tfidf_matrix = np.reshape(tfidf_vector, (1, -1))
        review_tfidf_matrices.append(tfidf_matrix)

    return review_tfidf_matrices

In [28]:
create_review_tfidf_matrices(tokens_in_doc, tf_idf_loaded)

[array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0.00841667, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),


In [29]:
with open('review_tfidf_matrices.pkl', 'wb') as f:
    pickle.dump(create_review_tfidf_matrices(tokens_in_doc, tf_idf_loaded), f)

In [30]:
with open('review_tfidf_matrices.pkl', 'rb') as f:
    review_tfidf_matrices_loaded = pickle.load(f)

In [31]:
review_tfidf_matrices_loaded

[array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0.00841667, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),


Question 3

In [32]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [33]:
def calculate_similarity(input_image, extracted_features):
    similarities = {}
    input_features = extracted_features[input_image]
    for image, features in extracted_features.items():
        if image != input_image:
            # print(image, features)
            similarity = cosine_similarity([input_features], [features])[0][0]
            similarities[image] = similarity
    return similarities

def retrieve_top_similar_images(similarities, top_n=3):
    similar_images = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return similar_images


In [34]:
def similar_reviews(input_review_tokens, tf_idf_dict, review_tfidf_matrices):

    tfidf_vector = np.zeros(len(tf_idf_dict))
        
    for token in input_review_tokens:
        if token in tf_idf_dict:
            token_index = list(tf_idf_dict.keys()).index(token)
            tfidf_vector[token_index] = tf_idf_dict[token]

    input_matrix = np.reshape(tfidf_vector, (1, -1))

    similarities = []

    for review_matrix in review_tfidf_matrices:
        # print(input_matrix, review_matrix)
        similarity = np.dot(input_matrix, review_matrix.T) / (np.linalg.norm(input_matrix) * np.linalg.norm(review_matrix))        
        similarities.append(similarity[0][0])
        # print(similarity)

    
    #get indices of top 3 most similar reviews
    top_similarities = np.argsort(similarities)

    return top_similarities, similarities

    

In [35]:
input_image = input("Enter the image link: ")
review = input("Enter a review: ")

input_preprocessed = preprocess_text(review)
print(input_preprocessed)

for i in index_to_image:
    if input_image in index_to_image[i]:

        for j in range(i+1):
            if j == input_image:
                break

        input_image_group_features = normalized_extracted_features_loaded[index_to_number[i]]
        # print(index_to_number[i])
        break

image_similiarities = calculate_similarity(input_image, link_to_features)

top_review_similarities, review_similarities = similar_reviews(input_preprocessed, tf_idf_loaded, review_tfidf_matrices_loaded)

n = int(input("Enter the number of top documents you want to receive: "))

#----------------------------------------------------------------------------------------------

print("USING IMAGE RETRIEVAL")

top_image_similarities = retrieve_top_similar_images(image_similiarities, n)
print(top_image_similarities)
indexes_of_similar_images = []
index_to_cosine_similarity = dict()

for i in index_to_image:
    index_to_cosine_similarity[i] = []
    for j in index_to_image[i]:
        if j in image_similiarities:
            index_to_cosine_similarity[i].append(image_similiarities[j])


for i in top_image_similarities:
    # print(i)
    for j in index_to_image:
        if i[0] in index_to_image[j]:
            for k in range(len(index_to_image[j])):
                if index_to_image[j][k] == i[0]:
                    indexes_of_similar_images.append([j, k])
              
                    
print(indexes_of_similar_images)
    
#print images and reviews in pairs
for i in indexes_of_similar_images:
    print(index_to_number[i[0]])
    print("Image URL: ", index_to_image[i[0]])
    print("Review: ", index_to_review[i[0]])
    print("Cosine similarity of image: ", index_to_cosine_similarity[i[0]][i[1]])
    print("Cosine similarity of text: ", review_similarities[i[0]])
    print("Composite similarity score: ", (index_to_cosine_similarity[i[0]][i[1]] + review_similarities[i[0]])/2)

#----------------------------------------------------------------------------------------------

print("USING REVIEW RETRIEVAL")

indexes_of_similar_reviews = []

for i in range(1, n+1):
    print(top_review_similarities[-1*i])
    indexes_of_similar_reviews.append(top_review_similarities[-1*i])

for i in indexes_of_similar_reviews:
    print(index_to_number[i])
    print("Image URL: ", index_to_image[i])
    print("Review: ", index_to_review[i])
    if index_to_cosine_similarity[i] == []:
        print("Cosine similarity of image: ", 0)
    else:
        print("Cosine similarity of image: ", index_to_cosine_similarity[i][0])
    print("Cosine similarity of text: ", review_similarities[i])
    if index_to_cosine_similarity[i] == []:
        print("Composite similarity score: ", review_similarities[i])
    else:
        print("Composite similarity score: ", (index_to_cosine_similarity[i][0] + review_similarities[i])/2)
    



['use', 'fender', 'lock', 'tuner', 'five', 'year', 'variou', 'strat', 'tele', 'definit', 'help', 'tune', 'stabil', 'way', 'faster', 'restr', 'break']
USING IMAGE RETRIEVAL
[('https://images-na.ssl-images-amazon.com/images/I/719-SDMiOoL._SY88.jpg', 0.6230473), ('https://images-na.ssl-images-amazon.com/images/I/61n284XL9HL._SY88.jpg', 0.5665882), ('https://images-na.ssl-images-amazon.com/images/I/71dCrR30OvL._SY88.jpg', 0.46467492), ('https://images-na.ssl-images-amazon.com/images/I/51OFdOanSXL._SY88.jpg', 0.41581237)]
[[655, 0], [578, 0], [541, 0], [997, 0]]
643
Image URL:  ['https://images-na.ssl-images-amazon.com/images/I/719-SDMiOoL._SY88.jpg']
Review:  These locking tuners look great and keep tune.  Good quality materials and construction.  Excellent upgrade to any guitar.  I had to drill additions holes for installation.  If your neck already comes with pre-drilled holes, then they should drop right in, otherwise you will need to buy a guitar tuner pin drill jig, also available fro

In [48]:
composite_similarity_images = dict()
composite_similarity_reviews = dict()

for i in indexes_of_similar_images:
    composite_similarity_images[index_to_number[i[0]]] = (index_to_cosine_similarity[i[0]][i[1]] + review_similarities[i[0]])/2
    
for i in indexes_of_similar_reviews:
    if index_to_cosine_similarity[i] == []:
        composite_similarity_reviews[index_to_number[i]] = review_similarities[i]
    else:
        composite_similarity_reviews[index_to_number[i]] = (index_to_cosine_similarity[i][0] + review_similarities[i])/2

In [51]:
composite_similarity_reviews

dict(sorted(composite_similarity_reviews.items(), key=lambda item: item[1], reverse=True))

{654: 1.0000000000000002,
 2486: 0.40172361254307026,
 1012: 0.2830697113843314,
 784: 0.256640144820117}

In [52]:
composite_similarity_images

dict(sorted(composite_similarity_images.items(), key=lambda item: item[1], reverse=True))

{173: 0.37690353863092074,
 643: 0.3736387168523741,
 1547: 0.33806406010012613,
 647: 0.33652332688276626}

In [56]:
combined_dict = {}
combined_dict.update(composite_similarity_reviews)
combined_dict.update(composite_similarity_images)
sorted_combined_dict = dict(sorted(combined_dict.items(), reverse= True, key=lambda item: item[1]))

sorted_combined_dict

{654: 1.0000000000000002,
 2486: 0.40172361254307026,
 173: 0.37690353863092074,
 643: 0.3736387168523741,
 1547: 0.33806406010012613,
 647: 0.33652332688276626,
 1012: 0.2830697113843314,
 784: 0.256640144820117}

In [60]:
print("Ranked combined scores: ")
for i in sorted_combined_dict:
    for j in index_to_number:
        if i == index_to_number[j]:
            print("Image URL: ", index_to_image[j])
            print("Review: ", index_to_review[j])
            print("Composite similarity score: ", sorted_combined_dict[i])
            print("Cosine similarity of image: ", index_to_cosine_similarity[j])
            print("Cosine similarity of text: ", review_similarities[j])

Ranked combined scores: 
Image URL:  ['https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg']
Review:  I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.
Composite similarity score:  1.0000000000000002
Cosine similarity of image:  []
Cosine similarity of text:  1.0000000000000002
Image URL:  ['https://images-na.ssl-images-amazon.com/images/I/81p58EEtEpL._SY88.jpg']
Review:  awesome! i had the old mini, but this orientation had more useful space! pictured here is my board.
Composite similarity score:  0.40172361254307026
Cosine similarity of image:  [0.24182291]
Cosine similarity of text:  0.5616243117970704
Image URL:  ['https://images-na.ssl-images-amazon.com/images/I/71dCrR30OvL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/81DND23Cm6L._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71Uu1cqMN-L._SY88.jpg']
Review

In [36]:
index_to_cosine_similarity

{0: [0.026796386],
 1: [-0.06951091, 0.09186613, 0.048264265],
 2: [0.06406147],
 3: [0.021613145],
 4: [-0.19952115],
 5: [0.018464338, 0.06253382, -0.0522016, -0.003764132],
 6: [0.0019053193],
 7: [-0.06043921, 0.035404578, 0.075741984, 0.110619105],
 8: [-0.102647044, -0.066792935, 0.07180886, -0.07265087],
 9: [0.12892812, 0.0035154605, -0.033366792],
 10: [-0.09870748, -0.052618764, -0.014653893],
 11: [0.0135776065],
 12: [0.115117356],
 13: [0.06823488],
 14: [-0.05148933, 0.067657575, -0.014398985, -0.07929986],
 15: [-0.06326409, -0.14577004],
 16: [0.014569651, -0.01065112],
 17: [0.017073208],
 18: [-0.061586984],
 19: [-0.19420381],
 20: [-0.14933692],
 21: [-0.17215446, -0.22947384, -0.19229645],
 22: [0.16928838],
 23: [0.12623703, 0.024332488],
 24: [0.059269693],
 25: [0.11520362, 0.26751512],
 26: [-0.08828735],
 27: [0.033193998, 0.0212331, 0.14831617],
 28: [-0.19200134, -0.042399496],
 29: [-0.08609671],
 30: [-0.04786461],
 31: [-0.09953095],
 32: [-0.06837633],
 

In [37]:
with open ('image_similarities.pkl', 'wb') as f:
    pickle.dump(image_similiarities, f)

In [38]:
with open('review_similarities.pkl', 'wb') as f:
    pickle.dump(review_similarities, f)

In [39]:
with open('index_to_cosine_similarity.pkl', 'wb') as f:
    pickle.dump(index_to_cosine_similarity, f)

In [40]:
with open('image_similarities.pkl', 'rb') as f:
    image_similarities_loaded = pickle.load(f)

In [41]:
with open('review_similarities.pkl', 'rb') as f:
    review_similarities_loaded = pickle.load(f)

In [42]:
with open('index_to_cosine_similarity.pkl', 'rb') as f:
    index_to_cosine_similarity_loaded = pickle.load(f)