In [1]:
import gzip
import pandas as pd
import json

In [2]:
def read_jsonl_gz(file_path):
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    return pd.DataFrame(data)

# Load the review dataset
reviews_df = read_jsonl_gz('/content/drive/MyDrive/Maestria MNA/3er_Trimestre/01_Big Data/04_Week 4/Digital_Music.jsonl.gz')

# Load the product specifications dataset
product_specs_df = read_jsonl_gz('/content/drive/MyDrive/Maestria MNA/3er_Trimestre/01_Big Data/04_Week 4/meta_Digital_Music.jsonl.gz')

In [3]:
reviews_df = reviews_df.sample(n=20000, random_state=42)

In [4]:
product_specs_df = product_specs_df.sample(n=5000, random_state=42)

In [5]:
reviews_df.shape

(20000, 10)

In [6]:
product_specs_df.shape

(5000, 14)

In [7]:
merged_df = pd.merge(reviews_df, product_specs_df, on='parent_asin')

In [8]:
merged_df.head()

Unnamed: 0,rating,title_x,text,images_x,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,rating_number,features,description,price,images_y,videos,store,categories,details,bought_together
0,5.0,"great, dirty, crunchy, catchy, very overlooked...",Some great hooks from this band- the whole alb...,[],B01JTC6KQC,B01JTC6KQC,AGFRM3V53WN6P6B5NZBWRAWHBYCQ,1613231688410,0,False,...,1,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Stanley Format: Audio CD,[],{'Manufacturer': 'Another Planet/Profile Recor...,
1,5.0,FABULOUS SELECTION!,This is one of the better collections out ther...,[],B0009YC9SI,B0009YC9SI,AHZYNY3PLZWGTANJR5TPNUJLITNA,1375985864000,0,False,...,2,[],[This CD was surposed to be about the year 193...,1.6,[{'thumb': 'https://m.media-amazon.com/images/...,[],Format: Audio CD,[],"{'Is Discontinued By Manufacturer': 'No', 'Pac...",
2,5.0,Terrific disc and all the tracks are ALSO in B...,Terrific disc! And don't worry if you can get ...,[],B0009JBMXQ,B0009JBMXQ,AEDCFBGM7K52DBV2C27TB4YS5RUQ,1548954771554,0,False,...,8,[],[4-Song CD Sold Only at Theatre! Betty Buckley...,,[{'thumb': 'https://m.media-amazon.com/images/...,[],Andrew Lloyd Webber Format: Audio CD,[],"{'Is Discontinued By Manufacturer': 'No', 'Pac...",
3,5.0,Phenomenal CD Showcase,"It may only include four songs from ""Sunset Bo...",[],B0009JBMXQ,B0009JBMXQ,AF5B5ZWUGLC27PS7T3V4SLIC4UTQ,1368678836000,2,True,...,8,[],[4-Song CD Sold Only at Theatre! Betty Buckley...,,[{'thumb': 'https://m.media-amazon.com/images/...,[],Andrew Lloyd Webber Format: Audio CD,[],"{'Is Discontinued By Manufacturer': 'No', 'Pac...",
4,5.0,An outstanding book by an outstanding author,This is one of my all time favorite Salman Rus...,[],1436170117,1436170117,AE47AFM4GCXRZHITF3KC2KDT46ZQ,1361795862000,0,True,...,5797,[],[],79.95,[{'thumb': 'https://m.media-amazon.com/images/...,[],Salman Rushdie Format: Audio CD,[],{'Package Dimensions': '7.5 x 6.3 x 2.3 inches...,




In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Combine review text and product description for each product
merged_df['combined_features'] = merged_df['text'] + ' ' + merged_df['title_y']

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the combined features
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['combined_features'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a Series to map 'parent_asin' to product index
indices = pd.Series(merged_df.index, index=merged_df['parent_asin'])

def get_recommendations(parent_asin, cosine_sim=cosine_sim, num_recommendations=5):
    # Get the index of the product that matches the parent_asin
    idx = indices[parent_asin]

    # Get the pairwise similarity scores of all products with that product
    sim_scores = cosine_sim[idx]

    # Get the indices of the most similar products
    product_indices = sim_scores.argsort()[-num_recommendations-1:-1]

    # Return the top most similar products
    return merged_df['parent_asin'].iloc[product_indices]

# Example: Get recommendations for a specific product (e.g., parent_asin = 'B01JTC6KQC')
parent_asin = 'B01JTC6KQC'
recommended_products = get_recommendations(parent_asin)
print("Recommended products:", recommended_products)


Recommended products: 1327    B000J0O7I2
483     B004I00GNY
1188    B003TTL87S
1004    B00000ADK3
1042    B0013VDPMK
Name: parent_asin, dtype: object
