In [2]:
import pandas as pd
import numpy as np
import json  
import uuid



# Preparation
1. Caricamento del file csv 
2. Aggiungere i tipi alle colonne
3. Filtro e pulizia dei dati
    - Prodotti con un review_upvote > 1 
    - Prodotti con un titolo non nullo
    - Prodotti con una marca non nulla



In [3]:

# Load the review dump and keep one upvoted review per product.
csv_path = "./data/Reviews_With_Metadata.csv"
# The upvote column is read as text and converted explicitly below; the
# original author's note says that skipping this step raised an error.
dtype_specification = {'review_upvote': str}
df = pd.read_csv(csv_path, dtype=dtype_specification)

# NOTE(review): errors='coerce' turns every value that does not parse as a
# number into NaN, and NaN never satisfies "> 1", so such rows are silently
# excluded by the filter below. Confirm that this is intended (for example
# for counts written with a thousands separator).
df['review_upvote'] = pd.to_numeric(df['review_upvote'], errors='coerce')

# Row filters: more than one upvote, and both product title and brand present.
has_enough_upvotes = df['review_upvote'] > 1
has_title = df['product_title'].notnull()
has_brand = df['product_brand'].notnull()

# Keep the first matching review of each product. The explicit .copy() makes
# filtered_df an independent frame, so adding a column below cannot trigger
# pandas' SettingWithCopyWarning.
filtered_df = (
    df[has_enough_upvotes & has_title & has_brand]
    .drop_duplicates(subset='product_asin')
    .copy()
)

# Build an identifier by joining the reviewer id and the product ASIN.
filtered_df['unique_identifier'] = "Pair_" + filtered_df['reviewer_id'] + "-" + filtered_df['product_asin']


filtered_df.head()


Unnamed: 0,product_asin,reviewer_id,reviewer_name,review_score,review_time,review_time_unix,review_summary,review_upvote,review_text,product_title,product_brand,unique_identifier
28,B011KQVV3C,A3RWJGUQ4DO775,Just Me,5,2016-08-26 00:00:00,1472169600,Make a Mark!,3.0,I bought these to draw a bright washable menu ...,"Crafty Croc Liquid Chalk Markers, 8 Pack Brigh...",Crafty Croc,Pair_A3RWJGUQ4DO775-B011KQVV3C
61,B011KSLS2Y,AAD4WMLPPV4U5,Exie,4,2016-03-12 00:00:00,1457740800,Its fit ok I wouldnt say the best but I hv sma...,2.0,Its fit ok I wouldnt say the best but I hv sma...,FOREVER YUNG Sexy Women's Lingerie Gauze Under...,Forever Yung,Pair_AAD4WMLPPV4U5-B011KSLS2Y
100,B011KY9776,A2OFEWH15SKIBS,Jaimers,3,2016-01-21 00:00:00,1453334400,"The work fine, but the handles slide off as yo...",2.0,"The work fine, but the handles slide off as yo...",LIHAO 9 Piece Set Ergonomic Crochet Hooks with...,LIHAO,Pair_A2OFEWH15SKIBS-B011KY9776
142,B011LM32UK,AGUDAON6OO40I,Jay T. Schultz,5,2016-01-19 00:00:00,1453161600,"Excellent model, great value.",3.0,"This is an exceptionally well crafted, well ma...",Human Model Craft Anatomy Skull Head Muscle Bo...,airgoesin,Pair_AGUDAON6OO40I-B011LM32UK
146,B011M2OACS,AJDPIWF0IUNGD,Amazon Customer,5,2015-10-11 00:00:00,1444521600,It takes some adjustment,23.0,"Like other reviewers, at first I was confounde...","Boye 3396300EGIKA 4-in-1 Crochet Hook Tool, Pu...",Boye,Pair_AJDPIWF0IUNGD-B011M2OACS


# Creazione degli HIT
Per generare gli HIT selezioniamo prima un campione di 15 recensioni, in cui
ciascuna recensione fa riferimento a un prodotto diverso.

In [4]:

# Draw the reviews used to build the HITs: SAMPLE_SIZE reviews, each one about
# a different product, and persist them to ./data/reviews.csv.
SAMPLE_SIZE = 15
SAMPLE_SEED = 42  # fixed seed so that re-running the cell regenerates the same file

# Shuffle all rows once, then keep the first row met for every product and take
# the first SAMPLE_SIZE of them. Unlike re-sampling inside "while True" until
# the products happen to be distinct, this always terminates.
shuffled_rows = filtered_df.sample(frac=1, random_state=SAMPLE_SEED)
selected_rows = shuffled_rows.drop_duplicates(subset='product_asin').head(SAMPLE_SIZE)

prodotti_ids = selected_rows['product_asin'].unique()
if len(prodotti_ids) < SAMPLE_SIZE:
    raise ValueError(
        f'Cannot sample {SAMPLE_SIZE} reviews of distinct products: '
        f'only {len(prodotti_ids)} distinct products are available'
    )

selected_rows.to_csv('./data/reviews.csv')
selected_rows.head()


Unnamed: 0,product_asin,reviewer_id,reviewer_name,review_score,review_time,review_time_unix,review_summary,review_upvote,review_text,product_title,product_brand,unique_identifier
501448,B00CF4O6K4,A24LKR7WEPN4J1,Lee J Johnson,1,2013-10-16 00:00:00,1381881600,NOT correct color,2.0,"I am very, VERY disappointed in this purchase....",Stretch Panne Velvet Green Fabric By The YD,The Fabric Exchange,Pair_A24LKR7WEPN4J1-B00CF4O6K4
1221516,B01BULDOKA,A2BI55OZ0CEFF4,Danielle,1,2016-07-23 00:00:00,1469232000,Not as vivid as shown online,5.0,The glue from the tape was causing the tapes t...,"me &amp; my BIG ideas Washi Tape Tube, Peony F...",Me & My Big Ideas,Pair_A2BI55OZ0CEFF4-B01BULDOKA
602930,B005S7FA28,A6618H5A4SDP9,Kathleen,3,2017-07-09 00:00:00,1499558400,Too big. Not so good for repairing bra hooks,3.0,Needed to repair bras. These are really too bi...,"Dritz Hooks &amp; Eyes Nickel Size 3,14 count",Dritz,Pair_A6618H5A4SDP9-B005S7FA28
8584,B012NTQLFI,A3ROI22EBUUOXT,stacy stamen,5,2016-04-12 00:00:00,1460419200,Five Stars,3.0,Great for bead making. Absorbs essential oils ...,Linsoir Beads White Volcano Lava Rock Beads Ro...,Linsoir beads,Pair_A3ROI22EBUUOXT-B012NTQLFI
370930,B004KYUG74,A27P9GPT6DBWMW,Alta R Hutton,5,2014-09-12 00:00:00,1410480000,Five Stars,2.0,making doing my corners much easier,Large Quick Easy Border Mitering Tool 1pc,ALDKitchen,Pair_A27P9GPT6DBWMW-B004KYUG74


In [4]:
# Build the HIT table: N_HITS rows, each listing N_POSITIONS reviewer ids.
# Constraints kept from the original design:
#   * within a column, every entry of reviewer_ids is used at most once;
#   * within a row (one HIT), no reviewer id appears twice.
N_HITS = 15
N_POSITIONS = 5
HITS_SEED = 42                # fixed seed so that re-running reproduces hits.csv
MAX_COLUMN_ATTEMPTS = 10_000  # upper bound on reshuffles tried for one column

selected_rows = pd.read_csv('./data/reviews.csv')
reviewer_ids = selected_rows['reviewer_id'].to_list()
if len(reviewer_ids) < N_HITS:
    raise ValueError(
        f'Need at least {N_HITS} reviews to fill every column, '
        f'found {len(reviewer_ids)}'
    )

rng = np.random.default_rng(HITS_SEED)
# used_per_hit[i] holds the reviewer ids already placed in row i.
used_per_hit = [set() for _ in range(N_HITS)]
position_columns = {}

for column in range(1, N_POSITIONS + 1):
    columnName = f'Posizione_{column}'
    # Draw the whole column at once as a random ordering of the reviews and
    # accept it only if it repeats no reviewer inside any row. Filling the
    # table cell by cell can reach a state where every id still free in the
    # column is already in the current row, and then loops forever; here the
    # number of attempts is bounded and failure is reported explicitly.
    for _ in range(MAX_COLUMN_ATTEMPTS):
        order = rng.permutation(len(reviewer_ids))[:N_HITS]
        candidate = [reviewer_ids[i] for i in order]
        if all(review not in used for review, used in zip(candidate, used_per_hit)):
            break
    else:
        raise RuntimeError(
            f'Could not fill {columnName} without repeating a reviewer '
            f'inside a HIT after {MAX_COLUMN_ATTEMPTS} attempts'
        )
    for review, used in zip(candidate, used_per_hit):
        used.add(review)
    position_columns[columnName] = candidate

# Assemble the frame in one go instead of concatenating one row at a time.
hits_table = pd.DataFrame({
    'HIT_ID': [f'HIT_{hit_num + 1}' for hit_num in range(N_HITS)],
    **position_columns,
})

hits_table.to_csv('./data/hits.csv')
hits_table.head(15)

Unnamed: 0,HIT_ID,Posizione_1,Posizione_2,Posizione_3,Posizione_4,Posizione_5
0,HIT_1,A6618H5A4SDP9,A27P9GPT6DBWMW,A24LKR7WEPN4J1,A28NTYA668X1R9,A3ROI22EBUUOXT
1,HIT_2,A3NAS4ZUEOAN12,A31EF21IMGXE7F,A27P9GPT6DBWMW,A6618H5A4SDP9,A4AQO1BZ7UN6X
2,HIT_3,A2BI55OZ0CEFF4,A32ID9TYXNSKRT,A3NAS4ZUEOAN12,A2OT67PK4ZYZ2F,A6618H5A4SDP9
3,HIT_4,A32ID9TYXNSKRT,A11WO3N0592D4C,A28NTYA668X1R9,A4AQO1BZ7UN6X,A2KNIUFBUYJJLX
4,HIT_5,A2OT67PK4ZYZ2F,A3NAS4ZUEOAN12,A6618H5A4SDP9,A3ROI22EBUUOXT,A28NTYA668X1R9
5,HIT_6,A3QOYOFJ6KOGLA,A2BI55OZ0CEFF4,A11WO3N0592D4C,AVHWXKJ7LESH3,A31EF21IMGXE7F
6,HIT_7,A28NTYA668X1R9,A24LKR7WEPN4J1,A32ID9TYXNSKRT,A2BI55OZ0CEFF4,A3NAS4ZUEOAN12
7,HIT_8,A3ROI22EBUUOXT,A3QOYOFJ6KOGLA,A2KNIUFBUYJJLX,A31EF21IMGXE7F,A32ID9TYXNSKRT
8,HIT_9,A11WO3N0592D4C,A28NTYA668X1R9,A4AQO1BZ7UN6X,A32ID9TYXNSKRT,A2BI55OZ0CEFF4
9,HIT_10,AVHWXKJ7LESH3,A2OT67PK4ZYZ2F,A31EF21IMGXE7F,A11WO3N0592D4C,A24LKR7WEPN4J1


## Creazione del JSON per gli HIT

In [7]:
# Turn the HIT table into the list of HIT dictionaries written to hits.json.
# Each HIT carries its id, an input and an output token, and one document per
# Posizione_* column with the product and review fields of that reviewer.
N_POSITIONS = 5

selected_rows = pd.read_csv('./data/reviews.csv')
selected_rows['review_time'] = pd.to_datetime(selected_rows['review_time'])
hits_table = pd.read_csv('./data/hits.csv')

# Index the reviews by reviewer id once instead of scanning the frame for every
# document. Only the first row of each reviewer is kept, which is the row the
# previous per-document ".iloc[0]" lookup selected.
reviews_by_reviewer = (
    selected_rows
    .drop_duplicates(subset='reviewer_id')
    .set_index('reviewer_id')
)
document_fields = ('product_title', 'product_brand', 'review_text', 'review_time')

hits = []
for index, row in hits_table.iterrows():
    documents = []
    for column in range(1, N_POSITIONS + 1):
        reviewer_id = row[f'Posizione_{column}']
        if reviewer_id not in reviews_by_reviewer.index:
            raise KeyError(
                f'Reviewer {reviewer_id!r} of {row["HIT_ID"]} is not in ./data/reviews.csv'
            )
        review = reviews_by_reviewer.loc[reviewer_id]
        doc = {'id': reviewer_id}
        for field in document_fields:
            value = review[field]
            # A missing value would be serialised by json.dump as a bare NaN,
            # which is not valid JSON; write null instead.
            doc[field] = None if pd.isna(value) else str(value)
        documents.append(doc)
    hits.append({
        'unit_id': row['HIT_ID'],
        'token_input': f'HITINPUT_{index}',
        'token_output': f'HITOUTPUT_{index}',
        'documents_number': len(documents),
        'documents': documents,
    })
file_path = "hits.json"

# Write the list of HITs to a JSON file
with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(hits, json_file, indent=4)

    



    
