In [42]:
import pandas as pd
import numpy as np
import json  
import string

import random



# Preparation
1. Caricamento del file csv 
2. Assegnare i tipi alle colonne
3. Filtro e pulizia dei dati
    - Prodotti con un review_upvote > 1 
    - Prodotti con titolo non nullo
    - Prodotti con marca non nulla



In [43]:

csv_path = "./data/Reviews_With_Metadata.csv"
# review_upvote is read as text and converted explicitly below; the original
# author notes that loading fails without this two-step conversion.
dtype_specification = {'review_upvote': str}
df = pd.read_csv(csv_path, dtype=dtype_specification)

# Convert to numeric; values that cannot be parsed become NaN and are
# therefore excluded by the "> 1" comparison below.
df['review_upvote'] = pd.to_numeric(df['review_upvote'], errors='coerce')

# Keep reviews with more than one upvote whose product has both a title and a
# brand, then keep a single review per product (first occurrence of each ASIN).
has_enough_upvotes = df['review_upvote'] > 1
has_title = df['product_title'].notnull()
has_brand = df['product_brand'].notnull()

# .copy() makes filtered_df an independent frame, so adding the 'id' column
# below does not trigger pandas' SettingWithCopyWarning.
filtered_df = (
    df[has_enough_upvotes & has_title & has_brand]
    .drop_duplicates(subset='product_asin')
    .copy()
)

# Build the pair id by concatenating the reviewer id and the product ASIN.
filtered_df['id'] = "Pair_" + filtered_df['reviewer_id'] + "-" + filtered_df['product_asin']


filtered_df.head()


Unnamed: 0,product_asin,reviewer_id,reviewer_name,review_score,review_time,review_time_unix,review_summary,review_upvote,review_text,product_title,product_brand,id
28,B011KQVV3C,A3RWJGUQ4DO775,Just Me,5,2016-08-26 00:00:00,1472169600,Make a Mark!,3.0,I bought these to draw a bright washable menu ...,"Crafty Croc Liquid Chalk Markers, 8 Pack Brigh...",Crafty Croc,Pair_A3RWJGUQ4DO775-B011KQVV3C
61,B011KSLS2Y,AAD4WMLPPV4U5,Exie,4,2016-03-12 00:00:00,1457740800,Its fit ok I wouldnt say the best but I hv sma...,2.0,Its fit ok I wouldnt say the best but I hv sma...,FOREVER YUNG Sexy Women's Lingerie Gauze Under...,Forever Yung,Pair_AAD4WMLPPV4U5-B011KSLS2Y
100,B011KY9776,A2OFEWH15SKIBS,Jaimers,3,2016-01-21 00:00:00,1453334400,"The work fine, but the handles slide off as yo...",2.0,"The work fine, but the handles slide off as yo...",LIHAO 9 Piece Set Ergonomic Crochet Hooks with...,LIHAO,Pair_A2OFEWH15SKIBS-B011KY9776
142,B011LM32UK,AGUDAON6OO40I,Jay T. Schultz,5,2016-01-19 00:00:00,1453161600,"Excellent model, great value.",3.0,"This is an exceptionally well crafted, well ma...",Human Model Craft Anatomy Skull Head Muscle Bo...,airgoesin,Pair_AGUDAON6OO40I-B011LM32UK
146,B011M2OACS,AJDPIWF0IUNGD,Amazon Customer,5,2015-10-11 00:00:00,1444521600,It takes some adjustment,23.0,"Like other reviewers, at first I was confounde...","Boye 3396300EGIKA 4-in-1 Crochet Hook Tool, Pu...",Boye,Pair_AJDPIWF0IUNGD-B011M2OACS


# Creazione degli hits
Per generare gli hits selezioniamo prima un campione di 15 recensioni, in cui
ciascuna recensione fa riferimento a un prodotto diverso.


In [60]:
# Number of reviews to draw; each row of filtered_df refers to a different
# product because duplicates on product_asin were dropped earlier.
sample = 15
# Fixed seed so that re-running the notebook draws the same reviews instead of
# silently overwriting reviews.csv with a different sample.
sample_seed = 42
selected_rows = filtered_df.sample(n=sample, random_state=sample_seed)
selected_rows.to_csv('./data/reviews.csv')
selected_rows.head()


Unnamed: 0,product_asin,reviewer_id,reviewer_name,review_score,review_time,review_time_unix,review_summary,review_upvote,review_text,product_title,product_brand,id
996120,B013RPG6KM,AA94TRFOUR7HO,Courtney H. Holland,5,2016-02-14 00:00:00,1455408000,Five Stars,3.0,Perfect for my jewelry making application.,STEEL BLOCK SWAGE U-CHANNEL &amp; 6 HAMMER PUN...,NOVELTOOLS,Pair_AA94TRFOUR7HO-B013RPG6KM
2033044,B00DQBWBPM,AIGWKUECXVIAP,RKG,5,2015-01-24 00:00:00,1422057600,Five Stars,4.0,Works perfect and fits my Janome New Home MC77...,Janome AcuFeed Ditch Quilting Foot,Janome,Pair_AIGWKUECXVIAP-B00DQBWBPM
761972,B001DIQ5SE,A1HY907Q1S3PQ2,Amazon Customer,5,2013-05-09 00:00:00,1368057600,Excellent pen for glitter,3.0,It dries quickly and easy to add a small about...,"Tombow 62175 Glue Pen (1 Piece), Multicolor"" />",Tombow,Pair_A1HY907Q1S3PQ2-B001DIQ5SE
705652,B001Q1DXBU,A31MF2SR479RFL,charles adams,3,2010-04-04 00:00:00,1270339200,Has weak spots.,2.0,It is fine for light duty work. When you go t...,Genuine Split Suede Leather Lace Cord 3mm Whit...,UnCommon Artistry,Pair_A31MF2SR479RFL-B001Q1DXBU
1967818,B00LBLVE5M,A3J58X1Z86ABG0,Kathy Broussard,5,2015-02-09 00:00:00,1423440000,but it's going to look great!! Perfect,2.0,"Haven't painted it yet, but it's going to look...",Unfinished Wood 3 Letter Vine Monogram 17.5'' ...,Laser Lizard,Pair_A3J58X1Z86ABG0-B00LBLVE5M


In [62]:
# Work from the sample stored on disk rather than from the in-memory frame.
reviews_csv = './data/reviews.csv'
selected_rows = pd.read_csv(reviews_csv)
reviewer_ids = selected_rows['reviewer_id'].tolist()
# Empty frame that the HIT rows are appended to further down in this cell.
hits_table = pd.DataFrame()

def rotate(l, n):
    """Return a copy of ``l`` rotated left by ``n`` positions.

    Args:
        l: A sliceable sequence that supports ``+`` (e.g. a list).
        n: Number of positions to rotate by. Negative values rotate to the
            right; values larger than ``len(l)`` wrap around.

    Returns:
        A new sequence with the first ``n`` elements moved to the end.
    """
    if l:
        # Normalise the offset so that |n| > len(l) still rotates instead of
        # returning the sequence unchanged (plain slicing clamps the bounds).
        n %= len(l)
    return l[n:] + l[:n]


def get_shift_matrix(n,m):
    """Build an ``n`` x ``m`` matrix of cyclically shifted item indices.

    The indices ``0 .. n-1`` are shuffled with the ``random`` module; column
    ``j`` of the result is that shuffled order rotated left by ``j``
    positions. Consequently every index appears exactly once per column, and
    each row holds ``m`` distinct indices (guaranteed because ``m <= n``).

    Args:
        n: Number of items, i.e. number of rows.
        m: Number of positions per row, i.e. number of columns.

    Returns:
        A float ``numpy`` array of shape ``(n, m)`` containing the indices.

    Raises:
        ValueError: If ``m`` is greater than ``n``.
    """
    if m > n:
        raise ValueError('m must not be greater than n')

    order = list(range(n))
    random.shuffle(order)

    shift_matrix = np.zeros((n, m))
    for shift in range(m):
        # A negative shift makes np.roll rotate to the left.
        shift_matrix[:, shift] = np.roll(order, -shift)
    return shift_matrix

matrix_data= get_shift_matrix(sample,5)

# One record per HIT: each matrix row lists, position by position, the row
# number (within selected_rows) of the review shown in that slot.
hits = []
for hit_number, row in enumerate(matrix_data, start=1):
    hit_row = {'HIT_ID': f'HIT_{hit_number}'}
    for position, index in enumerate(row, start=1):
        # The matrix holds floats, so cast before using the value positionally.
        hit_row[f'Posizione_{position}'] = selected_rows.iloc[int(index)]['id']
    hits.append(hit_row)

# Build the frame once from all records instead of concatenating row by row.
hits_table = pd.DataFrame(hits)

hits_table.to_csv('./data/hits.csv')
hits_table.head(sample)

Unnamed: 0,HIT_ID,Posizione_1,Posizione_2,Posizione_3,Posizione_4,Posizione_5
0,HIT_1,Pair_AIPFQ5F25JFV3-B004D90FY0,Pair_A1ZIWE94JP5881-B011I87GXQ,Pair_A2KJQX9VHVHY12-B01FHNZXTW,Pair_A1CKLAJTWXD9WM-B00AZ4ITWW,Pair_AA94TRFOUR7HO-B013RPG6KM
1,HIT_2,Pair_A1ZIWE94JP5881-B011I87GXQ,Pair_A2KJQX9VHVHY12-B01FHNZXTW,Pair_A1CKLAJTWXD9WM-B00AZ4ITWW,Pair_AA94TRFOUR7HO-B013RPG6KM,Pair_A20WSZ029QZTV0-B01AG0LBRE
2,HIT_3,Pair_A2KJQX9VHVHY12-B01FHNZXTW,Pair_A1CKLAJTWXD9WM-B00AZ4ITWW,Pair_AA94TRFOUR7HO-B013RPG6KM,Pair_A20WSZ029QZTV0-B01AG0LBRE,Pair_A209U5K6AW8D5G-B01G5D6B26
3,HIT_4,Pair_A1CKLAJTWXD9WM-B00AZ4ITWW,Pair_AA94TRFOUR7HO-B013RPG6KM,Pair_A20WSZ029QZTV0-B01AG0LBRE,Pair_A209U5K6AW8D5G-B01G5D6B26,Pair_ATKOKG86BXBLW-B00CH9H1QI
4,HIT_5,Pair_AA94TRFOUR7HO-B013RPG6KM,Pair_A20WSZ029QZTV0-B01AG0LBRE,Pair_A209U5K6AW8D5G-B01G5D6B26,Pair_ATKOKG86BXBLW-B00CH9H1QI,Pair_A28D4972YVR48R-B001C0CT2O
5,HIT_6,Pair_A20WSZ029QZTV0-B01AG0LBRE,Pair_A209U5K6AW8D5G-B01G5D6B26,Pair_ATKOKG86BXBLW-B00CH9H1QI,Pair_A28D4972YVR48R-B001C0CT2O,Pair_AIGWKUECXVIAP-B00DQBWBPM
6,HIT_7,Pair_A209U5K6AW8D5G-B01G5D6B26,Pair_ATKOKG86BXBLW-B00CH9H1QI,Pair_A28D4972YVR48R-B001C0CT2O,Pair_AIGWKUECXVIAP-B00DQBWBPM,Pair_A3J58X1Z86ABG0-B00LBLVE5M
7,HIT_8,Pair_ATKOKG86BXBLW-B00CH9H1QI,Pair_A28D4972YVR48R-B001C0CT2O,Pair_AIGWKUECXVIAP-B00DQBWBPM,Pair_A3J58X1Z86ABG0-B00LBLVE5M,Pair_A1HY907Q1S3PQ2-B001DIQ5SE
8,HIT_9,Pair_A28D4972YVR48R-B001C0CT2O,Pair_AIGWKUECXVIAP-B00DQBWBPM,Pair_A3J58X1Z86ABG0-B00LBLVE5M,Pair_A1HY907Q1S3PQ2-B001DIQ5SE,Pair_A2H53T0PKZASO8-B000WWKGSW
9,HIT_10,Pair_AIGWKUECXVIAP-B00DQBWBPM,Pair_A3J58X1Z86ABG0-B00LBLVE5M,Pair_A1HY907Q1S3PQ2-B001DIQ5SE,Pair_A2H53T0PKZASO8-B000WWKGSW,Pair_A31MF2SR479RFL-B001Q1DXBU


## Creazione del JSON per gli hits

In [63]:
# Reload both intermediate tables from disk; review_time is parsed to
# datetime so that it is serialised in a uniform format later on.
hits_table = pd.read_csv('./data/hits.csv')
selected_rows = pd.read_csv('./data/reviews.csv').assign(
    review_time=lambda frame: pd.to_datetime(frame['review_time'])
)

def get_token(length=11):
    """Return a random token of uppercase ASCII letters and digits.

    Args:
        length: Number of characters in the token. Defaults to 11.

    Returns:
        A string of ``length`` characters drawn with the ``random`` module.
    """
    alphabet = string.ascii_uppercase + string.digits
    return ''.join(random.choice(alphabet) for _ in range(length))

def get_and_verify_token(tokens):
    """Generate a token not yet present in ``tokens`` and register it.

    Args:
        tokens: List of tokens issued so far; the new token is appended to it.

    Returns:
        The newly generated token.
    """
    while True:
        candidate = get_token()
        # Keep drawing until the candidate does not collide with a known token.
        if candidate not in tokens:
            break
    tokens.append(candidate)
    return candidate



hits=[]
tokens=[]

# Columns of the HIT table that hold document ids, in file order. Deriving
# them here keeps the JSON in sync with the table if its width ever changes.
position_columns = [name for name in hits_table.columns if str(name).startswith('Posizione_')]

# Index the reviews by id once (first occurrence wins) so each document is a
# direct lookup instead of a scan over the whole table.
reviews_by_id = selected_rows.drop_duplicates(subset='id').set_index('id')

for index, row in hits_table.iterrows():
    hit = {
        'unit_id': row['HIT_ID'],
        # Input and output tokens share one pool so they never collide.
        'token_input': get_and_verify_token(tokens),
        'token_output': get_and_verify_token(tokens),
        'documents_number': len(position_columns),
        'documents': [],
    }
    for columnName in position_columns:
        doc_id = row[columnName]
        # Raises KeyError if the HIT table references an id that is not in the reviews.
        review = reviews_by_id.loc[doc_id]
        hit['documents'].append({
            'id': doc_id,
            'product_title': review['product_title'],
            'product_brand': review['product_brand'],
            'review_text': review['review_text'],
            'review_time': str(review['review_time']),
        })
    hits.append(hit)
file_path = "hits.json"

# Write the list of HITs to a JSON file
with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(hits, json_file, indent=4)  

    



    
