In [1]:
import csv
import json
import uuid
import os

raw_reviews_file = "Data/hotel_reviews_1000.csv"
transformed_dir = "Data/transformed"

raw_reviews = open(raw_reviews_file, "r").readlines()

if not os.path.exists(transformed_dir):
    os.makedirs(transformed_dir)

def process_reviews(file_path):
    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        # Read the first line to get the header
        header = next(csv.reader(csvfile))
        
        # Create a mapping of expected column names to actual column names
        column_mapping = {
            'dateAdded': 'dateAdded',
            'city': 'city',
            'hotel_name': 'name',
            'hotel_state': 'province',
            'review_text': 'reviews.text',
            'review_title': 'reviews.title'
        }
        
        # Find the index of each required column
        column_indices = {}
        for expected_name, actual_name in column_mapping.items():
            try:
                column_indices[expected_name] = header.index(actual_name)
            except ValueError:
                print(f"Warning: Column '{actual_name}' not found in the CSV. Some data may be missing.")
        
        # Reset file pointer to the beginning
        csvfile.seek(0)
        
        # Skip the header row
        next(csvfile)
        
        # Use csv.reader instead of DictReader
        reader = csv.reader(csvfile)
        
        for i, row in enumerate(reader, start=1):
            review_json = {}
            for key, index in column_indices.items():
                if index < len(row):
                    review_json[key] = row[index]
                else:
                    review_json[key] = ""  # or None, depending on your preference
            
            # Generate a unique identifier
            review_json['id'] = str(uuid.uuid4())
            
            # print(json.dumps(review_json, indent=2))
            print(f"processed record [{i}] with id [{review_json['id']}]")

            with open(f"{transformed_dir}/review_{i}.json", "w+") as f:
                json.dump(review_json, f, indent=2)
            
process_reviews(raw_reviews_file)

processed record [1] with id [73c4ebbd-532f-43b8-bed1-302e24913f4e]
processed record [2] with id [1d5e2a43-86ee-43b1-8866-2216936293bd]
processed record [3] with id [b49e41fb-9bbd-4f9e-aca0-3faaccf087ff]
processed record [4] with id [f42dae17-6a25-4cba-a8eb-7b1450ec5595]
processed record [5] with id [ad682dd7-0f1a-4034-9b47-44862fb97062]
processed record [6] with id [be455b1e-3801-42d5-aaeb-2928e11f05ac]
processed record [7] with id [a9bcbbf2-8b86-4a95-b02a-7aba575ce731]
processed record [8] with id [a69efc02-39aa-48a8-82c4-6c325eaaeca7]
processed record [9] with id [f6084a2b-2ea6-4a4b-b84d-6ab9ab82f3ae]
processed record [10] with id [eec35fb5-ebfa-4264-be9c-6560f415aa44]
processed record [11] with id [a97f16b7-67c7-4a4c-a094-af13ee5607d9]
processed record [12] with id [f701f4eb-0b7b-47c6-9555-e655c893d6c7]
processed record [13] with id [3d18ba21-7549-41f3-8bd9-ffaae9d556ac]
processed record [14] with id [d6e5273b-ee5c-4e26-a19f-bb0a1455a624]
processed record [15] with id [11c5e4a1-4fc

In [7]:
## Step 2: Create Embeddings for each of the JSON Files

In [2]:
from dotenv import load_dotenv
load_dotenv("myenv/.env")

True

In [6]:
from dotenv import load_dotenv
from openai import OpenAI
client = OpenAI()

response = client.embeddings.create(
    input="Hello world",
    model="text-embedding-3-small"
)

print(len(response.data[0].embedding))
print(response.data[0].embedding)

1536
[-0.002119066, -0.04909009, 0.021010067, 0.031322982, -0.0453092, -0.026404751, -0.028971454, 0.060248327, -0.025713125, -0.014808485, 0.015446318, -0.030078055, -0.020395288, -0.03341323, 0.025820712, 0.014255185, -0.07002331, 0.01236474, 0.014816171, 0.04884418, 0.020764155, -0.008829761, -0.015069767, -0.016614398, 0.025959037, -0.002841431, -0.024391351, 0.024283765, 0.0018039917, -0.055729702, 0.023069577, -0.045462895, -0.008714491, 0.0031622688, 0.004526309, 0.0017943857, 0.026696771, 0.010159221, -0.0119958725, -0.011519419, -0.014885333, -0.023146424, 0.025390366, 0.036825255, -0.035503477, 0.021286719, -0.06307631, 0.040390972, 0.0535165, 0.061508626, -0.033689883, -0.0066818777, 0.025451845, 0.10967655, -0.0047030584, -0.03953028, 0.0070968536, 0.051518466, -0.026327904, 0.02789559, 0.030400814, 0.020564353, 0.017259916, 0.012403163, 0.0010691389, 0.0070968536, -0.037071165, 0.02353066, -0.010674098, 0.040821318, -0.0020844846, 0.03136909, -0.04272713, -0.006758725, -0.

In [8]:
import os
import json

transformed_dir = "Data/transformed"
embedded_dir = "Data/embedded"

if not os.path.exists(embedded_dir):
    os.makedirs(embedded_dir)
    
def prepare_embedding_str(review_json):
    return f"REVIEW_TITLE: {review_json['review_title']} REVIEW_TEXT: {review_json['review_text']} HOTEL_NAME: {review_json['hotel_name']} HOTEL_CITY: {review_json['city']} HOTEL_STATE: {review_json['hotel_state']}"
    
client = OpenAI()
for file in os.listdir(transformed_dir):
    with open(f"{transformed_dir}/{file}", "r") as f:
        review = json.load(f)
        
        ## start here
        embedding_str = prepare_embedding_str(review)
        response = client.embeddings.create(
            input=embedding_str,
            model="text-embedding-3-small"
        )
        
        review['embedding'] = response.data[0].embedding
        
        with open(f"{embedded_dir}/{file}", "w") as f:
            json.dump(review, f, indent=2)

In [None]:
#Now we are ready to crate a Search Index in Azure AI Search