In [None]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import json
import re
from genresLists import TVList, movieList


In [None]:
# Connect to the Elasticsearch node expected at localhost:9200 (plain HTTP, no
# credentials are passed). Every later cell talks to the cluster through `client`.
client = Elasticsearch('http://localhost:9200')


In [None]:
# Sentence-embedding model used later to vectorise each document's overview.
# NOTE(review): the index mapping declares `embedding_vector` with dims=768, so
# this model's output size must match that value — re-check if the model changes.
model = SentenceTransformer('all-MPNet-base-v2')


In [None]:
# Connectivity check: ask the server for its basic info before doing any work.
client.info()


In [None]:
# Explicit field mapping for the 'movies' index.
#   - title / overview            -> analysed full-text fields
#   - poster_path / backdrop_path -> exact-value keywords
#   - type                        -> exact-value keyword
#   - rate                        -> float, date -> date
#   - embedding_vector            -> dense vector with 768 dimensions
# Fields sent at index time but not listed here are left to dynamic mapping.
mapping = dict(
    properties=dict(
        title=dict(type="text"),
        overview=dict(type="text"),
        poster_path=dict(type="keyword"),
        backdrop_path=dict(type="keyword"),
        rate=dict(type="float"),
        date=dict(type="date"),
        type=dict(type="keyword"),
        embedding_vector=dict(type="dense_vector", dims=768),
    )
)


In [None]:
# Rebuild the 'movies' index from scratch so it is created with `mapping`.
# WARNING: destructive — the existing 'movies' index (and every document in it)
# is deleted each time this cell runs.
# ignore_unavailable=True is presumably what lets the delete pass on a first run,
# when the index does not exist yet — confirm against the client documentation.
client.indices.delete(index='movies', ignore_unavailable=True)
client.indices.create(
    index='movies',
    mappings=mapping,
)


In [None]:
# Input JSON files; the paths are relative, so they resolve against the
# notebook's current working directory.
movies_file_json_path = '../data/movies.data.json'
tv_file_json_path = '../data/TV.data.json'


In [None]:
# Load both source files into one flat list of raw documents.
# The movies file keeps its records under the 'movies' key, the TV file under 'tv'.
documents = []

# The encoding is pinned to UTF-8: without it open() falls back to the platform's
# default encoding, which can fail on (or garble) non-ASCII text on some systems.
with open(movies_file_json_path, 'r', encoding='utf-8') as file:
    movies = json.load(file)
documents.extend(movies['movies'])

with open(tv_file_json_path, 'r', encoding='utf-8') as file:
    series = json.load(file)
documents.extend(series['tv'])

In [None]:
# Preprocessing: drop exact duplicate documents, keeping the first occurrence of
# each one and preserving the original order.
def _freeze(value):
    """Return a hashable stand-in for a JSON-like value.

    Dicts become frozensets of (key, frozen value) pairs and lists become tuples,
    so two frozen values compare equal exactly when the original values do.
    This lets duplicates be detected with a set lookup instead of scanning a list.
    """
    if isinstance(value, dict):
        return frozenset((key, _freeze(item)) for key, item in value.items())
    if isinstance(value, list):
        return tuple(_freeze(item) for item in value)
    return value


unique_documents = []
seen_fingerprints = set()
delete_count = 0

for doc in documents:
    fingerprint = _freeze(doc)
    if fingerprint in seen_fingerprints:
        delete_count += 1
    else:
        seen_fingerprints.add(fingerprint)
        unique_documents.append(doc)

print("Count of deleted items:", delete_count)
# Report the size only: printing the whole list floods the notebook output.
print("Unique documents:", len(unique_documents))
documents = unique_documents


In [None]:
def handleGenres(type: str, genres_ids: list[int]) -> list[str]:
    """Translate genre ids into genre names for a movie or a TV show.

    Args:
        type: 'movie' looks the ids up in ``movieList``; 'tv' looks them up in
            ``TVList``. Any other value matches nothing. (The name shadows the
            builtin but is kept so keyword callers keep working.)
        genres_ids: Genre ids to resolve. ``None`` or an empty list yields no
            genres instead of raising.

    Returns:
        The 'name' of every catalogue entry whose 'id' is in ``genres_ids``, in
        catalogue order. An empty list when nothing matches or ``type`` is unknown.
    """
    # Nothing to look up; also guards the membership test against None.
    if not genres_ids:
        return []

    if type == 'movie':
        catalogue = movieList
    elif type == 'tv':
        catalogue = TVList
    else:
        return []

    return [entry['name'] for entry in catalogue if entry['id'] in genres_ids]


In [None]:

# Index every document into 'movies', one request per document.
# `index` counts the documents indexed successfully (name kept from the original
# cell); `position` is the document's place in `documents` and is what the error
# message reports, so a failure can be traced back to its source record.
index = 0

for position, el in enumerate(documents):
    try:
        # Treat a missing or null title/overview as empty text so re.sub never
        # receives None (which used to make the whole document be skipped).
        title = el.get('title') or ''
        raw_overview = el.get('overview')
        # Read the type the same tolerant way for both the genre lookup and the
        # stored field, instead of raising KeyError for one of them.
        doc_type = el.get('type', '')

        doc = {
            # Strip double quotes, commas and backslashes from the text fields.
            "title": re.sub(r'[",\\]', '', title),
            "overview": re.sub(r'[",\\]', '', raw_overview or ''),
            "genres": handleGenres(doc_type, el.get('genres') or []),
            "poster_path": el.get("poster_path", ''),
            "backdrop_path": el.get("backdrop_path", ''),
            "rate": el.get('rate', 0),
            "type": doc_type,
            # The mapping types `date` as a date; an empty string cannot be parsed
            # as one and gets the document rejected, so send null when unknown.
            "date": el.get('date') or None,
            # The embedding is computed from the raw overview; a single space
            # stands in when there is no overview at all.
            "embedding_vector": model.encode(' ' if raw_overview is None else raw_overview),
            "tmdb_id": el.get('id'),
        }

        client.index(
            index='movies',
            document=doc,
        )
        index += 1
    except Exception as e:
        # Best effort: report the failing document and carry on with the rest.
        print(f"An error occurred while adding document number {position}: {e}")


In [None]:
# Refresh first: recently indexed documents only become visible to search and
# count after a refresh, so counting straight after the indexing loop could
# otherwise under-report.
client.indices.refresh(index='movies')
client.count(index='movies')

In [None]:
# Number of source documents held in `documents`; compare it with the count
# reported for the 'movies' index to spot documents that failed to index.
len(documents)