In [None]:
!pip install redis
!pip install python-dotenv
!pip install slugify
!pip install pydantic
!pip install tqdm

In [35]:
%env REDIS_HOST=127.0.0.1
%env REDIS_PORT=6379
%env REDIS_PASSWORD=

env: REDIS_HOST=127.0.0.1
env: REDIS_PORT=6379
env: REDIS_PASSWORD=


In [49]:
import os

from dotenv import load_dotenv
from redis import Redis
from redis.commands.search.field import NumericField, TextField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.suggestion import Suggestion


In [37]:
load_dotenv()

redis_client = Redis(
    host=os.environ["REDIS_HOST"],
    port=int(os.environ["REDIS_PORT"]),
    password=os.environ.get("REDIS_PASSWORD"),
)

os.environ["REDIS_HOST"]

'127.0.0.1'

# Initialize the index

In [51]:
schema = (
    TextField("$.title", as_name="title", weight=10.0),
    TextField("$.description", as_name="description"),
    NumericField("$.date", as_name="date"),
    TextField("$.category", as_name="category"),
)

redis_client.ft("articles").create_index(
    schema, definition=IndexDefinition(prefix=["articles:"], index_type=IndexType.JSON)
)

b'OK'

In [48]:
redis_client.ft("articles").dropindex()

b'OK'

# Import Data

In [40]:
import datetime
import json
import os
from typing import Union
from redis.commands.json.path import Path
from pydantic import BaseModel
from slugify import slugify
from tqdm import tqdm

In [41]:

class Article(BaseModel):
    id: str
    title: str
    url: str
    category: str
    description: str
    date: Union[str, datetime.datetime, int]
    authors: str

    class Config:
        json_encoders = {datetime.datetime: lambda v: int(v.timestamp())}

DATA_PATH = "../data/News_Category_Dataset_v3.json"


In [52]:
with tqdm(total=os.path.getsize(DATA_PATH)) as pbar:
    with open(DATA_PATH, "r") as f:
        for line in f:
            pbar.update(len(line))
            data = json.loads(line)
            id = slugify(data["title"])
            try:
                data["id"] = id
                data["date"] = datetime.datetime.strptime(data["date"], "%Y-%m-%d")
                article = Article(**data)
                article_dict = json.loads(article.json())
                redis_client.json().set(f"articles:{article.id}", Path.root_path(), article_dict)
                # Add autocomplete suggestion
                result = redis_client.ft("articles").sugadd(
                    "articles", Suggestion(string=article.title, score=article_dict["date"])
                )
            except Exception as e:
                continue

  0%|          | 0/7830349 [00:00<?, ?it/s]/var/folders/7c/tthb1grn2ynbpnrs4ht8jqnr0000gn/T/ipykernel_29579/2725058047.py:11: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.3/migration/
  article_dict = json.loads(article.json())
  1%|          | 40739/7830349 [00:00<00:19, 406894.66it/s]  1%|          | 81769/7830349 [00:00<00:19, 398844.14it/s]  2%|▏         | 124169/7830349 [00:00<00:18, 410124.26it/s]  2%|▏         | 182014/7830349 [00:00<00:16, 475758.42it/s]  3%|▎         | 239391/7830349 [00:00<00:14, 510011.66it/s]  4%|▍         | 297743/7830349 [00:00<00:14, 533939.33it/s]  5%|▍         | 354844/7830349 [00:00<00:13, 544421.09it/s]  5%|▌         | 411032/7830349 [00:00<00:13, 549678.89it/s]  6%|▌         | 469106/7830349 [00:00<00:13, 558617.95it/s]  7%|▋         | 527059/7830349 [00:01<00:12, 564271.45it/s] 