From 2d0b3d9123fd503e21faf96f374d444ab7fb5eeb Mon Sep 17 00:00:00 2001
From: Benjamin Piouffle
Date: Wed, 26 Jun 2024 16:21:49 +0200
Subject: [PATCH] feat: Use LLM to extract statements

---
 apps/cf/lib/llms/statements_creator.ex | 215 ++++++++++++++++++
 .../statements_extractor_system_prompt.eex | 52 +++++
 .../statements_extractor_user_prompt.eex | 11 +
 apps/cf/mix.exs | 2 +
 apps/cf_jobs/config/config.exs | 4 +-
 mix.lock | 1 +
 6 files changed, 283 insertions(+), 2 deletions(-)
 create mode 100644 apps/cf/lib/llms/statements_creator.ex
 create mode 100644 apps/cf/lib/llms/templates/statements_extractor_system_prompt.eex
 create mode 100644 apps/cf/lib/llms/templates/statements_extractor_user_prompt.eex

diff --git a/apps/cf/lib/llms/statements_creator.ex b/apps/cf/lib/llms/statements_creator.ex
new file mode 100644
index 00000000..74f26987
--- /dev/null
+++ b/apps/cf/lib/llms/statements_creator.ex
@@ -0,0 +1,215 @@
+defmodule CF.LLMs.StatementsCreator do
+  @moduledoc """
+  Functions to create statements from a video that has captions using LLMs.
+  """
+
+  import Ecto.Query
+  require EEx
+  require Logger
+
+  @max_caption_length 1000
+
+  # NOTE(review): only @model_lama_3_large is currently used; the two other
+  # model attributes are kept for future selection but will trigger
+  # "module attribute was set but never used" warnings until referenced.
+  @model_lama_3_small %{
+    name: "llama-3-sonar-small-32k-chat",
+    parameter_count: "8B",
+    context_length: 32768
+  }
+
+  @model_lama_3_large %{
+    name: "llama-3-sonar-large-32k-chat",
+    parameter_count: "70B",
+    context_length: 32768
+  }
+
+  @model_mistral_7b %{
+    name: "mistral-7b-instruct",
+    parameter_count: "8x7B",
+    context_length: 16384
+  }
+
+  # Load prompt messages templates at compile time
+  EEx.function_from_file(
+    :defp,
+    :generate_system_prompt,
+    Path.join(__DIR__, "templates/statements_extractor_system_prompt.eex")
+  )
+
+  EEx.function_from_file(
+    :defp,
+    :generate_user_prompt,
+    Path.join(__DIR__, "templates/statements_extractor_user_prompt.eex"),
+    [
+      :video,
+      :captions
+    ]
+  )
+
+  @doc """
+  Create statements from a video that has captions using LLMs.
+
+  Raises if the video does not exist or has no captions.
+  """
+  def process_video!(video_id) do
+    DB.Schema.Video
+    |> join(:inner, [v], vc in DB.Schema.VideoCaption, on: v.id == vc.video_id)
+    |> where([v, vc], v.id == ^video_id)
+    |> order_by([v, vc], desc: vc.inserted_at)
+    |> limit(1)
+    |> select([v, vc], {v, vc})
+    |> DB.Repo.one()
+    |> case do
+      nil ->
+        raise "Video or captions not found"
+
+      {video, video_caption} ->
+        video_caption.parsed
+        |> chunk_captions()
+        # Side effects only — `Enum.each` instead of `Enum.map` (the mapped
+        # values were discarded).
+        |> Enum.each(fn captions ->
+          video
+          |> get_llm_suggested_statements(captions)
+          |> filter_known_statements(video)
+          |> create_statements_from_inputs(video)
+          |> broadcast_statements(video)
+
+          # Basic rate limiting between LLM calls
+          Process.sleep(500)
+        end)
+    end
+  end
+
+  # Chunk captions every time we reach the max caption length.
+  # (Plain comment: `@doc` on a private function is ignored by the compiler
+  # and emits a warning.)
+  defp chunk_captions(captions) do
+    # TODO: Base on strings lengths + @max_caption_length
+    Enum.chunk_every(captions, 50)
+  end
+
+  # Calls the LLM API and returns the validated list of statement inputs.
+  # Retries up to `retries` times on any error, then reraises.
+  # The previous default of `retries \\ 0` made the retry branch unreachable
+  # for the only call site; default to 3 so the retry logic actually runs.
+  defp get_llm_suggested_statements(video, captions, retries \\ 3) do
+    try do
+      headers = [
+        {"Authorization", "Bearer #{Application.get_env(:openai, :api_key)}"},
+        {"Content-Type", "application/json"},
+        {"Accept", "application/json"}
+      ]
+
+      system_prompt = generate_system_prompt()
+      user_prompt = generate_user_prompt(video, captions)
+
+      body =
+        %{
+          "model" => @model_lama_3_large[:name],
+          # NOTE(review): this subtracts *character* counts from a *token*
+          # budget, which over-estimates the prompt size — confirm intended.
+          "max_tokens" =>
+            @model_lama_3_large[:context_length] -
+              String.length(system_prompt) - String.length(user_prompt) - 500,
+          "stream" => false,
+          "messages" => [
+            %{
+              "role" => "system",
+              "content" => system_prompt
+            },
+            %{
+              "role" => "user",
+              "content" => user_prompt
+            }
+          ]
+        }
+        |> Jason.encode!()
+
+      case HTTPoison.post("https://api.perplexity.ai/chat/completions", body, headers,
+             timeout: 30_000,
+             recv_timeout: 30_000
+           ) do
+        {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
+          body
+          |> Jason.decode!()
+          |> Map.get("choices")
+          |> List.first()
+          |> get_in(["message", "content"])
+          |> get_json_str_from_content!()
+          |> Jason.decode!()
+          |> Map.get("statements")
+          |> check_statements_input_format!()
+
+        {:ok, %HTTPoison.Response{status_code: status_code, body: body}} ->
+          raise "Network error: #{status_code} - #{inspect(body)}"
+
+        {:error, %HTTPoison.Error{reason: reason}} ->
+          raise inspect(reason)
+      end
+    rescue
+      error ->
+        if retries > 0 do
+          # Logger.warn/1 is deprecated in favor of Logger.warning/1
+          Logger.warning("Failed to get LLM suggested statements: #{inspect(error)}. Retrying...")
+          Process.sleep(1000)
+          get_llm_suggested_statements(video, captions, retries - 1)
+        else
+          Logger.error(inspect(error))
+          reraise error, __STACKTRACE__
+        end
+    end
+  end
+
+  # Validates the LLM output: every entry must have a binary "text" and an
+  # integer "time". Raises on the first invalid entry. The previous `for`
+  # with a map pattern silently *skipped* entries missing those keys instead
+  # of rejecting them.
+  defp check_statements_input_format!(statements_inputs) do
+    Enum.each(statements_inputs, fn
+      %{"text" => text, "time" => time} when is_binary(text) and is_integer(time) ->
+        :ok
+
+      invalid ->
+        raise "Invalid statement input format: #{inspect(invalid)}"
+    end)
+
+    statements_inputs
+  end
+
+  # Remove statements when we already have a similar one at time/text
+  # (±5 seconds and Jaro distance > 0.80).
+  defp filter_known_statements(statements, video) do
+    existing_statements =
+      DB.Schema.Statement
+      |> where([s], s.video_id == ^video.id)
+      |> DB.Repo.all()
+
+    Enum.reject(statements, fn %{"text" => text, "time" => time} ->
+      Enum.any?(existing_statements, fn s ->
+        s.time >= time - 5 and s.time <= time + 5 and String.jaro_distance(s.text, text) > 0.80
+      end)
+    end)
+  end
+
+  # Inserts the statements and returns the inserted records.
+  defp create_statements_from_inputs(statements_inputs, video) do
+    # TODO: Check if the statement doesn't already exist
+    # TODO: Record a reference to the caption that generated the statement
+    inserted_at = NaiveDateTime.utc_now() |> NaiveDateTime.truncate(:second)
+
+    # The insert count was previously bound as `nb_statements` but never used.
+    {_count, statements} =
+      DB.Repo.insert_all(
+        DB.Schema.Statement,
+        Enum.map(statements_inputs, fn %{"text" => text, "time" => time} ->
+          %{
+            video_id: video.id,
+            text: text,
+            time: time,
+            inserted_at: inserted_at,
+            updated_at: inserted_at
+          }
+        end),
+        returning: true
+      )
+
+    statements
+  end
+
+  # Pushes each new statement to connected clients on the video's channel.
+  # Side effects only — `Enum.each` instead of `Enum.map`.
+  defp broadcast_statements(statements, video) do
+    Enum.each(statements, fn statement ->
+      CF.RestApi.Endpoint.broadcast(
+        "statements:video:#{DB.Type.VideoHashId.encode(video.id)}",
+        "statement_added",
+        CF.RestApi.StatementView.render("show.json", statement: statement)
+      )
+    end)
+  end
+
+  # JSON content can optionally be wrapped in a ```json ... ``` block
+  defp get_json_str_from_content!(content) do
+    case Regex.scan(~r/```json\n(.+)\n```/mis, content) do
+      [[_, json_str]] -> json_str
+      _ -> content
+    end
+  end
+end
diff --git a/apps/cf/lib/llms/templates/statements_extractor_system_prompt.eex b/apps/cf/lib/llms/templates/statements_extractor_system_prompt.eex
new file mode 100644
index 00000000..42e3be83
--- /dev/null
+++ b/apps/cf/lib/llms/templates/statements_extractor_system_prompt.eex
@@ -0,0 +1,52 @@
+# Mission
+
+Ta tâche est d'extraire des citations intéressantes à vérifier depuis les sous-titres d'une vidéo, ainsi que le timecode du 1er mot de la citation. Le texte peut contenir des fautes ou des mots mal reconnus, tu les corrigeras. Tu peux aussi résumer ou remplacer certaines parties non-essentielles par "[...]" pour raccourcir la citation.
+
+Renvoie uniquement le résultat en JSON, **sans aucun commentaire ni conclusion**.
+
+# Comment choisir les extraits à vérifier
+
+Pour être pertinente, une citation doit :
+- être vérifiable grâce à l'exposition de faits
+- faire mention d'une source ou d'un contenu que l'on peut vérifier
+Et remplir au moins un des critères suivants :
+- présenter des éléments incomplets ou approximatifs
+- présenter un argument fallacieux, trompeur ou mensonger
+- présenter des informations intéressantes à vérifier
+
+Ne méritent pas travail de vérification :
+- les évidences comme "Le ciel est bleu !"
+- les figures de style et l'humour (comme les hyperboles, les métaphores, etc)
+- les erreurs mineures
+- les opinions personnelles ("j'aime ça")
+
+# Exemple
+
+## Input
+
+```json
+{
+  "video": {
+    "title": "Thinkerview - La diagonale du vide en France"
+  },
+  "captions": [
+    { "start": 10, "text": "Cette mesure sociale a été un désastre de la pensée ça ne m'évoque que du dégoût elle n'a fait que créer une augmentation du chômage, c'est pour moi une pure folie" },
+    { "start": 85, "text": "il y a d'autres zones en France qui sont très peuplées elle s'affiche ici et juste là et oui je sais effectivement je pense que je peux tenter une" },
+    { "start": 89, "text": "reconversion à devenir présentateur météo" },
+    { "start": 94, "text": "dans les zones que vous voyez ici on compte seulement 6,5% de la population française métropolitaine pourtant et bien ces espaces" },
+    { "start": 102, "text": "représentent 42% du territoire national mais alors pourquoi la diagonale du vide comme" }
+  ]
+}
+```
+
+## Output
+
+```json
+{
+  "statements": [
+    { "time": 10, "text": "Cette mesure sociale [...] n'a fait que créer une augmentation du chômage" },
+    { "time": 94, "text": "ici on compte seulement 6,5% de la population française métropolitaine" },
+    { "time": 94, "text": "ces espaces représentent 42% du territoire national" }
+  ]
+}
+```
diff --git a/apps/cf/lib/llms/templates/statements_extractor_user_prompt.eex b/apps/cf/lib/llms/templates/statements_extractor_user_prompt.eex
new file mode 100644
index 00000000..466c9848
--- /dev/null
+++ b/apps/cf/lib/llms/templates/statements_extractor_user_prompt.eex
@@ -0,0 +1,11 @@
+```json
+{
+  "video": {
+    "title": "<%= video.title %>"
+  },
+  "captions": <%= captions |> Enum.map(fn caption -> %{
+    "start": floor(caption["start"]),
+    "text": String.trim(caption["text"])
+  } end) |> Jason.encode! %>
+}
+```
diff --git a/apps/cf/mix.exs b/apps/cf/mix.exs
index 6407e5dc..ab46dfc8 100644
--- a/apps/cf/mix.exs
+++ b/apps/cf/mix.exs
@@ -58,6 +58,8 @@ defmodule CF.Mixfile do
       {:sweet_xml, "~> 0.6"},
       {:burnex, "~> 3.1"},
       {:yaml_elixir, "~> 2.9.0"},
+      {:openai, "~> 0.6.1"},
+      {:jason, "~> 1.4"},
 
       # ---- Internal ----
       {:db, in_umbrella: true},
diff --git a/apps/cf_jobs/config/config.exs b/apps/cf_jobs/config/config.exs
index 76aada30..cb522d10 100644
--- a/apps/cf_jobs/config/config.exs
+++ b/apps/cf_jobs/config/config.exs
@@ -41,8 +41,8 @@ config :cf_jobs, CF.Jobs.Scheduler,
   ],
   # Captions
   download_captions: [
-    # every 10 minutes
-    schedule: "*/10 * * * *",
+    # every minute
+    schedule: "*/1 * * * *",
     task: {CF.Jobs.DownloadCaptions, :update, []},
     overlap: false
   ]
diff --git a/mix.lock b/mix.lock
index dc02fd81..bbb4b769 100644
--- a/mix.lock
+++ b/mix.lock
@@ -78,6 +78,7 @@
   "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"},
   "not_qwerty123": {:hex, :not_qwerty123, "2.2.1", "656e940159517f2d2f07ea0bb14e4ad376d176b5f4de07115e7a64902b5e13e3", [:mix], [{:gettext, "~> 0.13", [hex: :gettext, repo: "hexpm", optional: false]}], "hexpm", "7637173b09eb7b26b29925039d5b92f7107c94a27cbe4d2ba8efb8b84d060c4b"},
   "oauth2": {:hex, :oauth2, "0.9.4", "632e8e8826a45e33ac2ea5ac66dcc019ba6bb5a0d2ba77e342d33e3b7b252c6e", [:mix], [{:hackney, "~> 1.7", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "407c6b9f60aa0d01b915e2347dc6be78adca706a37f0c530808942da3b62e7af"},
+  "openai": {:hex, :openai, "0.6.1", "ad86b5b253969fe6d59896d295b1a573cbe44d586fd00bfa8cf3f440d800b4d6", [:mix], [{:httpoison, "~> 2.0", [hex: :httpoison, repo: "hexpm", optional: false]}, {:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "aea82953ea82fcbf91d0474125943becf5d8318af53081ed722a0f26d4346353"},
   "parallel_stream": {:hex,
:parallel_stream, "1.0.6", "b967be2b23f0f6787fab7ed681b4c45a215a81481fb62b01a5b750fa8f30f76c", [:mix], [], "hexpm", "639b2e8749e11b87b9eb42f2ad325d161c170b39b288ac8d04c4f31f8f0823eb"}, "parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"}, "phoenix": {:hex, :phoenix, "1.5.14", "2d5db884be496eefa5157505ec0134e66187cb416c072272420c5509d67bf808", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.13 or ~> 3.0", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.0", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:plug, "~> 1.10", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 1.0 or ~> 2.2", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.1.2 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "207f1aa5520320cbb7940d7ff2dde2342162cf513875848f88249ea0ba02fef7"},