Skip to content

Commit

Permalink
feat: Use LLM to extract statements
Browse files Browse the repository at this point in the history
  • Loading branch information
Betree committed Jun 26, 2024
1 parent f8d23cb commit 2d0b3d9
Show file tree
Hide file tree
Showing 6 changed files with 283 additions and 2 deletions.
215 changes: 215 additions & 0 deletions apps/cf/lib/llms/statements_creator.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
defmodule CF.LLMs.StatementsCreator do
@moduledoc """
Functions to create statements from a video that has captions using LLMs
"""

import Ecto.Query
require EEx
require Logger

# Intended upper bound for the size of a captions chunk. Not referenced by any
# code yet: chunking is currently a fixed number of entries (see the TODO in
# chunk_captions/1).
@max_caption_length 1000

# Descriptors of the chat models that can be requested from the completions
# API. Only @model_lama_3_large is referenced by the request code at the
# moment; the others are kept as ready-to-use alternatives.
# `context_length` is used to derive the `max_tokens` value of a request.
@model_lama_3_small %{
  name: "llama-3-sonar-small-32k-chat",
  parameter_count: "8B",
  context_length: 32768
}

@model_lama_3_large %{
  name: "llama-3-sonar-large-32k-chat",
  parameter_count: "70B",
  context_length: 32768
}

# "mistral-7b-instruct" is the 7B dense model; "8x7B" is the size of the
# Mixtral mixture-of-experts model and did not match this entry's name.
@model_mistral_7b %{
  name: "mistral-7b-instruct",
  parameter_count: "7B",
  context_length: 16384
}

# Load prompt messages templates
#
# Each call below turns an EEx template file, resolved relative to this source
# file's directory, into a private function of this module.

# Defines generate_system_prompt/0: the system prompt template takes no
# arguments.
EEx.function_from_file(
:defp,
:generate_system_prompt,
Path.join(__DIR__, "templates/statements_extractor_system_prompt.eex")
)

# Defines generate_user_prompt/2: the template receives `video` and `captions`
# as its two bindings, in that order.
EEx.function_from_file(
:defp,
:generate_user_prompt,
Path.join(__DIR__, "templates/statements_extractor_user_prompt.eex"),
[
:video,
:captions
]
)

@doc """
Extracts statements from the captions of a video with the help of an LLM.

The most recently inserted `DB.Schema.VideoCaption` row of the video is
loaded, its `parsed` captions are split into chunks, and for every chunk the
LLM suggestions are fetched, filtered against already-known statements,
inserted and broadcast. The process sleeps 500 ms after each chunk
(presumably to stay below the API rate limit).

Returns a list containing one `:ok` per processed chunk.

Raises a `RuntimeError` when no video/captions pair exists for `video_id`.
"""
def process_video!(video_id) do
  latest_captions_query =
    from(v in DB.Schema.Video,
      join: vc in DB.Schema.VideoCaption,
      on: v.id == vc.video_id,
      where: v.id == ^video_id,
      order_by: [desc: vc.inserted_at],
      limit: 1,
      select: {v, vc}
    )

  case DB.Repo.one(latest_captions_query) do
    {video, video_caption} ->
      for captions <- chunk_captions(video_caption.parsed) do
        suggested = get_llm_suggested_statements(video, captions)
        unknown = filter_known_statements(suggested, video)
        created = create_statements_from_inputs(unknown, video)
        broadcast_statements(created, video)

        Process.sleep(500)
      end

    nil ->
      raise "Video or captions not found"
  end
end

# Splits the list of captions into consecutive chunks of at most 50 entries;
# the last chunk may be shorter. An empty list yields an empty list.
#
# Documented with a plain comment on purpose: `@doc` is always discarded for
# private functions and makes the compiler emit a warning.
#
# TODO: Base on strings lengths + @max_caption_length
defp chunk_captions(captions) do
  Enum.chunk_every(captions, 50)
end

# Asks the LLM to suggest statements for one chunk of captions.
#
# Renders the system and user prompts, posts them to the chat completions
# endpoint and decodes the first choice of the answer into the list stored
# under the "statements" key, validated by check_statements_input_format!/1.
#
# Parameters:
#   * video    - the video the captions belong to (passed to the user prompt)
#   * captions - the chunk of captions to analyse
#   * retries  - number of additional attempts allowed after a failure
#                (defaults to 0, i.e. fail on the first error)
#
# Any failure (HTTP error, non-200 status, malformed answer) is raised as an
# exception. While `retries` is positive the exception is logged and the call
# is retried after one second; otherwise it is logged and re-raised with its
# original stacktrace.
defp get_llm_suggested_statements(video, captions, retries \\ 0) do
  # The API key is read from the `:openai` application environment.
  headers = [
    {"Authorization", "Bearer #{Application.get_env(:openai, :api_key)}"},
    {"Content-Type", "application/json"},
    {"Accept", "application/json"}
  ]

  system_prompt = generate_system_prompt()
  user_prompt = generate_user_prompt(video, captions)

  # The room left for the answer is estimated from prompt lengths counted in
  # characters (not tokens), minus a 500 safety margin.
  max_tokens =
    @model_lama_3_large[:context_length] -
      String.length(system_prompt) - String.length(user_prompt) - 500

  body =
    Jason.encode!(%{
      "model" => @model_lama_3_large[:name],
      "max_tokens" => max_tokens,
      "stream" => false,
      "messages" => [
        %{"role" => "system", "content" => system_prompt},
        %{"role" => "user", "content" => user_prompt}
      ]
    })

  case HTTPoison.post("https://api.perplexity.ai/chat/completions", body, headers,
         timeout: 30_000,
         recv_timeout: 30_000
       ) do
    {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
      body
      |> Jason.decode!()
      |> Map.get("choices")
      |> List.first()
      |> get_in(["message", "content"])
      |> get_json_str_from_content!()
      |> Jason.decode!()
      |> Map.get("statements")
      |> check_statements_input_format!()

    {:ok, %HTTPoison.Response{status_code: status_code, body: body}} ->
      raise "Network error: #{status_code} - #{inspect(body)}"

    {:error, %HTTPoison.Error{reason: reason}} ->
      raise inspect(reason)
  end
rescue
  error ->
    if retries > 0 do
      # Logger.warning/1 replaces the deprecated Logger.warn/1.
      Logger.warning("Failed to get LLM suggested statements: #{inspect(error)}. Retrying...")
      Process.sleep(1000)
      get_llm_suggested_statements(video, captions, retries - 1)
    else
      Logger.error(inspect(error))
      reraise error, __STACKTRACE__
    end
end

# Validates the statements suggested by the LLM and returns them unchanged.
#
# The input must be a list in which every entry is a map holding a binary
# under "text" and an integer under "time". Anything else (a non-list value
# such as `nil`, an entry that is not a map, a missing key or a wrongly typed
# value) raises a RuntimeError.
#
# Entries are matched inside the callback rather than in a comprehension
# generator: a generator pattern silently skips entries that do not match,
# which would let malformed entries through.
defp check_statements_input_format!(statements_inputs) do
  valid? =
    is_list(statements_inputs) and
      Enum.all?(statements_inputs, fn
        %{"text" => text, "time" => time} -> is_binary(text) and is_integer(time)
        _ -> false
      end)

  if not valid? do
    raise "Invalid statement input format"
  end

  statements_inputs
end

# Keeps only the suggested statements that are not yet known for the video.
#
# A suggestion counts as known when a statement already stored for the video
# lies within 5 (inclusive) of its time and has a Jaro similarity with its
# text strictly above 0.80.
defp filter_known_statements(statements, video) do
  known_statements =
    DB.Repo.all(from(s in DB.Schema.Statement, where: s.video_id == ^video.id))

  Enum.filter(statements, fn %{"text" => text, "time" => time} ->
    # A suggestion is kept when every stored statement is either too far in
    # time or too different in text.
    Enum.all?(known_statements, fn known ->
      known.time < time - 5 or known.time > time + 5 or
        String.jaro_distance(known.text, text) <= 0.80
    end)
  end)
end

# Inserts one statement per validated LLM suggestion for the given video, in a
# single `insert_all`, and returns the inserted rows (`returning: true`).
#
# Every row gets the same `inserted_at` / `updated_at` timestamp, truncated to
# the second. Suggestions that duplicate existing statements are expected to
# be removed beforehand (process_video!/1 runs filter_known_statements/2 just
# before this step).
#
# TODO: Record a reference to the caption that generated the statement
defp create_statements_from_inputs(statements_inputs, video) do
  inserted_at = NaiveDateTime.utc_now() |> NaiveDateTime.truncate(:second)

  rows =
    Enum.map(statements_inputs, fn %{"text" => text, "time" => time} ->
      %{
        video_id: video.id,
        text: text,
        time: time,
        inserted_at: inserted_at,
        updated_at: inserted_at
      }
    end)

  # The inserted-row count is not needed; the underscore prefix avoids the
  # "unused variable" compiler warning.
  {_nb_statements, statements} =
    DB.Repo.insert_all(DB.Schema.Statement, rows, returning: true)

  statements
end

# Broadcasts a "statement_added" event for each statement on the video's
# statements topic, with the statement rendered through the "show.json" view.
# Returns the list of broadcast results, in the order of `statements`.
defp broadcast_statements(statements, video) do
  for statement <- statements do
    topic = "statements:video:#{DB.Type.VideoHashId.encode(video.id)}"
    payload = CF.RestApi.StatementView.render("show.json", statement: statement)

    CF.RestApi.Endpoint.broadcast(topic, "statement_added", payload)
  end
end

# Extracts the JSON payload from the LLM answer.
#
# JSON content can optionally be wrapped in a ```json ... ``` block. When such
# a block is present, the text inside the FIRST one is returned; otherwise the
# content is returned unchanged.
#
# The capture is lazy (`.+?`): a greedy capture with the dotall flag would
# run up to the last closing fence and swallow any other fenced block or
# commentary that follows the first one, producing invalid JSON.
defp get_json_str_from_content!(content) do
  case Regex.run(~r/```json\n(.+?)\n```/is, content) do
    [_, json_str] -> json_str
    _ -> content
  end
end
end
52 changes: 52 additions & 0 deletions apps/cf/lib/llms/templates/statements_extractor_system_prompt.eex
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Mission

Ta tâche est d'extraire des citations intéressantes à vérifier depuis les sous-titres d'une vidéo, ainsi que le timecode du 1er mot de la citation. Le texte peut contenir des fautes ou des mots mal reconnus, tu les corrigeras. Tu peux aussi résumer ou remplacer certaines parties non-essentielles par "[...]" pour raccourcir la citation.

Renvoie uniquement le résultat en JSON, **sans aucun commentaire ni conclusion**.

# Comment choisir les extraits à vérifier

Pour être pertinente, une citation doit :
- être vérifiable grâce à l'exposition de faits
- faire mention d'une source ou d'un contenu que l'on peut vérifier
Et remplir au moins un des critères suivants :
- présenter des éléments incomplets ou approximatifs
- présenter un argument fallacieux, trompeur ou mensonger
- présenter des informations intéressantes à vérifier

Ne méritent pas de travail de vérification :
- les évidences comme "Le ciel est bleu !"
- les figures de style et l'humour (comme les hyperboles, les métaphores, etc)
- les erreurs mineures
- les opinions personnelles ("j'aime ça")

# Exemple

## Input

```json
{
"video": {
"title": "Thinkerview - La diagonale du vide en France"
},
"captions": [
{ "start": 10, "text": "Cette mesure sociale a été un désastre de la pensée ça ne m'évoque que du dégoût elle n'a fait que créer une augmentation du chômage, c'est pour moi une pure folie" },
{ "start": 85, "text": "il y a d'autres zones en France qui sont très peuplées elle s'affiche ici et juste là et oui je sais effectivement je pense que je peux tenter une" },
{ "start": 89, "text": "reconversion à devenir présentateur météo" },
{ "start": 94, "text": "dans les zones que vous voyez ici on compte seulement 6,5% de la population française métropolitaine pourtant et bien ces espaces" },
{ "start": 102, "text": "représentent 42% du territoire national mais alors pourquoi la diagonale du vide comme" }
]
}
```

## Output

```json
{
"statements": [
{ "time": 10, "text": "Cette mesure sociale [...] n'a fait que créer une augmentation du chômage" },
{ "time": 94, "text": "ici on compte seulement 6,5% de la population française métropolitaine" },
{ "time": 94, "text": "ces espaces représentent 42% du territoire national" }
]
}
```
11 changes: 11 additions & 0 deletions apps/cf/lib/llms/templates/statements_extractor_user_prompt.eex
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
```json
{
  "video": {
    "title": <%=
      # The title (not the numeric id) gives the model context about the video.
      # Jason.encode!/1 emits the surrounding quotes and escapes any quote or
      # backslash in the title so the prompt stays valid JSON.
      # NOTE(review): assumes the video struct exposes a `title` field - confirm.
      Jason.encode!(video.title) %>
  },
  "captions": <%=
    # Each caption is reduced to its start time, floored to an integer, and
    # its trimmed text.
    captions
    |> Enum.map(fn caption ->
      %{"start" => floor(caption["start"]), "text" => String.trim(caption["text"])}
    end)
    |> Jason.encode!() %>
}
```
2 changes: 2 additions & 0 deletions apps/cf/mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ defmodule CF.Mixfile do
{:sweet_xml, "~> 0.6"},
{:burnex, "~> 3.1"},
{:yaml_elixir, "~> 2.9.0"},
{:openai, "~> 0.6.1"},
{:jason, "~> 1.4"},

# ---- Internal ----
{:db, in_umbrella: true},
Expand Down
4 changes: 2 additions & 2 deletions apps/cf_jobs/config/config.exs
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ config :cf_jobs, CF.Jobs.Scheduler,
],
# Captions
download_captions: [
# every 10 minutes
schedule: "*/10 * * * *",
# every minute
schedule: "*/1 * * * *",
task: {CF.Jobs.DownloadCaptions, :update, []},
overlap: false
]
Expand Down
1 change: 1 addition & 0 deletions mix.lock
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
"nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"},
"not_qwerty123": {:hex, :not_qwerty123, "2.2.1", "656e940159517f2d2f07ea0bb14e4ad376d176b5f4de07115e7a64902b5e13e3", [:mix], [{:gettext, "~> 0.13", [hex: :gettext, repo: "hexpm", optional: false]}], "hexpm", "7637173b09eb7b26b29925039d5b92f7107c94a27cbe4d2ba8efb8b84d060c4b"},
"oauth2": {:hex, :oauth2, "0.9.4", "632e8e8826a45e33ac2ea5ac66dcc019ba6bb5a0d2ba77e342d33e3b7b252c6e", [:mix], [{:hackney, "~> 1.7", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "407c6b9f60aa0d01b915e2347dc6be78adca706a37f0c530808942da3b62e7af"},
"openai": {:hex, :openai, "0.6.1", "ad86b5b253969fe6d59896d295b1a573cbe44d586fd00bfa8cf3f440d800b4d6", [:mix], [{:httpoison, "~> 2.0", [hex: :httpoison, repo: "hexpm", optional: false]}, {:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "aea82953ea82fcbf91d0474125943becf5d8318af53081ed722a0f26d4346353"},
"parallel_stream": {:hex, :parallel_stream, "1.0.6", "b967be2b23f0f6787fab7ed681b4c45a215a81481fb62b01a5b750fa8f30f76c", [:mix], [], "hexpm", "639b2e8749e11b87b9eb42f2ad325d161c170b39b288ac8d04c4f31f8f0823eb"},
"parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},
"phoenix": {:hex, :phoenix, "1.5.14", "2d5db884be496eefa5157505ec0134e66187cb416c072272420c5509d67bf808", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.13 or ~> 3.0", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.0", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:plug, "~> 1.10", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 1.0 or ~> 2.2", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.1.2 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "207f1aa5520320cbb7940d7ff2dde2342162cf513875848f88249ea0ba02fef7"},
Expand Down

0 comments on commit 2d0b3d9

Please sign in to comment.