In [None]:
!pip install openai
!pip install python-dotenv
!pip install gradio==3.48.0
!pip install pytubefix

In [None]:
from openai import OpenAI
from dotenv import dotenv_values
# Load key/value pairs from the local .env file and build the OpenAI client.
config = dotenv_values('.env')
api_key = config.get("API_KEY")
if not api_key:
  # Fail immediately with an actionable message instead of a bare KeyError
  # (missing entry) or a late authentication error (empty entry).
  raise KeyError("API_KEY is missing or empty in the .env file")
client = OpenAI(api_key=api_key)

In [None]:
from pytubefix import YouTube, Playlist

In [None]:
def get_audio_text(audio_path, title):
  """Transcribe an audio file with the `whisper-1` model.

  Args:
    audio_path: Path of the audio file to transcribe.
    title: Text passed as the transcription `prompt` (the caller supplies the
      video title).

  Returns:
    The `text` attribute of the transcription response.
  """
  # Use a context manager so the file handle is closed even if the API call fails.
  with open(audio_path, "rb") as audio:
    res = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio,
        prompt=title
    )
  return res.text

In [None]:
def get_video_title_text(video_url, audio_name):
  """Download a YouTube video's audio track and transcribe it.

  Args:
    video_url: URL of the YouTube video.
    audio_name: File name under which the downloaded audio is saved.

  Returns:
    A `(title, text)` tuple: the video title and the transcript returned by
    `get_audio_text`.

  Raises:
    ValueError: If the video has no audio-only stream to download.
  """
  video = YouTube(video_url)
  stream = video.streams.filter(only_audio=True).first()   # first audio-only stream, if any
  if stream is None:
    # Without this guard the failure surfaces as an opaque AttributeError on None.
    raise ValueError(f"No audio-only stream available for {video_url}")
  audio_path = stream.download(filename=audio_name)        # save the audio locally
  title = video.title
  text = get_audio_text(audio_path, title)                 # title doubles as the transcription prompt
  return title, text

In [None]:
def get_playlist_info(playlist_url):
  """Transcribe every video of a YouTube playlist.

  Each video's audio is saved as `<position>.mp3`, where the position is the
  zero-based index of the video within the playlist.

  Args:
    playlist_url: URL of the YouTube playlist.

  Returns:
    A dict mapping each video title to its transcript text. Videos sharing a
    title collapse into one entry (the last one wins).
  """
  urls = Playlist(playlist_url)   # iterating the playlist yields one video URL at a time
  # Each call returns a (title, text) pair, which dict() consumes in playlist order.
  return dict(
      get_video_title_text(url, f"{position}.mp3")
      for position, url in enumerate(urls)
  )

In [None]:
playlist_url = "https://www.youtube.com/playlist?list=PLrZrfGLGySzcZoVhb4idy5B0XI25ZhnF7" # playlist from the YouTuber 柴鼠兄弟
playlist_info = get_playlist_info(playlist_url)  # dict of video title -> transcript text for the whole playlist

In [None]:
import pandas as pd

In [None]:
# Store the (title, text) pairs as a two-column table and round-trip it through CSV.
df = pd.DataFrame(list(playlist_info.items()), columns=["title", "text"])
df.to_csv("video_text.csv", index=False)
# keep_default_na=False / dtype=str: read every cell back as a plain string.
# With the defaults, an empty transcript or a title such as "NA" comes back as
# a NaN float, which breaks the later string splitting of each text.
df = pd.read_csv("video_text.csv", keep_default_na=False, dtype=str)

In [None]:
def split_text(all_text, title, chunk_size=50):
  """Split a transcript into title-prefixed chunks small enough to embed.

  A whole transcript exceeds the embedding model's token limit, so it is cut
  into groups of space-separated pieces. Every chunk has the form
  `"<title>,<piece>,<piece>,..."`; the title prefix gives each chunk context.

  Args:
    all_text: Full transcript text. Anything that is not a non-empty string
      (e.g. the NaN pandas produces for an empty CSV cell) is treated as
      "no text".
    title: Video title prepended to every chunk.
    chunk_size: Maximum number of pieces per chunk (default 50).

  Returns:
    A list of chunk strings; empty when there is no text to split.

  Raises:
    ValueError: If `chunk_size` is not positive.
  """
  if chunk_size <= 0:
    raise ValueError("chunk_size must be a positive integer")
  if not isinstance(all_text, str) or not all_text:
    return []                                  # nothing to embed for a missing/empty transcript
  pieces = all_text.split(' ')                 # split the transcript on single spaces
  return [
      ",".join([title, *pieces[start:start + chunk_size]])
      for start in range(0, len(pieces), chunk_size)
  ]

In [None]:
# Chunk every video's transcript and gather all chunks in one flat list.
split_text_list = []
for title, text in zip(df["title"], df["text"]):
  split_text_list.extend(split_text(text, title))

# From here on `df` holds one row per chunk instead of one row per video.
df = pd.DataFrame(split_text_list, columns=['split_text_list'])

In [None]:
def get_embedding(text):
  """Embed `text` with the `text-embedding-ada-002` model.

  Args:
    text: The text to convert into a vector.

  Returns:
    The `embedding` of the first item in the API response's `data`.
  """
  response = client.embeddings.create(model="text-embedding-ada-002", input=text)
  first_item = response.data[0]
  return first_item.embedding

In [None]:
# Embed every chunk (one API call per chunk), then attach the vectors as a new column.
split_text_embeddings = list(map(get_embedding, df["split_text_list"]))
df["embeddings"] = split_text_embeddings

In [None]:
from typing import List
from scipy import spatial
import numpy as np



def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric="cosine",
) -> List[float]:
    """Return the distances between a query embedding and a list of embeddings.

    Args:
        query_embedding: The vector to compare against.
        embeddings: Iterable of vectors; one distance is produced per vector,
            in iteration order.
        distance_metric: One of "cosine", "L1", "L2" or "Linf".

    Returns:
        A list with one distance per entry of `embeddings`.

    Raises:
        KeyError: If `distance_metric` is not a supported name.
    """
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    # Resolve the metric once, up front, so an unsupported name is reported
    # clearly (and even when `embeddings` is empty).
    try:
        metric = distance_metrics[distance_metric]
    except KeyError:
        raise KeyError(
            f"Unknown distance_metric {distance_metric!r}; "
            f"expected one of {sorted(distance_metrics)}"
        ) from None
    return [metric(query_embedding, embedding) for embedding in embeddings]


def indices_of_nearest_neighbors_from_distances(distances) -> np.ndarray:
    """Return the indices that sort `distances` in ascending order (nearest first)."""
    nearest_first = np.argsort(distances)
    return nearest_first

In [None]:
def finance(question):
  """Answer a question using the transcript chunks closest to it.

  The question is embedded, compared against every chunk embedding in
  `df["embeddings"]`, and the nearest chunks (at most two) are placed in the
  prompt sent to the `gpt-3.5-turbo-instruct` completion model.

  Args:
    question: The user's question text.

  Returns:
    The `text` of the first completion choice.
  """
  question_embeddings = get_embedding(question)
  dist = distances_from_embeddings(question_embeddings, df["embeddings"])
  nearest_idx = indices_of_nearest_neighbors_from_distances(dist)
  # Slice rather than index a fixed range(2): with fewer than two chunks the
  # fixed range raised IndexError. The indices are positions, hence .iloc.
  chunks = df["split_text_list"]
  nearest_text = "".join(chunks.iloc[i] + '\n' for i in nearest_idx[:2])

  prompt = f"""
  你是我的投資理財顧問，請根據以下內容回答此問題:{question}
  如果沒有100%確定，就回答'我不知道'

  ###
  內容:
  {nearest_text}
  ###

  """
  res = client.completions.create(
      model="gpt-3.5-turbo-instruct",
      prompt=prompt,
      max_tokens=500,
      temperature=0
  )
  return res.choices[0].text

In [None]:
import gradio as gr

# Wrap finance() in a simple web UI: one text box in, one text box out.
demo = gr.Interface(
    fn=finance,
    inputs="text",
    outputs="text",
    title="投資理財顧問",
    description="輸入您的問題:",
    allow_flagging="never"
)

# debug=True keeps this cell running so errors and logs are shown inline
# (see the Colab notice printed in the cell output).
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
IMPORTANT: You are using gradio version 3.48.0, however version 4.29.0 is available, please upgrade.
--------
Running on public URL: https://9d3dd1fca12ef97ccd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://9d3dd1fca12ef97ccd.gradio.live


