<!--
 * @Author: wangding wangding19@mails.ucas.ac.cn
 * @Date: 2023-08-12 10:11:20
 * @LastEditors: wangding wangding19@mails.ucas.ac.cn
 * @LastEditTime: 2023-08-12 10:12:37
 * @FilePath: \LangChainPlayGround\TutorialsByVedio\10.Miscs.ipynb
 * @Description: 
 * 
 * Copyright (c) 2023 by ${git_name_email}, All Rights Reserved. 
-->

# Miscellaneous LangChain Topics

In [None]:
import os
import sys

import openai
import langchain

# Append '../..' (resolved against the working directory) to the module search path.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv() into the environment.
_ = load_dotenv(find_dotenv())


# Point the client at a proxy endpoint when OPENAI_API_BASE is set (the URL must
# include the /v1 suffix); otherwise keep the client's existing endpoint instead
# of failing with a KeyError.
openai.api_base = os.environ.get("OPENAI_API_BASE", openai.api_base)
# The API key stays mandatory: a missing variable fails fast here.
openai.api_key = os.environ["OPENAI_API_KEY"]

from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

# NOTE(review): "gpt-3.5-turbo" is a chat model; confirm that the
# langchain.llms.OpenAI wrapper (rather than ChatOpenAI) is intended here.
llm = OpenAI(
    temperature=0.8,
    model_name="gpt-3.5-turbo",
)
# Chat-model wrapper configured with temperature 0.0.
chat = ChatOpenAI(
    temperature=0.0,
    model_name = "gpt-3.5-turbo",
)
# os.environ["LANGCHAIN_WANDB_TRACING"] = "true"
# os.environ["WANDB_PROJECT"] = "langchainPlayGround"



## How OpenAI Counts Tokens

OpenAI使用`tiktoken`来拆分文本为token。该notebook介绍OpenAI是如何计数token的。

编码方式决定了文本被拆分为token的方式。OpenAI的不同模型使用以下3种`tiktoken`支持的编码方式：

1. cl100k_base: gpt-4, gpt-3.5-turbo, text-embedding-ada-002
2. p50k_base: text-davinci-002, text-davinci-003
3. r50k_base 或 gpt2: GPT-3模型，如davinci

In [None]:
import tiktoken

# Look up an encoding directly by its name ("p50k_base").
encoding = tiktoken.get_encoding("p50k_base")
# Let tiktoken resolve the encoding from a model name; per the list above,
# gpt-4 maps to cl100k_base.
encoding_for_model = tiktoken.encoding_for_model("gpt-4")

In [None]:
text_chinese = '你好，朋友'

# Encode the same text with both encoders; each prints its own token-id list.
for enc in (encoding, encoding_for_model):
    print(enc.encode(text_chinese))

# Decode one hard-coded token-id list per encoder back into text.
for enc, token_ids in (
    (encoding, [19526, 254, 25001, 121, 171, 120, 234, 17312, 233, 20998, 233]),
    (encoding_for_model, [57668, 53901, 3922, 4916, 233, 98915]),
):
    print(enc.decode(token_ids))

In [None]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Count the prompt tokens consumed by a list of chat messages.

    Args:
        messages: Iterable of message mappings; every value of every mapping
            is tokenized, and a ``"name"`` key adjusts the count.
        model: Model name. ``"gpt-3.5-turbo"`` and ``"gpt-4"`` print a warning
            and are counted as ``"gpt-3.5-turbo-0301"`` / ``"gpt-4-0314"``.

    Returns:
        Sum of the per-message overhead, the encoded length of every value,
        the per-name adjustments, and 3 tokens for the primed reply.

    Raises:
        NotImplementedError: If ``model`` is not one of the four names above.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        tokenizer = tiktoken.get_encoding("cl100k_base")

    # Unpinned model names: (warning to print, snapshot to count against).
    alias_redirects = {
        "gpt-3.5-turbo": (
            "Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.",
            "gpt-3.5-turbo-0301",
        ),
        "gpt-4": (
            "Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.",
            "gpt-4-0314",
        ),
    }
    if model in alias_redirects:
        warning, snapshot = alias_redirects[model]
        print(warning)
        return num_tokens_from_messages(messages, model=snapshot)

    # Pinned snapshots: (tokens per message, adjustment when a name is present).
    # gpt-3.5-turbo-0301: every message follows <|start|>{role/name}\n{content}<|end|>\n,
    # and if there's a name, the role is omitted (hence -1).
    overhead_by_snapshot = {
        "gpt-3.5-turbo-0301": (4, -1),
        "gpt-4-0314": (3, 1),
    }
    if model not in overhead_by_snapshot:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    per_message, per_name = overhead_by_snapshot[model]

    total = 0
    for message in messages:
        total += per_message
        for field, text in message.items():
            total += len(tokenizer.encode(text))
            if field == "name":
                total += per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3




In [None]:
import openai

# Two-turn conversation: a system prompt (Chinese for "You are a translation
# assistant; please translate English into Chinese, thanks. Reply with the
# translated text only, nothing else.") followed by a named user turn.
example_messages = [
    {
        "role": "system",
        "content": "你是翻译助理，请帮我将英文翻译成中文，谢谢。请只回复翻译文字，不要回复其他内容。",
    },
    {"role": "user", "name": "Alice", "content": "The sky is blue."},
]

for model in ("gpt-3.5-turbo-0301", "gpt-4-0314"):
    print(model)

    # Token count from the function implemented above.
    local_count = num_tokens_from_messages(example_messages, model)
    print(f"{local_count} prompt tokens counted by num_tokens_from_messages().")

    # Token count reported by the OpenAI API. Only the usage figure is needed,
    # so the completion is capped at a single token.
    response = openai.ChatCompletion.create(
        model=model,
        messages=example_messages,
        temperature=0,
        max_tokens=1,
    )
    print()
    api_count = response["usage"]["prompt_tokens"]
    print(f'{api_count} prompt tokens counted by the OpenAI API.')
    print()

## Connecting OpenAI with Apache Spark

Introduction of [pyspark-ai](https://github.com/databrickslabs/pyspark-ai)

Pyspark-AI takes English instructions and compiles them into PySpark objects such as DataFrames. This makes Spark more user-friendly and accessible, allowing you to focus on extracting insights from your data.

**Note:** A Java runtime environment is required.

In [None]:
# import os
# os.environ['JAVA_HOME'] = "/usr/local/jdk1.8.0_221"   # 记得把地址改成自己的

In [None]:
from langchain.chat_models import ChatOpenAI
from pyspark_ai import SparkAI

# If 'gpt-4' is unavailable, use 'gpt-3.5-turbo' (might lower output quality)
# llm = ChatOpenAI(model_name='gpt-4', temperature=0)
# NOTE: this rebinds the notebook-level `llm` created in the first cell.
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

# Build the SparkAI helper around the chat model.
# presumably verbose=True makes it log its intermediate steps — confirm
spark_ai = SparkAI(llm=llm, verbose=True)

# Activate partial functions for Spark DataFrame
# NOTE(review): the DataFrame `.ai` helpers (e.g. `df.ai.plot()`) are presumably
# only available after activate() — confirm before relying on them.
# spark_ai.activate()

In [None]:

# 2. Create a DataFrame from an HTTP URL.
# Here we fetch the shareholders of Apple, one of the best-performing stocks
# in the US market.

holders_dataframe = spark_ai.create_df("https://finance.yahoo.com/quote/AAPL/holders?p=AAPL")

holders_dataframe.show(n=5)


# Plot through the SparkAI instance itself: the `DataFrame.ai` accessor is only
# installed by `spark_ai.activate()`, which is commented out in the setup cell,
# so `holders_dataframe.ai.plot()` would raise AttributeError.
spark_ai.plot_df(holders_dataframe)

## 3. 