In [1]:
# Imports
import numpy as np
import pandas as pd

import langchain
import torch

import huggingface_hub

In [2]:
import settings

In [3]:
# Display the installed langchain version
langchain.__version__

'0.0.298'

In [4]:
# Load the example dataset (a CSV file, path relative to the working directory)
# into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer="example_data/example.csv")

In [5]:
# Sanity check: preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,FixTypos,Chunk
0,catt,jeep
1,hunter-,jeeping
2,huner,"jeep, vehicle: 4wd"
3,-alzheimer's patient,horseback
4,snowborder,horseback riding


In [6]:
# Ok, it loaded. Now check which column names pandas parsed from the CSV header
df.columns

Index(['FixTypos', 'Chunk'], dtype='object')

In [7]:
# List the distinct raw values in the FixTypos column
df["FixTypos"].unique()

array(['catt', 'hunter-', 'huner', "-alzheimer's patient", 'snowborder',
       'snowboarder', 'missing person', 'other-camper', 'bicylist',
       'dog_', 'aircraft   e-', 'flood***victimss'], dtype=object)

In [8]:
# Hugging Face Hub model ids, grouped by the behavior noted for each group.

# Instruct-like models: these are more about text completion than about
# responding to inputs. They only produce desirable behavior when inference is
# NOT done in batches, but is instead done over the entire list.
instruct_like = [
    "HuggingFaceH4/zephyr-7b-beta",
    "tiiuae/falcon-7b-instruct",
    "mistralai/Mistral-7B-Instruct-v0.1",
    "mistralai/Mistral-7B-v0.1",
    "declare-lab/flan-alpaca-large",
    "bigcode/starcoder",
    "bigscience/bloom"
]

# NOTE(review): presumably models whose inference requests timed out — confirm.
timeouts = [
    "01-ai/Yi-34B",
    "01-ai/Yi-6B",
    "openchat/openchat_3.5"
]

exhibits_somewhat_desired_behavior = [
    "google/flan-t5-xxl"
]

# Backward-compatible alias: the list was originally defined under this
# misspelled name, so keep it bound to the same object for existing references.
exihibits_somewhat_desired_behavior = exhibits_somewhat_desired_behavior

In [8]:
# Nodes
from prompt_tuning import PromptTuner
from prompts import typofix, chunker

# Models
from langchain.llms import HuggingFaceHub
from ml_models.apis import HUGGINGFACE_APIs as HF
from ml_models.models import HuggingFaceLLM # somewhat deprecated

# Disabled alternative: build the tuners on the project's HuggingFaceLLM wrapper.
#typofix_ptuner = PromptTuner(HuggingFaceLLM(api=HF.Falcon_7b_Instruct), typofix.FEW_SHOT_PROMPT)
#chunker_ptuner = PromptTuner(HuggingFaceLLM(api=HF.Falcon_7b_Instruct), chunker.FEW_SHOT_PROMPT)

# Model kwargs passed to both HuggingFaceHub models below.
# temperature: 0.0 = most deterministic, 1.0 = most stochastic
default_kwargs = dict(temperature=0.5, max_length=500)

# Disabled alternative: typo fixing with google/flan-t5-xxl.
#typofix_ptuner = PromptTuner(HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs=default_kwargs), typofix.FEW_SHOT_PROMPT)

# One tuner per task; both use falcon-7b-instruct, each with its own few-shot prompt.
typofix_ptuner = PromptTuner(
    HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs=default_kwargs),
    typofix.FEW_SHOT_PROMPT,
)
chunker_ptuner = PromptTuner(
    HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs=default_kwargs),
    chunker.FEW_SHOT_PROMPT,
)



In [9]:
from chains import ChatChain

# Putting it all together: combine the two tuners' chains with the `|` operator
# (presumably typo-fix first, then chunking — confirm against PromptTuner.chain)
chain = typofix_ptuner.chain | chunker_ptuner.chain
# Wrap the combined chain in the project's ChatChain
final_chain = ChatChain(chain)  # for production

In [11]:
import data_cleaning as dcl

# Time to try!
# Other dcl entry points, currently disabled:
#df_cleaned = dcl.clean_column(typofix_ptuner, df, "FixTypos")
#dcl.inference_clean(typofix_ptuner, df["FixTypos"].unique(), batch_size=3)
#dcl.raw_chain_inference(typofix_ptuner, ['flood***victimss', 'catt', 'hunter-'])

# Pass every distinct FixTypos value to the typo-fix tuner and print the result.
distinct_fixtypos = df["FixTypos"].unique()
print(dcl.safe_chain_inference(typofix_ptuner, distinct_fixtypos))

# Next test: Test on a validation set
# Next step: validator for spellchecking


AI: After:
[
`cat`
`hunter`
`hunter`
`alzheimer's patient`
`snowborder`
`snowboarder`
`missing person`
`bicyclist`
`dog`
`aircraft`
`flood victimss`
]
User 


In [14]:
# Call safe_chain_inference on the unique FixTypos values as a bare expression,
# so the notebook displays the returned value itself rather than printed text.
dcl.safe_chain_inference(typofix_ptuner, df["FixTypos"].unique())

'`cat`\n`hunter`\n`hunter`\n`'

In [19]:
# Length of a sample response string that includes the leading " \nAI: " prefix
# (presumably a raw model output copied by hand for inspection — confirm)
len(' \nAI: `cat`\n`hunter`\n`hunter`\n`')

31