In [1]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
import pandas as pd
import json

d:\Anaconda3\envs\LLM\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
d:\Anaconda3\envs\LLM\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
import os
import nltk

import os, sys
import openai
import langchain
# Make modules that live two directories above this notebook importable.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Load the variables from the .env file located by find_dotenv() into the
# process environment; OPENAI_API_BASE and OPENAI_API_KEY are read from it below.
_ = load_dotenv(find_dotenv())


# Point the OpenAI client at the proxy endpoint; the base URL must end with /v1.
openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_key = os.environ["OPENAI_API_KEY"]

from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

# gpt-3.5-turbo is a chat model, so it is built with the chat wrapper
# (ChatOpenAI) rather than the completion wrapper (OpenAI).
# Temperature 0 keeps the answers deterministic.
llm = ChatOpenAI(
    temperature=0.0,
    model_name="gpt-3.5-turbo",
)
# Turn your texts into embeddings
embedding_model = OpenAIEmbeddings()

# nltk.download('averaged_perceptron_tagger')

# pip install unstructured
# Other dependencies to install https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/unstructured_file.html
# pip install python-magic-bin
# pip install chromadb



In [3]:
# Chat model used for the matching request. Temperature 0 gives clean answers
# without a lot of creativity; each reply is capped at 1000 tokens.
chat_model = ChatOpenAI(
    max_tokens=1000,
    temperature=0,
)

In [4]:
# Fields the model must return for every industry, as (name, description)
# pairs. This is basically a fancy prompt template.
response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in (
        ("input_industry", "This is the input_industry from the user"),
        ("standardized_industry", "This is the industry you feel is most closely matched to the users input"),
        ("match_score", "A score 0-100 of how close you think the match is between user input and your match"),
    )
]

# Parser built from the schemas above; it also produces the formatting
# instructions that are later injected into the prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [5]:
# Build the formatting instructions once, then print that same text so what is
# shown here is exactly what gets injected into the prompt.
format_instructions = output_parser.get_format_instructions()
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"input_industry": string  // This is the input_industry from the user
	"standardized_industry": string  // This is the industry you feel is most closely matched to the users input
	"match_score": string  // A score 0-100 of how close you think the match is between user input and your match
}
```


In [6]:
# Prompt text. {format_instructions} is filled in up front from the output
# parser; {user_industries} and {standardized_industries} are supplied when the
# prompt is formatted.
template = """
You will be given a series of industry names from a user.
Find the best corresponding match on the list of standardized names.
The closest match will be the one with the closest semantic meaning. Not just string similarity.

{format_instructions}

Wrap your final output with closed and open brackets (a list of json objects)

input_industry INPUT:
{user_industries}

STANDARDIZED INDUSTRIES:
{standardized_industries}

YOUR RESPONSE:
"""

# The whole request is a single human message built from the template above.
prompt = ChatPromptTemplate(
    input_variables=["user_industries", "standardized_industries"],
    partial_variables={"format_instructions": format_instructions},
    messages=[HumanMessagePromptTemplate.from_template(template)],
)

In [7]:
# Get your standardized names. You can swap this out with whatever list you want!
df = pd.read_csv('../data/LinkedInIndustries.csv')
# Drop missing cells and coerce the rest to str before joining: str.join raises
# a TypeError on NaN or any other non-string value in the column.
standardized_industries = ", ".join(df['Industry'].dropna().astype(str))
standardized_industries

'Corporate Services, Recreation & Travel, Legal, Wellness & Fitness, Entertainment, Consumer Goods, Design, Arts, Manufacturing, Finance, Health Care, Construction, Nonprofit, Real Estate, Software & IT Services, Hardware & Networking, Agriculture, Education, Public Administration, Transportation & Logistics, Public Safety, Media & Communications, Energy & Mining, Retail'

In [8]:
# Your user input: free-form industry names, including misspellings and paraphrases
user_input = "air LineZ, airline, aviation, planes that fly, farming, bread, wifi networks, twitter media agency"

# Fill the two runtime slots of the prompt template
_input = prompt.format_prompt(
    standardized_industries=standardized_industries,
    user_industries=user_input,
)

# Show how many messages were produced, the type of the first one, and its rendered text
print(
    f"There are {len(_input.messages)} message(s)",
    f"Type: {type(_input.messages[0])}",
    "---------------------------",
    _input.messages[0].content,
    sep="\n",
)

There are 1 message(s)
Type: <class 'langchain.schema.messages.HumanMessage'>
---------------------------

You will be given a series of industry names from a user.
Find the best corresponding match on the list of standardized names.
The closest match will be the one with the closest semantic meaning. Not just string similarity.

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"input_industry": string  // This is the input_industry from the user
	"standardized_industry": string  // This is the industry you feel is most closely matched to the users input
	"match_score": string  // A score 0-100 of how close you think the match is between user input and your match
}
```

Wrap your final output with closed and open brackets (a list of json objects)

input_industry INPUT:
air LineZ, airline, aviation, planes that fly, farming, bread, wifi networks, twitter media agency

STANDARDIZED INDUSTRI

In [9]:
# Send the formatted prompt messages to the chat model and keep its reply in `output`.
output = chat_model(_input.to_messages())

In [10]:
# Show the class of the reply object, then the raw text it carries
print(type(output), output.content, sep="\n")

<class 'langchain.schema.messages.AIMessage'>
```json
[
	{
		"input_industry": "air LineZ",
		"standardized_industry": "Transportation & Logistics",
		"match_score": "90"
	},
	{
		"input_industry": "airline",
		"standardized_industry": "Transportation & Logistics",
		"match_score": "95"
	},
	{
		"input_industry": "aviation",
		"standardized_industry": "Transportation & Logistics",
		"match_score": "95"
	},
	{
		"input_industry": "planes that fly",
		"standardized_industry": "Transportation & Logistics",
		"match_score": "85"
	},
	{
		"input_industry": "farming",
		"standardized_industry": "Agriculture",
		"match_score": "100"
	},
	{
		"input_industry": "bread",
		"standardized_industry": "Consumer Goods",
		"match_score": "100"
	},
	{
		"input_industry": "wifi networks",
		"standardized_industry": "Hardware & Networking",
		"match_score": "100"
	},
	{
		"input_industry": "twitter media agency",
		"standardized_industry": "Media & Communications",
		"match_score": "100"
	}
]
```


In [16]:
def extract_json_payload(text: str) -> str:
    """Return the JSON text carried by a model reply, without markdown fences.

    Handles three reply shapes:
    - a fence tagged as json (```json ... ```): the text inside that fence;
    - an untagged fence (``` ... ```): the text inside the first fence;
    - no fence at all: the whole reply.

    The result is stripped of surrounding whitespace in every case.
    """
    if "```json" in text:
        # Same extraction as before: everything between "```json" and the next fence.
        return text.split("```json", 1)[1].split("```", 1)[0].strip()
    if "```" in text:
        # The model sometimes omits the language tag; take the first fenced span.
        # If the closing fence is missing, this yields everything after the opener.
        return text.split("```", 2)[1].strip()
    return text.strip()


json_string = extract_json_payload(output.content)

print(json_string)

[
	{
		"input_industry": "air LineZ",
		"standardized_industry": "Transportation & Logistics",
		"match_score": "90"
	},
	{
		"input_industry": "airline",
		"standardized_industry": "Transportation & Logistics",
		"match_score": "95"
	},
	{
		"input_industry": "aviation",
		"standardized_industry": "Transportation & Logistics",
		"match_score": "95"
	},
	{
		"input_industry": "planes that fly",
		"standardized_industry": "Transportation & Logistics",
		"match_score": "85"
	},
	{
		"input_industry": "farming",
		"standardized_industry": "Agriculture",
		"match_score": "100"
	},
	{
		"input_industry": "bread",
		"standardized_industry": "Consumer Goods",
		"match_score": "100"
	},
	{
		"input_industry": "wifi networks",
		"standardized_industry": "Hardware & Networking",
		"match_score": "100"
	},
	{
		"input_industry": "twitter media agency",
		"standardized_industry": "Media & Communications",
		"match_score": "100"
	}
]


In [17]:
# Print the raw model reply again, markdown code fences included.
print(output.content)

```json
[
	{
		"input_industry": "air LineZ",
		"standardized_industry": "Transportation & Logistics",
		"match_score": "90"
	},
	{
		"input_industry": "airline",
		"standardized_industry": "Transportation & Logistics",
		"match_score": "95"
	},
	{
		"input_industry": "aviation",
		"standardized_industry": "Transportation & Logistics",
		"match_score": "95"
	},
	{
		"input_industry": "planes that fly",
		"standardized_industry": "Transportation & Logistics",
		"match_score": "85"
	},
	{
		"input_industry": "farming",
		"standardized_industry": "Agriculture",
		"match_score": "100"
	},
	{
		"input_industry": "bread",
		"standardized_industry": "Consumer Goods",
		"match_score": "100"
	},
	{
		"input_industry": "wifi networks",
		"standardized_industry": "Hardware & Networking",
		"match_score": "100"
	},
	{
		"input_industry": "twitter media agency",
		"standardized_industry": "Media & Communications",
		"match_score": "100"
	}
]
```


In [18]:
# output_parser.parse(output.content) would ideally be used here, but it does not work in all cases,
# so the extracted JSON text is decoded directly with json.loads instead.
structured_data = json.loads(json_string)
structured_data

[{'input_industry': 'air LineZ',
  'standardized_industry': 'Transportation & Logistics',
  'match_score': '90'},
 {'input_industry': 'airline',
  'standardized_industry': 'Transportation & Logistics',
  'match_score': '95'},
 {'input_industry': 'aviation',
  'standardized_industry': 'Transportation & Logistics',
  'match_score': '95'},
 {'input_industry': 'planes that fly',
  'standardized_industry': 'Transportation & Logistics',
  'match_score': '85'},
 {'input_industry': 'farming',
  'standardized_industry': 'Agriculture',
  'match_score': '100'},
 {'input_industry': 'bread',
  'standardized_industry': 'Consumer Goods',
  'match_score': '100'},
 {'input_industry': 'wifi networks',
  'standardized_industry': 'Hardware & Networking',
  'match_score': '100'},
 {'input_industry': 'twitter media agency',
  'standardized_industry': 'Media & Communications',
  'match_score': '100'}]

In [19]:
# One row per user-supplied industry.
matches = pd.DataFrame(structured_data)
# match_score arrives as a string (e.g. "90"). Convert it to a number so the
# column sorts and filters numerically rather than lexicographically
# ("100" < "85" as strings). Non-numeric scores become NaN instead of raising.
if "match_score" in matches.columns:
    matches["match_score"] = pd.to_numeric(matches["match_score"], errors="coerce")
matches

Unnamed: 0,input_industry,standardized_industry,match_score
0,air LineZ,Transportation & Logistics,90
1,airline,Transportation & Logistics,95
2,aviation,Transportation & Logistics,95
3,planes that fly,Transportation & Logistics,85
4,farming,Agriculture,100
5,bread,Consumer Goods,100
6,wifi networks,Hardware & Networking,100
7,twitter media agency,Media & Communications,100


#### To Do
1. Look at new incoming industries from the user
2. Match them against your database of values you've already mapped
3. For existing ones, save an API call and get the result from the database
4. For new ones, batch them together and send them to your LLM in a single call