In [1]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
import pandas as pd
import numpy as np
import json

In [2]:
openai_api_key = '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'

In [4]:
# Temp = 0 so that we get clean information without a lot of creativity
chat_model = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, max_tokens=1000)

In [5]:
# How you would like your response structured. This is basically a fancy prompt template
response_schemas = [
    ResponseSchema(name="input_names", description="This is the list of existing name variations from the user"),
    ResponseSchema(name="matching_name", description="This is the name you feel is most closely matched to the users input"),
    ResponseSchema(name="match_score",  description="A score 0-100 of how close you think the match is between user input and your match")
]

# How you would like to parse your output
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [6]:
# See the prompt template you created for formatting
format_instructions = output_parser.get_format_instructions()
print (output_parser.get_format_instructions())

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"input_names": string  // This is the list of existing name variations from the user
	"matching_name": string  // This is the name you feel is most closely matched to the users input
	"match_score": string  // A score 0-100 of how close you think the match is between user input and your match
}
```


In [7]:
template = """
You will be given a series of name variations from a user.
Find the best corresponding match on the list of target names.
The closest match will be the one with the closest semantic meaning. Not just string similarity.

{format_instructions}

Wrap your final output with closed and open brackets (a list of json objects)

input_names INPUT:
{user_names}

TARGET MATCH:
{target_names}

YOUR RESPONSE:
"""

prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template(template)  
    ],
    input_variables=["user_names", "target_names"],
    partial_variables={"format_instructions": format_instructions}
)

# Loading datasets

## Fuzzy multi-variate matching

In [116]:
hd = pd.read_csv('handmade22.csv', index_col=0)[['own', 'plant_name_x', 'fuel_type', 'capacity_mw','costs_name']]
hd.rename(columns={'plant_name_x':'plant_name','costs_name':'plant_name_alt'}, inplace=True)
hd.columns

Index(['own', 'plant_name', 'fuel_type', 'capacity_mw', 'plant_name_alt'], dtype='object')

In [117]:
olds = pd.read_excel('bpdb_selected.xlsx', sheet_name="oldies")
olds.columns

Index(['plant_name', 'capacity', 'own', 'fuel_type', 'start', 'end'], dtype='object')

In [118]:
hd["fuel_type"] = ["HFO" if x=="F.OIL" else x for x in hd.fuel_type.str.upper().values]
olds["fuel_type"] = olds.fuel_type.str.upper()

In [119]:
hd = hd.reset_index()
hd.index = hd.index.set_names(['hd_index'])
olds = olds.reset_index()
olds.index = olds.index.set_names(['olds_index'])

import recordlinkage
indexer = recordlinkage.Index()
indexer.full()
candidates = indexer.index(hd, olds)
compare = recordlinkage.Compare()
compare.numeric('capacity_mw', 'capacity', label="cap")
compare.string('plant_name',
            'plant_name',
            threshold=0.6,
            label='Plant_name_1')
compare.string('plant_name_alt',
            'plant_name',
            threshold=0.6,
            label='Plant_name_2')
compare.string('fuel_type', 'fuel_type', threshold=0.85, label="fuel_type")
features = compare.compute(candidates, hd,
                        olds)



In [120]:
features.sum(axis=1).value_counts().sort_index(ascending=False)

3.0      14
2.0     127
1.5      19
1.0    2705
0.5      31
0.0    7114
Name: count, dtype: int64

In [121]:
potential_matches = features[features.sum(axis=1) > 1].reset_index()
potential_matches['Score'] = potential_matches.loc[:, 'cap':'fuel_type'].sum(axis=1)
potential_matches["hd_name"] = [hd.loc[i].plant_name for i in potential_matches.hd_index]
potential_matches["olds_name"] = [str(olds.loc[i].plant_name) for i in potential_matches.olds_index]

In [122]:
potential_matches.sort_values(by=["Score"], ascending=False).loc[(potential_matches["Plant_name_1"]>0)|(potential_matches["Plant_name_2"]>0)].head(30).reset_index()

Unnamed: 0,index,hd_index,olds_index,cap,Plant_name_1,Plant_name_2,fuel_type,Score,hd_name,olds_name
0,110,112,64,1.0,1.0,0.0,1.0,3.0,Fenchugonj 51 MW Rental (15 Yrs) (Barakatullah),Fenchuganj 15 Years RPP (Barakatullah)
1,145,145,17,1.0,1.0,0.0,1.0,3.0,Rangpur 20 MW /GT,Saidpur 20 MW PP
2,56,36,49,1.0,1.0,0.0,1.0,3.0,Sahzibazar 86 MW RPP (15 yrs),Shahjibazar 86 MW PP (15 Yrs RPP)
3,101,75,26,1.0,1.0,0.0,1.0,3.0,Daudkandi 200MW(Bangla Trac),Doudkandi 200 MW PP (Bangla Trac)
4,128,126,3,1.0,1.0,0.0,1.0,3.0,Sylhet 1x20 MW /GT,Sylhet 20 MW PP
5,126,125,17,1.0,1.0,0.0,1.0,3.0,Saidpur 20 MW /GT,Saidpur 20 MW PP
6,125,125,16,1.0,1.0,0.0,1.0,3.0,Saidpur 20 MW /GT,Rangpur 20 MW PP
7,15,11,56,1.0,1.0,0.0,1.0,3.0,Barabkundu SIPP 22 MW (Regent Power),Barobkundo SIPP (Regent Power)
8,143,144,61,1.0,1.0,0.0,1.0,3.0,Tongi 80 MW GTPP,Tongi 105 MW PP
9,144,145,16,1.0,1.0,0.0,1.0,3.0,Rangpur 20 MW /GT,Rangpur 20 MW PP


In [131]:
round(225, -1)

220

# LLm matching

In [65]:
joined_dates = {}
for i, val in olds.iterrows():
    joined_dates[val['plant_name']] = ", ".join([str(k)+str(' : ')+str(v) for k,v in dict(val.iloc[:-2]).items()])

In [67]:
joined_names = {}
for i, val in hd.iloc[:,:].iterrows():
    joined_names[val['plant_name']] = ", ".join([str(k)+str(' : ')+str(v) for k,v in dict(val).items()])

In [74]:
names_matchlist = pd.DataFrame.from_dict(joined_names, orient="index")
dates_matchlist = pd.DataFrame.from_dict(joined_dates, orient="index")

In [75]:
names_matchlist.to_csv('names_matchlist.csv')
dates_matchlist.to_csv('dates_matchlist.csv')

In [128]:
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity
openai.api_key = openai_api_key

def get_embeddings(input_series):
    input_values = list(input_series.values)
    input_keys = input_series.index

    resp = openai.Embedding.create(
        input=input_values,
        engine="text-similarity-davinci-001")

    embeds = {}
    for i in range(len(input_values)):
        embeds[input_keys[i]] = resp['data'][i]['embedding']
    return embeds

In [110]:
len(embeds[list(embeds.keys())[1]])

12288

In [113]:
source_embeds = get_embeddings(names_matchlist[0])

In [114]:
len(source_embeds[list(source_embeds.keys())[1]])

12288

In [120]:
def embeddings_data(source_embeds):
    xd = pd.DataFrame.from_dict(source_embeds, orient="index")
    final = pd.DataFrame(index=xd.index)
    final["embeddings"] = xd.values.tolist()
    return final

In [122]:
embeddings_data(source_embeds).to_csv('names_matchlist_embeddings.csv')
embeddings_data(embeds).to_csv('dates_matchlist_embeddings.csv')

In [123]:
names_embeds = embeddings_data(source_embeds)
dates_embeds = embeddings_data(embeds)

In [125]:
dates_embeds.head(1)

Unnamed: 0,embeddings
Ashuganj 150 MW ST3,"[-0.008725064806640148, -0.0019621870014816523..."


In [154]:
# search through the reviews for a specific product
def search_embeddings(target_df, source_embedding, n=3):
    # source_embedding = np.transpose(np.array(source_embedding_ser.values))
    target_df["similarities"] = target_df["embeddings"].apply(lambda x: cosine_similarity(np.array(x), source_embedding))

    top_n = target_df.sort_values("similarities", ascending=False).head(n)
    # res = top_n.combined.str.replace("Title: ", "").str.replace("; Content:", ": ")
    return top_n


In [169]:
for i in range(len(names_embeds)):
    res = search_embeddings(dates_embeds, names_embeds["embeddings"][i], n=1)
    if res.similarities[0]>0.9:
        print(names_embeds.index[i],"----MATCH---", res.index[0], res.similarities[0])

Amnura 50MW Sinha Power ----MATCH--- Katakhali 50 MW Q. Rental PP( NPSL) 0.9088233786809273
Ashuganj 50 MW PP ----MATCH--- Ashuganj 55 MW 3 Yrs RPP (Precision Energy) 0.9146701154153333
Barabkundu SIPP 22 MW (Regent Power) ----MATCH--- Barobkundo SIPP (Regent Power) 0.9067138943644909
Baghabari 71 MW /GT
Baghabari 100 MW /GT ----MATCH--- Baghabari 71 MW GT 0.9020315836713727
210 MW Shiddirganj TPP ----MATCH--- Ghorasal 210 MW STunit4 0.9103503305660239
Midland Power Co. Ashuganj 51 MW ----MATCH--- Ashuganj 55 MW 3 Yrs RPP (Precision Energy) 0.9005779636673613
Ashugonj  55 MW 3Yrs Rental (Precision Energy) ----MATCH--- Ashuganj 55 MW 3 Yrs RPP (Precision Energy) 0.9272824160491357
Aggreko 95 MW Bhola ----MATCH--- Bhola 80 MW Q. Rental PP (3 Years, Aggreco) 0.9118009816872104
Siddhirganj 2x120 MW GTPP ----MATCH--- Raojan 210 MW Unit- 2 0.9064332209365884
Chandpur 150 MW CCPP ----MATCH--- Ashuganj 150 MW ST3 0.9107873521160925
Sirajgonj 210 MW CC (NWPGCL) Unit-1 ----MATCH--- Raozan 210 MW

In [14]:
df = pd.read_csv('gpt_match_test.csv')
len(df)

154

In [86]:
# Your user input

user_input = "air LineZ, airline, aviation, planes that fly, farming, bread, wifi networks, twitter media agency"

_input = prompt.format_prompt(user_names=names_matchlist[0].values[:10], target_names=dates_matchlist[0].values)


print (f"There are {len(_input.messages)} message(s)")
print (f"Type: {type(_input.messages[0])}")
print ("---------------------------")
#print (_input.messages[0].content)

There are 1 message(s)
Type: <class 'langchain.schema.messages.HumanMessage'>
---------------------------


In [87]:
output = chat_model(_input.to_messages())

In [88]:
print (type(output))
print (output.content)

<class 'langchain.schema.messages.AIMessage'>
```json
[
	{
		"input_names": "own : Public, plant_name : Kutubdia 900KW Wind PP, fuel_type : Wind, capacity_mw : 0.0, plant_name_alt : Wind Base Power Station, Kutubdia & Hatiya",
		"matching_name": "plant_name : Ashuganj 150 MW ST3, capacity : 150.0, own : Public, fuel_type : Gas",
		"match_score": "20"
	},
	{
		"input_names": "own : IPP, plant_name : Sharishabari 3 MW Engreen Solar Power Plant, fuel_type : Solar, capacity_mw : 3.0, plant_name_alt : Engreen Solar",
		"matching_name": "plant_name : Sylhet 20 MW PP, capacity : 20.0, own : Public, fuel_type : Gas",
		"match_score": "10"
	},
	{
		"input_names": "own : IPP, plant_name : Sympa Solar Power 8 MW, fuel_type : Solar, capacity_mw : 8.0, plant_name_alt : Sympa Solar Power Limited",
		"matching_name": "plant_name : Sylhet 20 MW PP, capacity : 20.0, own : Public, fuel_type : Gas",
		"match_score": "10"
	},
	{
		"input_names": "own : Public, plant_name : Kaptai Solar, fuel_type : Solar,

In [20]:
if "```json" in output.content:
    json_string = output.content.split("```json")[1].strip()
else:
    json_string = output.content

In [21]:
print(output.content)

```json
[
	{
		"input_names": "['Wind Base Power Station, Kutubdia & Hatiya', 'Kutubdia 900KW Wind PP']",
		"matching_name": "Shikalbaha 150 MW Power Plant",
		"match_score": "80"
	},
	{
		"input_names": "['Engreen Solar', 'Sharishabari 3 MW Engreen Solar Power Plant']",
		"matching_name": "Shorishabari Solar plant",
		"match_score": "90"
	},
	{
		"input_names": "['Sympa Solar Power Limited', 'Sympa Solar Power 8 MW']",
		"matching_name": "Shorishabari Solar plant",
		"match_score": "70"
	},
	{
		"input_names": "['7.4 MW Solar Pv Power Plant At Kaptai', 'Kaptai Solar']",
		"matching_name": "Kaptai Solar Power Plant",
		"match_score": "100"
	},
	{
		"input_names": "['ENERGYPRIMA, FENCHUGONJ', 'Fenchugonj 50 MW Rental (Energy Prima)']",
		"matching_name": "Fenchuganj 90 MW CCPP",
		"match_score": "80"
	},
	{
		"input_names": "['DESH CAMBRIDGE, KUMERGOAN', 'Kumargao 10 MW Desh Combridge (15 Yrs)']",
		"matching_name": "Kumargao 10 MW Desh Combridge (15 Yrs)",
		"match_score": "100"
	},
	{

In [22]:
# output_parser.parse(output.content) Ideally this works but not in all cases
structured_data = json.loads(output.content)
structured_data

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [41]:
pd.DataFrame(structured_data)

Unnamed: 0,input_industry,standarized_industry,match_score
0,air LineZ,Transportation & Logistics,80
1,airline,Transportation & Logistics,90
2,aviation,Transportation & Logistics,95
3,planes that fly,Transportation & Logistics,85
4,farming,Agriculture,90
5,bread,Consumer Goods,80
6,wifi networks,Hardware & Networking,95
7,twitter media agency,Media & Communications,90


#### To Do
1. Look at new incoming industries from the user
2. Match against your data base of values you've already mapped
3. For existing ones, save an API call and get the result from the data base
4. For new ones, batch them together for your LLM to return back to you