-
Notifications
You must be signed in to change notification settings - Fork 0
/
langchain_llm_chain_extract.py
195 lines (171 loc) · 8.44 KB
/
langchain_llm_chain_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import re
from langchain import PromptTemplate, OpenAI, LLMChain
from langchain.chat_models import ChatOpenAI
import requests
from xml.etree import ElementTree
import sys
import os
import pandas as pd
from collections import Counter
from pathlib import Path
# Name of the OpenAI chat model used for every prompt.
model = 'gpt-3.5-turbo'
# Closed list of topic names that LLM answers are matched against.
accepted_categories = ['politics', 'environment', 'society', 'sports', 'lifestyle', 'technology', 'arts']
# Directory that receives the generated Excel and HTML reports.
target_folder = Path('./llm_chain_out')
# exist_ok=True makes the creation idempotent and removes the race between
# checking for the directory and creating it.
target_folder.mkdir(parents=True, exist_ok=True)
def extract_rss(url):
    """
    Downloads an RSS feed and extracts the content and title of every item.
    :param url The RSS feed URL, like e.g: http://www.theguardian.com/profile/georgemonbiot/rss
    :return A list of dictionaries with the keys 'content' and 'title', one per item
    :raises requests.HTTPError if the server answers with an error status
    """
    # Bound the wait so an unresponsive server cannot hang the whole run.
    response = requests.get(url, timeout=30)
    # Fail early on 4xx/5xx instead of handing an error page to the XML parser.
    response.raise_for_status()
    tree = ElementTree.fromstring(response.content)
    # Only <channel> elements directly below the root and <item> elements
    # directly below a channel are visited.
    # Fields are read by position: the first child of an item is used as the
    # title and the third one as the content.
    # NOTE(review): presumably these are <title> and <description>, as in the
    # Guardian feeds - confirm before pointing this at other feeds.
    return [
        {'content': item[2].text, 'title': item[0].text}
        for channel in tree.findall('channel')
        for item in channel.findall('item')
    ]
def process_llm(input_list: list, prompt_template):
    """
    Builds an LLMChain for one prompt template and applies it to all inputs.
    :param input_list a list of dictionaries with the content and title of each article
    :param prompt_template A single prompt template with content and title parameters
    :return The result of LLMChain.apply for the given input list
    """
    # The chat model is selected by the module-level `model` setting.
    chat_model = ChatOpenAI(temperature=0, model=model)
    prompt = PromptTemplate.from_template(prompt_template)
    chain = LLMChain(llm=chat_model, prompt=prompt)
    return chain.apply(input_list)
def categorize_sentiment(text):
    """
    Maps a free-text answer to one of five sentiment labels.
    The answer is lower-cased and searched for the label phrases; the first
    phrase found, in the order listed below, wins.
    :param text The raw answer text
    :return 'very negative', 'negative', 'very positive' or 'positive'; 'neutral' when none of them occurs
    """
    lowered = text.lower()
    # Order matters: each 'very ...' phrase contains its plain counterpart, so
    # it has to be tested first; both negative phrases precede the positive ones.
    ordered_labels = ('very negative', 'negative', 'very positive', 'positive')
    return next((label for label in ordered_labels if label in lowered), 'neutral')
def sanitize_categories(text):
    """
    Picks the accepted categories that are mentioned in a free-text answer.
    :param text The raw answer text
    :return The entries of `accepted_categories` that occur as a substring of the lower-cased text, in the order of `accepted_categories`
    """
    haystack = text.lower()
    return [category for category in accepted_categories if category in haystack]
def sanitize_keywords(text):
    """
    Turns a comma separated keyword answer into a clean list of keywords.
    The text is lower-cased, every 'keywords:' marker is removed and the rest
    is split on commas. Each piece is trimmed and loses one trailing period.
    :param text The raw answer text
    :return The list of non-empty keywords in their original order
    """
    text = text.lower().replace("keywords:", "").strip()
    # Trim again after dropping the period so "energy ." does not become "energy ".
    pieces = (re.sub(r"\.$", "", piece.strip()).strip() for piece in text.split(","))
    # An answer without keywords, or with trailing/doubled commas, yields empty
    # pieces; dropping them keeps '' from being counted as a keyword.
    return [keyword for keyword in pieces if keyword]
# Prompt templates with the placeholders {content} and {title}.
# The position in the list is significant for code that pairs answers with prompts:
# index 0 asks for the sentiment, index 1 for keywords, index 2 for categories.
prompt_templates = [
    "Please tell me the sentiment of {content} with this the title: {title}? "
    "Is it very positive, positive, very negative, negative or neutral? "
    "Please answer using these expressions: 'very positive', 'positive', 'very negative', 'negative' or 'neutral'",
    "Please extract the most relevant keywords from {content} the title: {title}. "
    "Use a the prefix 'Keywords:' before the list of keywords.",
    # The accepted category names are appended as a comma separated list.
    "Please categorize the following content using the following content {content} "
    "with title {title} using these categories: " + ",".join(accepted_categories),
]
def serialize_results(url, result_df, title, sentiment_counter: Counter, categories_counter: Counter, keywords_counter: Counter):
    """
    Converts the results to an Excel sheet and an HTML page. The HTML page also contains the counter information.
    Both files are written to `target_folder` and named after `title`.
    :param url The RSS feed URL; the page heading is derived from it
    :param result_df The combined raw data and with the LLM output
    :param title The RSS feed URL with some modified characters, used as the file name stem
    :param sentiment_counter The counter with the sentiment information
    :param categories_counter The counter with the counted categories
    :param keywords_counter The counter with the counted keywords; only the 10 most common ones are shown
    """
    result_df.to_excel(target_folder / f"{title}.xlsx")
    html_file = target_folder / f"{title}.html"
    # escape=False leaves the cell text unescaped, so markup contained in the
    # data is emitted as-is into the page.
    table_html = result_df.to_html(escape=False)
    sentiment_html = generate_sentiment_table(sentiment_counter, "Sentiment")
    categories_html = generate_sentiment_table(categories_counter, "Category")
    keywords_html = generate_sentiment_table(dict(keywords_counter.most_common(10)), "Keywords")
    # Heading: the URL without everything up to "theguardian.com/profile",
    # without "/rss" and without slashes.
    page_heading = re.sub(r'.+?theguardian.com/profile', '', url).replace("/rss", "").replace("/", "")
    # The page is assembled in memory and written once, instead of writing the
    # bare table to disk and reading it back to wrap it.
    content = f"""<html>
<head>
<meta charset="UTF-8" />
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-9ndCyUaIbzAi2FUVXJi0CjmCapSmO7SnpJef0486qhLnuZ2cdeRhO02iuK6FUUVM" crossorigin="anonymous">
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js" integrity="sha384-geWF76RCwLtnZ8qwWowPQNguL3RmwHVBC9FhGdlKrxdiJJigb/j/68SIy3Te4Bkz" crossorigin="anonymous"></script>
</head>
<body>
<div class="container-fluid">
<h1>{page_heading}</h1>
<h3>Sentiment Count</h3>
{sentiment_html}
<h4>Categories Count</h4>
{categories_html}
<h4>Keywords Count</h4>
{keywords_html}
{table_html}
</div>
</body>
</html>"""
    # Swap the default pandas table class for the Bootstrap table classes.
    content = content.replace('class="dataframe"', 'class="table table-striped table-hover dataframe"')
    # Make sure the file is written in UTF-8
    with open(html_file, "w", encoding="utf-8") as file:
        file.write(content)
def generate_sentiment_table(sentiment_counter, title):
    """
    Renders label/count pairs as a two column HTML table.
    :param sentiment_counter A mapping whose keys are the labels and whose values are the counts
    :param title The header text of the first column; the second column is headed 'Count'
    :return The HTML table as a single string, one row per key in iteration order
    """
    header = f"<table class='table table-hover'><tr><th>{title}</th><th>Count</th></tr>"
    rows = (
        f"<tr><td style='max-width: 200px; width: 100px'>{label}</td><td>{sentiment_counter[label]}</td></tr>"
        for label in sentiment_counter
    )
    return header + "".join(rows) + "</table>"
def process_url(url):
    """
    Processes one RSS feed from download to report.
    Extracts the items of the feed, applies every prompt template to them and
    combines each item with its answers into one record. The records and the
    sentiment, category and keyword counts are then handed to serialize_results.
    :param url: the URL of the RSS feed, like e.g: http://www.theguardian.com/profile/georgemonbiot/rss
    """
    print(f"Processing {url}")
    articles = extract_rss(url)
    # One list of answers per prompt template, in the order of prompt_templates.
    llm_responses = [process_llm(articles, template) for template in prompt_templates]

    sentiment_counter = Counter()
    categories_counter = Counter()
    keywords_counter = Counter()
    records = []
    # Each row holds the article at index 0 followed by its answers; index 1 is
    # treated as the sentiment answer, 2 as the keywords and 3 as the categories.
    for row in zip(articles, *llm_responses):
        article = row[0]
        sentiment_text = row[1]['text']
        keywords_text = row[2]['text']
        categories_text = row[3]['text']

        sentiment_label = categorize_sentiment(sentiment_text)
        topics = sanitize_categories(categories_text)

        sentiment_counter[sentiment_label] += 1
        categories_counter.update(topics)
        keywords_counter.update(sanitize_keywords(keywords_text))

        records.append({
            **article,
            'sentiment': sentiment_text,
            'keywords': keywords_text,
            'sentiment_category': sentiment_label,
            'classification': categories_text,
            'topics': ",".join(topics),
        })

    result_df = pd.DataFrame(records)
    # Colons and slashes are replaced so the URL can serve as a file name stem.
    title = url.replace(":", "_").replace("/", "_")
    serialize_results(url, result_df, title, sentiment_counter, categories_counter, keywords_counter)
if __name__ == "__main__":
    # Example:
    # python .\langchain_llm_chain_extract.py http://www.theguardian.com/profile/georgemonbiot/rss http://www.theguardian.com/profile/simonjenkins/rss
    # http://www.theguardian.com/profile/zoewilliams/rss http://www.theguardian.com/profile/marinahyde/rss http://www.theguardian.com/profile/pollytoynbee/rss https://www.theguardian.com/profile/owen-jones/rss
    # https://www.theguardian.com/profile/jonathanfreedland/rss https://www.theguardian.com/profile/johncrace/rss
    # Configuration:
    # Do not forget to set OPENAI_API_KEY in your environment
    # os.environ["OPENAI_API_KEY"] = '<key>'
    if len(sys.argv) == 1:
        print("Please enter the URLs from which the titles are to be extracted.")
        # A non-zero exit status lets calling shells and scripts detect the usage error.
        sys.exit(1)
    # Every command line argument is treated as one RSS feed URL.
    for url in sys.argv[1:]:
        process_url(url)