In [1]:
import re
import pandas as pd

def parse_multiline_logs(log_text: str) -> pd.DataFrame:
    """Parse bracketed log text into a DataFrame with one row per log entry.

    An entry starts on a line shaped like::

        [YYYY-MM-DD HH:MM:SS,mmm] [dotted.module.path:lineno] [LEVEL] message

    Every following line that does not match that header is treated as a
    continuation of the current entry (e.g. a traceback): it is stripped of
    leading/trailing whitespace and appended to ``data`` after a newline, so
    original indentation of continuation lines is not preserved. Lines that
    appear before the first header are discarded.

    Args:
        log_text: Full text of the log.

    Returns:
        DataFrame with the columns ``timestamp``, ``file_path``,
        ``line_number``, ``module_name``, ``log_level`` and ``data``, in that
        order. All values are strings (``line_number`` is not converted to
        int). ``file_path`` is the full dotted name taken from the header and
        ``module_name`` is its last dot-separated component. If no line
        matches the header pattern, an empty DataFrame with these columns is
        returned.
    """
    columns = ["timestamp", "file_path", "line_number", "module_name", "log_level", "data"]

    # Each line is matched on its own, so no re.MULTILINE flag is needed.
    log_entry_pattern = re.compile(
        r'^\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3})\] '
        r'\[(?P<module_info>.+?):(?P<line_number>\d+)\] '
        r'\[(?P<log_level>[A-Z]+)\] (?P<data>.*)'
    )

    entries = []
    current_entry = None

    for line in log_text.splitlines():
        match = log_entry_pattern.match(line)
        if match:
            fields = match.groupdict()
            module_info = fields.pop("module_info")
            fields["file_path"] = module_info
            # rsplit returns the whole string when there is no dot, so
            # dotted and undotted names need no separate handling.
            fields["module_name"] = module_info.rsplit(".", 1)[-1]
            fields["data"] = fields["data"].strip()
            # Register the entry right away; continuation lines below mutate
            # this same dict through `current_entry`.
            current_entry = fields
            entries.append(current_entry)
        elif current_entry is not None:
            current_entry["data"] += "\n" + line.strip()

    # Passing `columns` fixes the column order and keeps the schema even when
    # `entries` is empty (selecting the columns afterwards raised KeyError).
    return pd.DataFrame(entries, columns=columns)


In [2]:
# Read the whole log file as UTF-8 text, then parse it into one row per entry.
with open(r"../data/logs/2025-06-16-logs.log", mode="r", encoding="utf-8") as log_file:
    log_text = log_file.read()

df = parse_multiline_logs(log_text)
df

Unnamed: 0,timestamp,file_path,line_number,module_name,log_level,data
0,"2025-06-16 16:23:24,642",multi_agents.agents.orchestrator,96,orchestrator,INFO,Starting the research process for query 'Прове...
1,"2025-06-16 16:23:24,695",multi_agents.agents.researcher,71,researcher,INFO,Running initial research on the following quer...
2,"2025-06-16 16:23:24,824",gpt_researcher.agent,135,agent,INFO,"research: {""step"": ""start"", ""details"": {""query..."
3,"2025-06-16 16:23:24,825",gpt_researcher.agent,135,agent,INFO,"action: {""action"": ""choose_agent""}"
4,"2025-06-16 16:23:25,108",httpx,1740,httpx,INFO,HTTP Request: POST https://gigachat.sberdevice...
...,...,...,...,...,...,...
44321,"2025-06-16 20:00:57,775",fontTools.subset.timer,357,timer,DEBUG,Took 0.006s to prune 'name'
44322,"2025-06-16 20:00:57,776",fontTools.subset,3600,subset,INFO,name pruned
44323,"2025-06-16 20:00:57,840",multi_agents.agents.utils.file_formats,63,file_formats,INFO,Report written to ./outputs/run_1750092890/6d7...
44324,"2025-06-16 20:00:58,231",multi_agents.agents.utils.file_formats,98,file_formats,INFO,Report written to ./outputs/run_1750092890/12c...


In [3]:
# Number of entries per log level; dropna=False keeps missing levels visible.
df.loc[:, "log_level"].value_counts(dropna=False)

log_level
INFO       37612
DEBUG       4348
ERROR       1982
Name: count, dtype: int64

## Errors

In [4]:
# Keep only the entries logged at ERROR level.
errors_df = df.loc[df["log_level"].eq("ERROR")]
errors_df

Unnamed: 0,timestamp,file_path,line_number,module_name,log_level,data
43,"2025-06-16 16:23:42,074",gpt_researcher.skills.researcher,324,researcher,ERROR,🤷 No content found for 'Проведи исследование и...
88,"2025-06-16 16:23:47,656",gpt_researcher.scraper.pymupdf.pymupdf,79,pymupdf,ERROR,Error loading PDF : https://veorus.ru/upload/i...
354,"2025-06-16 16:27:47,918",gpt_researcher.actions.agent_creator,60,agent_creator,ERROR,Error in choose_agent: JSONDecodeError('Expect...
355,"2025-06-16 16:27:47,918",gpt_researcher.actions.agent_creator,70,agent_creator,ERROR,Error in reading JSON and failed to repair wit...
356,"2025-06-16 16:27:47,918",gpt_researcher.actions.agent_creator,82,agent_creator,ERROR,No JSON found in the string. Falling back to D...
...,...,...,...,...,...,...
43403,"2025-06-16 19:57:12,394",gpt_researcher.scraper.beautiful_soup.beautifu...,44,beautiful_soup,ERROR,Error! : HTTPSConnectionPool(host='www.gosuslu...
43405,"2025-06-16 19:57:12,546",gpt_researcher.scraper.beautiful_soup.beautifu...,44,beautiful_soup,ERROR,Error! : HTTPSConnectionPool(host='tusp24.msp....
43429,"2025-06-16 19:57:13,720",gpt_researcher.scraper.beautiful_soup.beautifu...,44,beautiful_soup,ERROR,Error! : HTTPSConnectionPool(host='ir.alfastra...
44055,"2025-06-16 20:00:55,575",weasyprint,419,weasyprint,ERROR,No anchor ##ИскусственныйИнтеллектиАвтоматизац...


In [5]:
# ERROR entries counted per emitting module (full dotted path).
errors_df.loc[:, "file_path"].value_counts()

file_path
gpt_researcher.scraper.utils                             1598
gpt_researcher.scraper.beautiful_soup.beautiful_soup      135
gpt_researcher.actions.agent_creator                       91
weasyprint                                                 51
gpt_researcher.scraper.pymupdf.pymupdf                     46
gpt_researcher.skills.researcher                           44
gpt_researcher.retrievers.search_plugin.search_plugin      10
gpt_researcher.utils.llm                                    7
Name: count, dtype: int64

In [6]:
# ERROR entries counted per (module, source line) pair.
errors_df.value_counts(subset=["file_path", "line_number"])

file_path                                              line_number
gpt_researcher.scraper.utils                           72             1598
gpt_researcher.scraper.beautiful_soup.beautiful_soup   44              135
weasyprint                                             419              51
gpt_researcher.actions.agent_creator                   60               34
                                                       70               31
gpt_researcher.skills.researcher                       324              30
gpt_researcher.scraper.pymupdf.pymupdf                 79               27
gpt_researcher.actions.agent_creator                   82               25
gpt_researcher.scraper.pymupdf.pymupdf                 76               19
gpt_researcher.skills.researcher                       327              14
gpt_researcher.retrievers.search_plugin.search_plugin  88               10
gpt_researcher.utils.llm                               105               7
gpt_researcher.actions.agent_crea

### gpt_researcher.scraper.utils 72

In [7]:
# Distinct ERROR messages from gpt_researcher.scraper.utils, line 72
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('gpt_researcher.scraper.utils')
    & errors_df['line_number'].eq('72'),
    'data',
].unique()

array(["Error parsing dimension value 100%: invalid literal for int() with base 10: '100%'",
       "Error parsing dimension value inherit: invalid literal for int() with base 10: 'inherit'",
       "Error parsing dimension value auto: invalid literal for int() with base 10: 'auto'",
       "Error parsing dimension value 80%: invalid literal for int() with base 10: '80%'",
       "Error parsing dimension value 90%: invalid literal for int() with base 10: '90%'",
       "Error parsing dimension value alt=: invalid literal for int() with base 10: 'alt='",
       "Error parsing dimension value 300%: invalid literal for int() with base 10: '300%'",
       "Error parsing dimension value 50%: invalid literal for int() with base 10: '50%'"],
      dtype=object)

### gpt_researcher.scraper.beautiful_soup.beautiful_soup 44

In [8]:
# Distinct ERROR messages from gpt_researcher.scraper.beautiful_soup.beautiful_soup, line 44
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('gpt_researcher.scraper.beautiful_soup.beautiful_soup')
    & errors_df['line_number'].eq('44'),
    'data',
].unique()

array(["Error! : HTTPSConnectionPool(host='www.dp.ru', port=443): Read timed out. (read timeout=4)",
       "Error! : HTTPSConnectionPool(host='kafanews.com', port=443): Read timed out. (read timeout=4)",
       'Error! : HTTPSConnectionPool(host=\'vestnik-rm.ru\', port=443): Max retries exceeded with url: /articles/kalejdoskop/german-gref-istoriya-uspeha (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002B382BC0C50>: Failed to resolve \'vestnik-rm.ru\' ([Errno 11001] getaddrinfo failed)"))',
       "Error! : HTTPSConnectionPool(host='vesti42.ru', port=443): Read timed out. (read timeout=4)",
       "Error! : HTTPSConnectionPool(host='www.interfax-russia.ru', port=443): Max retries exceeded with url: /academia/ru/news/articles/15561/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))",
       "Error! : HTTPSConnectionPool(host='omskgazzet

### weasyprint 419

In [9]:
# Distinct ERROR messages from weasyprint, line 419
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('weasyprint')
    & errors_df['line_number'].eq('419'),
    'data',
].unique()

array(['No anchor #sectorexpansionanddecline for internal URI reference',
       'No anchor #short-medium-and-long-term-goals for internal URI reference',
       'No anchor #историко-демографические-факторы-влияния-на-рождаемость-в-послевоенной-россии-1950-1970 for internal URI reference',
       'No anchor #современные-тенденции-и-вызовы-рождаемости-в-руссии-2015-2025 for internal URI reference',
       'No anchor #сравнительный-анализ-факторов-влияющих-на-рождаемость-в-исследуемые-периоды for internal URI reference',
       'No anchor #evolution-of-war-and-conflict-nature for internal URI reference',
       'No anchor #main-scenarios-of-battle-development for internal URI reference',
       'No anchor #military-modernization-directions for internal URI reference',
       'No anchor #formation-and-development-of-groups for internal URI reference',
       'No anchor #leadership-and-group-structure for internal URI reference',
       'No anchor #communication-and-coherence-in-groups for

### gpt_researcher.actions.agent_creator 60

In [10]:
# Distinct ERROR messages from gpt_researcher.actions.agent_creator, line 60
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('gpt_researcher.actions.agent_creator')
    & errors_df['line_number'].eq('60'),
    'data',
].unique()

array(["Error in choose_agent: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')"],
      dtype=object)

### gpt_researcher.actions.agent_creator 70

In [11]:
# Distinct ERROR messages from gpt_researcher.actions.agent_creator, line 70
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('gpt_researcher.actions.agent_creator')
    & errors_df['line_number'].eq('70'),
    'data',
].unique()

array(["Error in reading JSON and failed to repair with json_repair: 'str' object has no attribute 'get'",
       "Error in reading JSON and failed to repair with json_repair: 'list' object has no attribute 'get'"],
      dtype=object)

### gpt_researcher.skills.researcher 324

In [12]:
# Distinct ERROR messages from gpt_researcher.skills.researcher, line 324
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('gpt_researcher.skills.researcher')
    & errors_df['line_number'].eq('324'),
    'data',
].unique()

array(["🤷 No content found for 'Проведи исследование и мотивируй схожие черты и различия личностных черт и профессиональных качеств Шувалова Игоря Ивановича - руководителя ВЭБ и Грефа Германа Оскарвича - руководителя Сбера.'...",
       "🤷 No content found for 'Число рожденных близнецов в России за последние годы'...",
       "🤷 No content found for 'новый продукт компании Джона Айва IO описание функции'...",
       "🤷 No content found for 'конкурентоспособность нового гаджета IO от John Ive'...",
       "🤷 No content found for 'Потенциальные экономические сложности и риски в Казахстане 2025 год'...",
       "🤷 No content found for 'Рекомендации реформ Казахстан правительство 2025'...",
       "🤷 No content found for 'Основные экономические риски Казахстана 2025-2030'...",
       "🤷 No content found for 'Макроэкономическая ситуация в Казахстане июнь 2025'...",
       '🤷 No content found for \'нейросети "Software 2.0" сравнение с традиционным подходом\'...',
       "🤷 No content found f

### gpt_researcher.scraper.pymupdf.pymupdf 79

In [13]:
# Distinct ERROR messages from gpt_researcher.scraper.pymupdf.pymupdf, line 79
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('gpt_researcher.scraper.pymupdf.pymupdf')
    & errors_df['line_number'].eq('79'),
    'data',
].unique()

array(["Error loading PDF : https://veorus.ru/upload/iblock/85f/196_veor_new.pdf HTTPSConnectionPool(host='veorus.ru', port=443): Max retries exceeded with url: /upload/iblock/85f/196_veor_new.pdf (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))",
       'Error loading PDF : https://vestnik-bist.ru/wp-content/uploads/2023/07/VestnikBIST_2-2023_site.pdf 403 Client Error: Forbidden for url: https://vestnik-bist.ru/wp-content/uploads/2023/07/VestnikBIST_2-2023_site.pdf',
       "Error loading PDF : https://eabr.org/upload/iblock/c55/EDB_Macroreview_2025_2027_RU.pdf HTTPSConnectionPool(host='eabr.org', port=443): Max retries exceeded with url: /upload/iblock/c55/EDB_Macroreview_2025_2027_RU.pdf (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))",
       "Error l

### gpt_researcher.actions.agent_creator 82

In [14]:
# Distinct ERROR messages from gpt_researcher.actions.agent_creator, line 82
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('gpt_researcher.actions.agent_creator')
    & errors_df['line_number'].eq('82'),
    'data',
].unique()

array(['No JSON found in the string. Falling back to Default Agent.'],
      dtype=object)

### gpt_researcher.scraper.pymupdf.pymupdf 76

In [15]:
# Distinct ERROR messages from gpt_researcher.scraper.pymupdf.pymupdf, line 76
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('gpt_researcher.scraper.pymupdf.pymupdf')
    & errors_df['line_number'].eq('76'),
    'data',
].unique()

array(['Download timed out. Please check the link : https://elib.pnzgu.ru/files/eb/doc/2HIKmJCDlo52.pdf',
       'Download timed out. Please check the link : http://www.psu.ru/files/docs/science/books/sborniki/Iskusstvennyj-intellekt-v-reshenii-aktualnyh-socialnyh-i-ekonomicheskih-problem-XXI-veka-ch-1.pdf',
       'Download timed out. Please check the link : https://digital.gov.ru/uploaded/files/primeryi-primeneniya-tehnologij-iskusstvennogo-intellekta.pdf',
       'Download timed out. Please check the link : http://static.government.ru/media/files/41d457592e04b76338b7.pdf',
       'Download timed out. Please check the link : https://www.economy.gov.ru/material/file/b028b88a60e6ddf67e9fe9c07c4951f0/prognoz_socialno_ekonomicheskogo_razvitiya_rf_2025-2027.pdf',
       'Download timed out. Please check the link : https://www.economy.gov.ru/material/file/4c631ab1a829dc1a17ab40db24d52d3a/strategiya_fonda_sodeystviya_innovaciyam_do_2024.pdf',
       'Download timed out. Please check the lin

### gpt_researcher.skills.researcher 327

In [16]:
# Distinct ERROR messages from gpt_researcher.skills.researcher, line 327
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('gpt_researcher.skills.researcher')
    & errors_df['line_number'].eq('327'),
    'data',
].unique()

array(['Error processing sub-query Анализ тенденций изменения характера военных конфликтов 2025-2030: The read operation timed out\nTraceback (most recent call last):\nFile "C:\\Users\\Anastasia\\projects\\gpt-researcher-eval\\.venv\\Lib\\site-packages\\httpx\\_transports\\default.py", line 101, in map_httpcore_exceptions\nyield\nFile "C:\\Users\\Anastasia\\projects\\gpt-researcher-eval\\.venv\\Lib\\site-packages\\httpx\\_transports\\default.py", line 127, in __iter__\nfor part in self._httpcore_stream:\nFile "C:\\Users\\Anastasia\\projects\\gpt-researcher-eval\\.venv\\Lib\\site-packages\\httpcore\\_sync\\connection_pool.py", line 407, in __iter__\nraise exc from None\nFile "C:\\Users\\Anastasia\\projects\\gpt-researcher-eval\\.venv\\Lib\\site-packages\\httpcore\\_sync\\connection_pool.py", line 403, in __iter__\nfor part in self._stream:\nFile "C:\\Users\\Anastasia\\projects\\gpt-researcher-eval\\.venv\\Lib\\site-packages\\httpcore\\_sync\\http11.py", line 342, in __iter__\nraise exc\

### gpt_researcher.retrievers.search_plugin.search_plugin 88

In [17]:
# Distinct ERROR messages from gpt_researcher.retrievers.search_plugin.search_plugin, line 88
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('gpt_researcher.retrievers.search_plugin.search_plugin')
    & errors_df['line_number'].eq('88'),
    'data',
].unique()

array(['HTTPStatusError("Server error \'502 Bad Gateway\' for url \'https://gigachat.dev.app.sberdevices.ru/retrieval_proxy\'\\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/502")'],
      dtype=object)

### gpt_researcher.utils.llm 105

In [18]:
# Distinct ERROR messages from gpt_researcher.utils.llm, line 105
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('gpt_researcher.utils.llm')
    & errors_df['line_number'].eq('105'),
    'data',
].unique()

array(['Attempt 1 failed:',
       'Attempt 1 failed: peer closed connection without sending complete message body (incomplete chunked read)',
       'Attempt 1 failed: Server disconnected without sending a response.'],
      dtype=object)

### gpt_researcher.actions.agent_creator 80

In [19]:
# Distinct ERROR messages from gpt_researcher.actions.agent_creator, line 80
# (line_number is stored as a string, hence the quoted value).
errors_df.loc[
    errors_df['file_path'].eq('gpt_researcher.actions.agent_creator')
    & errors_df['line_number'].eq('80'),
    'data',
].unique()

array(['Error decoding JSON: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)'],
      dtype=object)

## Warnings

In [20]:
# Keep only the entries logged at WARNING level.
warnings_df = df.loc[df["log_level"].eq("WARNING")]
warnings_df

Unnamed: 0,timestamp,file_path,line_number,module_name,log_level,data
72,"2025-06-16 16:23:47,053",gpt_researcher.scraper.scraper,117,scraper,WARNING,Content too short or empty for https://mgimo.r...
89,"2025-06-16 16:23:47,679",gpt_researcher.scraper.scraper,117,scraper,WARNING,Content too short or empty for https://veorus....
90,"2025-06-16 16:23:47,679",gpt_researcher.scraper.scraper,117,scraper,WARNING,Content too short or empty for https://xn--90a...
214,"2025-06-16 16:24:00,052",gpt_researcher.scraper.scraper,117,scraper,WARNING,Content too short or empty for https://www.hse...
223,"2025-06-16 16:24:08,136",gpt_researcher.scraper.scraper,117,scraper,WARNING,Content too short or empty for https://www.rsh...
...,...,...,...,...,...,...
43423,"2025-06-16 19:57:13,560",gpt_researcher.scraper.scraper,117,scraper,WARNING,Content too short or empty for https://www.edu...
43436,"2025-06-16 19:57:13,925",gpt_researcher.scraper.scraper,117,scraper,WARNING,Content too short or empty for https://www.gos...
43437,"2025-06-16 19:57:13,941",gpt_researcher.scraper.scraper,117,scraper,WARNING,Content too short or empty for https://tusp24....
43472,"2025-06-16 19:57:14,926",gpt_researcher.scraper.scraper,117,scraper,WARNING,Content too short or empty for https://ir.alfa...


In [21]:
# WARNING entries counted per emitting module (full dotted path).
warnings_df.loc[:, "file_path"].value_counts()

file_path
gpt_researcher.scraper.scraper             382
langchain_gigachat.chat_models.gigachat      2
Name: count, dtype: int64

In [22]:
# WARNING entries counted per (module, source line) pair.
warnings_df.value_counts(subset=["file_path", "line_number"])

file_path                                line_number
gpt_researcher.scraper.scraper           117            382
langchain_gigachat.chat_models.gigachat  518              2
Name: count, dtype: int64

### gpt_researcher.scraper.scraper 117

In [23]:
# Distinct WARNING messages from gpt_researcher.scraper.scraper, line 117
# (line_number is stored as a string, hence the quoted value).
warnings_df.loc[
    warnings_df['file_path'].eq('gpt_researcher.scraper.scraper')
    & warnings_df['line_number'].eq('117'),
    'data',
].unique()

array(['Content too short or empty for https://mgimo.ru/upload/2023/09/MJ_01_2023.pdf',
       'Content too short or empty for https://veorus.ru/upload/iblock/85f/196_veor_new.pdf',
       'Content too short or empty for https://xn--90ab5f.xn--p1ai/common/upload/files/veb/news/review/op20130401.pdf',
       'Content too short or empty for https://www.hse.ru/data/2025/02/13/1465535857/%D0%91%D0%9E%D0%92%2042%20%D0%98%D1%82%D0%BE%D0%B3%D0%BE%D0%B2%D1%8B%D0%B9.pdf',
       'Content too short or empty for https://www.rshu.ru/sveden/document/2024/prikaz_429_2023.pdf',
       'Content too short or empty for https://frankmedia.ru/8544',
       'Content too short or empty for https://interneturok.ru/lesson/istoriya/8-klass/rossiya-v-60-70-e-gg-aleksandr-ii/sotsialno-ekonomicheskoe-razvitie-rossii-v-1860-1870-e-gg',
       'Content too short or empty for https://mintrans.gov.ru/file/373254',
       'Content too short or empty for https://xn--90ab5f.xn--p1ai/common/upload/files/veb/news/review/2

### langchain_gigachat.chat_models.gigachat 518

In [24]:
# Distinct WARNING messages from langchain_gigachat.chat_models.gigachat, line 518
# (line_number is stored as a string, hence the quoted value).
warnings_df.loc[
    warnings_df['file_path'].eq('langchain_gigachat.chat_models.gigachat')
    & warnings_df['line_number'].eq('518'),
    'data',
].unique()

array(['Giga generation stopped with reason: length'], dtype=object)