In [109]:
import dlt
from dlt.sources.helpers import requests
import os
from dotenv import load_dotenv
import datetime
import duckdb
from preprocess_raw_html import preprocess_raw_html # created function
from openai import AzureOpenAI
import time
import pandas as pd
import jsonlines
import json
import re
import matplotlib.pyplot as plt
from matplotlib_venn import venn3,venn2,venn2_circles
import pyarrow as pa
import pyarrow.parquet as pq

# Read settings from the .env file; override=True is passed so values from the
# file take precedence over variables that are already set.
load_dotenv(override=True)

# Show every column when a wide DataFrame is rendered.
pd.options.display.max_columns = None
# Optional display tweaks, left disabled:
# pd.options.display.max_rows = None
# pd.options.display.max_colwidth = None

### Get the Philosophy Questions Stored in DuckDB

In [56]:
# Pipeline handle; below, only its name and dataset name are used to locate
# the DuckDB file and the schema to query.
pipeline = dlt.pipeline(
    destination="duckdb",
    dataset_name="philosophy_questions",
    pipeline_name="philosophy_questions_incremental",
)

# Connect to the database file named after the pipeline.
questions_db_file = "{}.duckdb".format(pipeline.pipeline_name)
conn = duckdb.connect(questions_db_file)

# Point the search path at the pipeline's dataset, then list its tables.
conn.sql("SET search_path = '{}'".format(pipeline.dataset_name))
print('Loaded tables: ')
display(conn.sql("show tables"))

Loaded tables: 


┌─────────────────────────────────────────────────────────────────────────────┐
│                                    name                                     │
│                                   varchar                                   │
├─────────────────────────────────────────────────────────────────────────────┤
│ _dlt_loads                                                                  │
│ _dlt_pipeline_state                                                         │
│ _dlt_version                                                                │
│ philosophy_questions                                                        │
│ philosophy_questions__items                                                 │
│ philosophy_questions__items__migrated_from__other_site__aliases             │
│ philosophy_questions__items__migrated_from__other_site__markdown_extensions │
│ philosophy_questions__items__migrated_from__other_site__related_sites       │
│ philosophy_questions__items__tags     

In [57]:
# Pull the whole question-items table into a pandas DataFrame and preview it.
questions_query = "SELECT * FROM philosophy_questions__items"
questions = conn.sql(questions_query).df()
questions.head()

Unnamed: 0,owner__account_id,owner__reputation,owner__user_id,owner__user_type,owner__profile_image,owner__display_name,owner__link,is_answered,view_count,answer_count,score,last_activity_date,creation_date,last_edit_date,question_id,content_license,link,title,body,_dlt_parent_id,_dlt_list_idx,_dlt_id,closed_date,closed_reason,owner__accept_rate,accepted_answer_id,locked_date,protected_date,migrated_from__other_site__styling__tag_background_color,migrated_from__other_site__styling__tag_foreground_color,migrated_from__other_site__styling__link_color,migrated_from__other_site__launch_date,migrated_from__other_site__open_beta_date,migrated_from__other_site__closed_beta_date,migrated_from__other_site__site_state,migrated_from__other_site__high_resolution_icon_url,migrated_from__other_site__favicon_url,migrated_from__other_site__icon_url,migrated_from__other_site__audience,migrated_from__other_site__site_url,migrated_from__other_site__api_site_parameter,migrated_from__other_site__logo_url,migrated_from__other_site__name,migrated_from__other_site__site_type,migrated_from__on_date,migrated_from__question_id,community_owned_date
0,16491829.0,18695.0,40843.0,registered,https://graph.facebook.com/10157462114367910/p...,Kristian Berry,https://philosophy.stackexchange.com/users/408...,True,23,1,0,1725632518,1725626518,1725633000.0,116884,CC BY-SA 4.0,https://philosophy.stackexchange.com/questions...,Is a stable quantifier-free language really po...,<p>I'm reading the yesterday-updated SEP entry...,zcTBbU9rMWMC0Q,0,MsT4h//bUH+5vQ,,,,,,,,,,,,,,,,,,,,,,,,,
1,29736116.0,2914.0,68482.0,registered,https://i.sstatic.net/yOM8W.jpg?s=256,Meanach,https://philosophy.stackexchange.com/users/684...,True,103,4,1,1725631539,1725462078,,116834,CC BY-SA 4.0,https://philosophy.stackexchange.com/questions...,Churchill said that democracy is the worst for...,<p>I was intrigued by a recent question about ...,zcTBbU9rMWMC0Q,1,rrvOKbIJX0vckA,,,,,,,,,,,,,,,,,,,,,,,,,
2,5253885.0,335.0,47770.0,registered,https://www.gravatar.com/avatar/c2570aa785c372...,user,https://philosophy.stackexchange.com/users/477...,False,149,2,1,1725629098,1693675067,1700492000.0,102488,CC BY-SA 4.0,https://philosophy.stackexchange.com/questions...,"If someone lacks the will to live, can there s...","<p>After reading some posts such as <a href=""h...",zcTBbU9rMWMC0Q,2,B1GJvYnou9fkZA,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,does_not_exist,,user62907,,True,344,8,3,1725626506,1665130403,1725524000.0,94063,CC BY-SA 4.0,https://philosophy.stackexchange.com/questions...,Can we assign probabilities to God and is the ...,<p>Dawkins essentially argues that if one obse...,zcTBbU9rMWMC0Q,3,HUL5WSXsdzaGkQ,,,,,,,,,,,,,,,,,,,,,,,,,
4,19780752.0,1507.0,77058.0,registered,https://www.gravatar.com/avatar/dce3e1a595d736...,user77058,https://philosophy.stackexchange.com/users/770...,True,210,1,0,1725619849,1725455271,1725620000.0,116828,,https://philosophy.stackexchange.com/questions...,Can the strength of evidence for a proposition...,"<p>Questions such as <a href=""https://philosop...",zcTBbU9rMWMC0Q,4,Tn6khuidGV1ezw,1725523000.0,Duplicate,,,,,,,,,,,,,,,,,,,,,,,


In [58]:
# Summarise the questions frame: row count, column dtypes and non-null counts.
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22748 entries, 0 to 22747
Data columns (total 47 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   owner__account_id                                         20153 non-null  float64
 1   owner__reputation                                         20153 non-null  float64
 2   owner__user_id                                            20153 non-null  float64
 3   owner__user_type                                          22748 non-null  object 
 4   owner__profile_image                                      20153 non-null  object 
 5   owner__display_name                                       22748 non-null  object 
 6   owner__link                                               20153 non-null  object 
 7   is_answered                                               22748 non-null  bool   
 8   view_count      

##### Answered Philosophy Questions

In [59]:
# Keep only the rows whose is_answered flag equals True.
answered_questions = questions.loc[questions["is_answered"].eq(True)]

In [60]:
# Summarise the answered subset: row count, column dtypes and non-null counts.
answered_questions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19285 entries, 0 to 22747
Data columns (total 47 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   owner__account_id                                         17080 non-null  float64
 1   owner__reputation                                         17080 non-null  float64
 2   owner__user_id                                            17080 non-null  float64
 3   owner__user_type                                          19285 non-null  object 
 4   owner__profile_image                                      17080 non-null  object 
 5   owner__display_name                                       19285 non-null  object 
 6   owner__link                                               17080 non-null  object 
 7   is_answered                                               19285 non-null  bool   
 8   view_count           

In [61]:
# Reverse the row order of the answered questions. This is a positional flip,
# not a sort on any column.
# NOTE(review): the head() preview of `questions` shows last_activity_date in
# descending order while creation_date is not monotonic, so the flipped frame
# is presumably ascending by last_activity_date, not creation_date - confirm.
asked_questions = answered_questions[::-1]

In [62]:
# Restrict to the first 2000 rows (by position): the questions that are
# passed to the GPT35 model.
asked_questions = asked_questions.iloc[:2000]

In [63]:
# Read the API log and collect the positions (into asked_questions) of the
# requests recorded as successful. Each log line is expected to have the
# form "<position>-<status>".
successful_positions = []
with open('openai-gpt35-0125-log.txt', 'r') as f:
    for line in f:
        entry = line.strip()
        if not entry:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on tuple unpacking.
            continue
        # Split on the first hyphen only, so a status that itself contains
        # a hyphen does not break the parse.
        idx, separator, status = entry.partition("-")
        if not separator:
            raise ValueError(f"Malformed log line (expected '<index>-<status>'): {entry!r}")
        if status == "success":
            successful_positions.append(int(idx))

# Rows of asked_questions, selected by position, whose request succeeded.
# A separate name is used for the position list so that
# gpt_answered_questions is only ever a DataFrame.
gpt_answered_questions = asked_questions.iloc[successful_positions]

In [64]:
# Spot-check: print the preprocessed body of the question at position 394.
print(preprocess_raw_html(asked_questions['body'].iloc[394]))

I was reading Wikipedia through materialism and physicalism article and was unable to find, whether modern scientists largely subscribe to physicalism in natural sciences. What is the scientific consensus on the metaphysical stances? To particularize: Does physicalism holds for mathematics and what is the mathematicians' consensus? Do natural scientists subscribe to physicalism in their fields? Do social scientists subscribe to physicalism in their fields?


### Get the Azure AI GPT-35 (Model Version 0125) Response

In [65]:
# Load the stored model responses: one JSON document per line of the file.
with open("./gpt35_0125_philosophy_answers.jsonl", 'r') as f:
    decoded_lines = [json.loads(raw_line) for raw_line in f]

# A line may decode to a string that itself holds the JSON document
# (double-encoded); decode those a second time.
gpt_generated_ans = [
    json.loads(item) if isinstance(item, str) else item
    for item in decoded_lines
]

In [66]:
# Number of responses read from the JSONL file.
len(gpt_generated_ans)

1623

In [67]:
# Top-level keys of the first stored response.
gpt_generated_ans[0].keys()

dict_keys(['id', 'choices', 'created', 'model', 'object', 'service_tier', 'system_fingerprint', 'usage'])

In [68]:
# Display the first stored response in full to inspect its structure.
gpt_generated_ans[0]

{'id': 'chatcmpl-AAacygC0xlQIHzdZOcGCcnjiaUTLP',
 'choices': [{'finish_reason': 'stop',
   'index': 0,
   'logprobs': None,
   'message': {'content': 'The instrumentalist view of the universe is a philosophical perspective that holds that scientific theories should be seen as tools or instruments for making predictions and explaining observations, rather than as providing a true representation of reality. According to instrumentalism, scientific theories are judged solely by their predictive power and ability to account for empirical data, without making claims about the underlying reality that they describe.\n\nThere are several significant philosophical consequences that can arise from adopting an instrumentalist view of the universe:\n\n1. Epistemic humility: Instrumentalism can lead to an acknowledgment of the limitations of human knowledge and understanding. By viewing scientific theories as useful tools rather than definitive descriptions of reality, instrumentalism emphasizes th

In [69]:
# Keep only the trailing responses - as many as there are successfully
# answered questions - preserving their original order.
# NOTE(review): this assumes any surplus responses sit at the start of the
# file and do not correspond to the "success" log entries - confirm.
n_answered = len(gpt_answered_questions)
tail_start = max(len(gpt_generated_ans) - n_answered, 0)
gpt_generated_ans = gpt_generated_ans[tail_start:]

In [70]:
# Work on a copy of the answered-question rows and attach the responses,
# row by row, as the column gpt35_0125_ans.
df = gpt_answered_questions.copy()
df["gpt35_0125_ans"] = gpt_generated_ans

In [71]:
# Spot-check one pairing: the preprocessed question body at position 10,
# followed by the text of the response attached to that row.
sample_position = 10
sample_question = preprocess_raw_html(df["body"].iloc[sample_position])
sample_response = df["gpt35_0125_ans"].iloc[sample_position]
print(sample_question, end="\n\n")
print(sample_response["choices"][0]["message"]["content"])

Possible Duplicate: What does “to cause” mean?  Take an example, It is reported that short-circuit caused fire in city bakery. In this case short-circuit and fire was in a causality, short-circuit is cause and fire is effect. But what is causality? Is cause a necessary or sufficient condition of effect? In this case it seems short-circuit is neither necessary nor sufficient condition of make a fire. But it seems also short-circuit is not entirely irrelevant to the fire. So my question is what is the proper definition of Causality?

Causality is a fundamental concept in philosophy and science that refers to the relationship between cause and effect. It is the relationship between an event (the cause) and a second event (the effect), where the second event is understood to be a consequence of the first. 

In your specific example, the short-circuit is identified as the cause of the fire. The short-circuit is not a necessary condition for starting a fire in general, as fires can start fro

### Get the Philosophy Answers Based on the Asked Questions

In [72]:
# Question ids of the rows in df, as a plain Python list.
questions_id = df["question_id"].tolist()

In [73]:
@dlt.resource(table_name="philosophy_answers", write_disposition="append")
def get_philosophy_answers(
    question_id,
    site='philosophy',
    page=1,
    pagesize=100,
    tag=None,
):
    """Yield every page of answers for one Stack Exchange question.

    Args:
        question_id: Id of the question; interpolated into the request path.
        site: Value sent as the ``site`` query parameter.
        page: First page to request; incremented by one after each page.
        pagesize: Value sent as the ``pagesize`` query parameter.
        tag: Not used by this function and not sent to the API; kept so the
            signature stays compatible with existing callers.

    Yields:
        The decoded JSON body of each page response.

    Raises:
        Whatever ``response.raise_for_status()`` raises when a request
        returns an HTTP error status.
    """
    url = f"https://api.stackexchange.com/2.3/questions/{question_id}/answers"
    params = {
        "site": site,
        "page": page,
        "pagesize": pagesize,
        "key": os.getenv("SE_API_KEY"),
        "filter": "withbody",
    }

    while True:
        response = requests.get(url, params=params)
        response.raise_for_status()

        # Decode the body once and read the paging fields before yielding,
        # so the loop does not depend on what the consumer does with the
        # yielded object.
        payload = response.json()
        has_more = payload.get("has_more", False)
        backoff_seconds = payload.get("backoff")
        yield payload

        # A missing has_more is treated as "no further pages" rather than
        # raising KeyError.
        if not has_more:
            break

        # The Stack Exchange API may ask clients to wait `backoff` seconds
        # before calling the same method again; honour it between pages.
        if backoff_seconds:
            time.sleep(backoff_seconds)

        params["page"] += 1


In [74]:
# Pipeline for the answers dataset. Note that this rebinds `pipeline`, which
# earlier referred to the questions pipeline.
pipeline = dlt.pipeline(
    pipeline_name="get_philosophy_answers_incremental",
    destination="duckdb",
    dataset_name="philosophy_answers",
)

# The load step requests answers from the API once per question id, and the
# resource is declared with write_disposition="append", so every run adds the
# fetched rows again. It is therefore opt-in: leave the flag False to reuse
# the existing DuckDB file, and set it to True only to (re)load the answers.
RELOAD_ANSWERS = False

if RELOAD_ANSWERS:
    for qid in questions_id:
        load_info = pipeline.run(get_philosophy_answers(qid))
        row_counts = pipeline.last_trace.last_normalize_info
        print(row_counts)
        print("------")
        print(load_info)

In [75]:
# Rebind `conn` to the database file named after the current pipeline.
answers_db_file = "{}.duckdb".format(pipeline.pipeline_name)
conn = duckdb.connect(answers_db_file)

# Point the search path at the pipeline's dataset, then list its tables.
conn.sql("SET search_path = '{}'".format(pipeline.dataset_name))
print('Loaded tables: ')
display(conn.sql("show tables"))

Loaded tables: 


┌───────────────────────────┐
│           name            │
│          varchar          │
├───────────────────────────┤
│ _dlt_loads                │
│ _dlt_pipeline_state       │
│ _dlt_version              │
│ philosophy_answers        │
│ philosophy_answers__items │
└───────────────────────────┘

In [76]:
# Pull the whole answer-items table into a pandas DataFrame and preview it.
answers_query = "SELECT * FROM philosophy_answers__items"
answers = conn.sql(answers_query).df()
answers.head()

Unnamed: 0,owner__account_id,owner__reputation,owner__user_id,owner__user_type,owner__accept_rate,owner__profile_image,owner__display_name,owner__link,is_accepted,score,last_activity_date,creation_date,answer_id,question_id,content_license,body,_dlt_parent_id,_dlt_list_idx,_dlt_id,last_edit_date,community_owned_date
0,89797.0,16136.0,233.0,registered,50.0,https://www.gravatar.com/avatar/3698349ad0b898...,Rex Kerr,https://philosophy.stackexchange.com/users/233...,True,9,1350840403,1350840403,4008,4007,CC BY-SA 3.0,<p>The <em>operational</em> consensus of physi...,15Ds9PswRVjGQA,0,oJ/3cLXokMvZYg,,
1,1866755.0,51.0,2417.0,unregistered,,https://www.gravatar.com/avatar/b8ef13d2f91f83...,Nigel Redding,https://philosophy.stackexchange.com/users/241...,False,2,1352158249,1352158249,4141,4140,CC BY-SA 3.0,<p>Aristotle discusses Ethics in his Nicomache...,S+voxsUTYselhg,0,RD2skrHW1LQiZQ,,
2,947951.0,405.0,2429.0,registered,,https://i.sstatic.net/CtXUo.jpg?s=256,Be Brave Be Like Ukraine,https://philosophy.stackexchange.com/users/242...,True,3,1352335268,1352335268,4151,4149,CC BY-SA 3.0,"<p><a href=""http://www.freedominion.ca/phpBB2/...",jEuFb4GgOuRecQ,0,hfvU+3DlBY1E1A,,
3,947951.0,405.0,2429.0,registered,,https://i.sstatic.net/CtXUo.jpg?s=256,Be Brave Be Like Ukraine,https://philosophy.stackexchange.com/users/242...,False,1,1351712605,1351712605,4107,4000,CC BY-SA 3.0,<pre><code>∀x ∈ Man : x ∈ People\n∃x ∈ Man : x...,u3aKRAX9UVRX5g,0,9tpjJW1GBIeJyA,,
4,23411.0,204.0,2571.0,registered,,https://www.gravatar.com/avatar/9a2f20624f0fcb...,Tyler,https://philosophy.stackexchange.com/users/257...,True,3,1350718288,1350718288,4001,4000,CC BY-SA 3.0,<pre><code>∀x (Man(x) → Person(x))\n∃x (Man(x)...,u3aKRAX9UVRX5g,1,3FsSJBuRIpnxhw,,


In [77]:
# Summarise the answers frame: row count, column dtypes and non-null counts.
answers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2275 entries, 0 to 2274
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   owner__account_id     2147 non-null   float64
 1   owner__reputation     2147 non-null   float64
 2   owner__user_id        2147 non-null   float64
 3   owner__user_type      2275 non-null   object 
 4   owner__accept_rate    667 non-null    float64
 5   owner__profile_image  2147 non-null   object 
 6   owner__display_name   2275 non-null   object 
 7   owner__link           2147 non-null   object 
 8   is_accepted           2275 non-null   bool   
 9   score                 2275 non-null   int64  
 10  last_activity_date    2275 non-null   int64  
 11  creation_date         2275 non-null   int64  
 12  answer_id             2275 non-null   int64  
 13  question_id           2275 non-null   int64  
 14  content_license       2275 non-null   object 
 15  body                 