In [2]:
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from scipy import optimize, stats
import pandas as pd
import sympy as smp
import requests
from io import StringIO
from IPython.display import display, Math, Markdown, HTML
pd.set_option('display.max_colwidth', None)

# Gather OP data related to Cleo

## Import Data on the Users Whose Questions Cleo Answered (via the Stack Exchange API)

In [2]:
# Fetch the first page (up to 100 items) of answers posted by user 97378 (Cleo).
cleo_json = requests.get("https://api.stackexchange.com/2.2/users/97378/answers?pagesize=100&site=mathematics")
# Fail here with a clear HTTP error (e.g. when throttled) instead of later with
# an opaque failure on the missing "items" field.
cleo_json.raise_for_status()
cleo_answers = pd.read_json(StringIO(cleo_json.text))
# Flatten the nested answer records; nested keys become dotted column names
# such as 'owner.display_name'.
cleo_answers = pd.json_normalize(cleo_answers.loc[:, "items"])

# Look up the questions those answers belong to in a single batched request.
# NOTE(review): the API limits how many ';'-separated ids one request may carry
# (100 per its documentation) - confirm if the answer count ever grows past that.
q_ids = ";".join(cleo_answers["question_id"].astype(str))
questions = requests.get(f"https://api.stackexchange.com/2.2/questions/{q_ids}?pagesize=100&site=mathematics")
questions.raise_for_status()
questions_t = pd.read_json(StringIO(questions.text))
df_questions = pd.json_normalize(questions_t.loc[:, "items"])

## Clean OP Data

In [22]:
# Map the flattened API column names to the labels used in the rest of the notebook.
op_columns = {
    'owner.display_name': 'Username',
    'owner.reputation': 'Reputation',
    'owner.user_id': 'ID',
    'owner.link': 'Profile Link',
}

# One row per distinct question owner, highest reputation first.
df_OPall = (
    df_questions[list(op_columns)]
    .drop_duplicates()
    .sort_values('owner.reputation', ascending=False)
    .rename(columns=op_columns)
    .reset_index(drop=True)
)

# Append Cleo's own profile as the last row, read from the first answer record.
cleo_list = [cleo_answers[field][0] for field in op_columns]
df_OPall.loc[len(df_OPall)] = cleo_list

# Import data function

In [5]:
def get_data_by_user_id(ID, type1, type2):
    """Download every page of a Stack Exchange API listing for one id.

    Requests ``https://api.stackexchange.com/2.3/{type1}/{ID}/{type2}`` for the
    ``math`` site, 100 items per page, sorted by activity (descending), and
    keeps requesting the next page while the response reports ``has_more``.

    Parameters
    ----------
    ID : int or str
        Id interpolated into the URL path.
    type1 : str
        First path segment, e.g. ``'users'``.
    type2 : str
        Trailing path segment, e.g. ``'questions'`` or ``'answers'``.

    Returns
    -------
    list
        The concatenated ``items`` of every page fetched. If a response has no
        ``items`` field, a ``RuntimeWarning`` is emitted and the items
        collected so far are returned, so the result may then be incomplete.
    """
    import time
    import warnings

    url = f"https://api.stackexchange.com/2.3/{type1}/{ID}/" + type2
    data = []
    page = 1
    has_more = True

    while has_more:
        params = {
            "order": "desc",
            "sort": "activity",
            "site": "math",
            "page": page,
            "pagesize": 100,
        }
        payload = requests.get(url, params=params).json()

        if "items" not in payload:
            # Presumably an error/throttle response; make the truncation
            # visible instead of silently returning partial data.
            warnings.warn(
                f"Stack Exchange API returned no 'items' for {url} (page {page}): "
                f"{payload.get('error_message', payload)}; results may be incomplete",
                RuntimeWarning,
                stacklevel=2,
            )
            break

        data.extend(payload["items"])
        # A missing 'has_more' is treated as "no further pages".
        has_more = payload.get("has_more", False)
        page += 1

        # The API may ask clients to pause (seconds) before repeating the call.
        backoff = payload.get("backoff")
        if has_more and backoff:
            time.sleep(backoff)

    return data

def get_data(ID, type1, type2):
    """Collect the API items for several ids into one flat list.

    Calls ``get_data_by_user_id(one_id, type1, type2)`` once for each element
    of ``ID``, in order, and concatenates the returned lists.

    Parameters
    ----------
    ID : iterable
        Ids to query, one request series per element.
    type1, type2 : str
        Passed through unchanged to ``get_data_by_user_id``.

    Returns
    -------
    list
        All items for the first id, followed by those for the second, etc.
    """
    return [
        item
        for one_id in ID
        for item in get_data_by_user_id(one_id, type1, type2)
    ]

# Questions and Answers

## OP Question Data

In [6]:
# Download the questions posted by every user id listed in df_OPall
# (one paginated API query per id), then load the raw records into a frame.
OPquestions = get_data(df_OPall['ID'].to_list(), 'users', 'questions')
df_OPquestions = pd.DataFrame(data=OPquestions)

In [7]:
# Rebuild from the raw API records on every run. This cell removes the nested
# 'owner' column from its result, so deriving from df_OPquestions itself would
# make a second execution fail with a KeyError.
questions_raw = pd.DataFrame(OPquestions)

df_OPquestions = (
    questions_raw
    # Extract display_name and user_id from the nested 'owner' records;
    # anything that is not a dict yields None.
    .assign(
        owner_display_name=lambda d: d['owner'].apply(
            lambda x: x.get('display_name') if isinstance(x, dict) else None
        ),
        owner_user_id=lambda d: d['owner'].apply(
            lambda x: x.get('user_id') if isinstance(x, dict) else None
        ),
    )
    # Keep only the columns used downstream (this also discards 'owner').
    .loc[:, ['owner_display_name', 'owner_user_id', 'title', 'creation_date', 'link']]
    # Sort while 'creation_date' is still numeric: per user, newest first.
    .sort_values(['owner_display_name', 'creation_date'], ascending=[True, False])
    .rename(columns={
        'owner_display_name': 'Username',
        'creation_date': 'Post Date',
        'owner_user_id': 'ID',
        'title': 'Post Title',
        'link': 'Post Link',
    })
)

# Convert Unix timestamps (seconds) to formatted date strings
df_OPquestions['Post Date'] = pd.to_datetime(df_OPquestions['Post Date'], unit='s').dt.strftime('%d-%m-%Y %H:%M')

## OP Answer Data

In [10]:
# Download the answers posted by every user id listed in df_OPall
# (one paginated API query per id), then load the raw records into a frame.
OPanswers = get_data(df_OPall['ID'].to_list(), 'users', 'answers')
df_OPanswers_tmp = pd.DataFrame(data=OPanswers)

In [17]:
# Pull the owner's display name and user id out of the nested 'owner' records.
# NOTE: df_OPanswers is an alias of df_OPanswers_tmp at this point, so the two
# new columns are written onto the raw frame as well.
df_OPanswers = df_OPanswers_tmp
for target_col, owner_key in [('owner_display_name', 'display_name'), ('owner_user_id', 'user_id')]:
    df_OPanswers[target_col] = df_OPanswers['owner'].apply(
        lambda rec, owner_key=owner_key: rec.get(owner_key) if isinstance(rec, dict) else None
    )

# Keep the columns of interest, order each user's answers newest-first
# (while 'creation_date' is still numeric), then apply the display labels.
df_OPanswers = (
    df_OPanswers.loc[:, ['owner_display_name', 'owner_user_id', 'answer_id', 'creation_date', 'question_id']]
    .sort_values(by=['owner_display_name', 'creation_date'], ascending=[True, False])
    .rename(columns={
        'owner_display_name': 'Username',
        'creation_date': 'Post Date',
        'owner_user_id': 'ID',
    })
)

# Unix timestamps (seconds) -> formatted date strings
post_dates = pd.to_datetime(df_OPanswers['Post Date'], unit='s')
df_OPanswers['Post Date'] = post_dates.dt.strftime('%d-%m-%Y %H:%M')

## Cleo's Answers

In [23]:
# Questions Cleo answered: one row per question, ordered by question id.
# Built fresh from df_questions on every run so the cell can be re-executed.
df_CleoAnswers = (
    df_questions[['title', 'owner.display_name', 'creation_date', 'question_id']]
    .sort_values('question_id')
    .rename(columns={'owner.display_name': 'Username'})
)

# Cleo's answer dates, keyed by question id. Looking dates up by key (rather
# than pairing two sorted lists by position) keeps every date on the right
# question even if the two tables do not contain exactly the same questions.
# If a question id occurs more than once, the first row in df_OPanswers' current
# order is used. Questions with no matching answer row get NaN.
# NOTE(review): rows are selected by display name 'Cleo' - confirm no other
# user in df_OPanswers shares that name.
cleo_answer_dates = (
    df_OPanswers.loc[df_OPanswers['Username'] == 'Cleo']
    .drop_duplicates('question_id')
    .set_index('question_id')['Post Date']
)

# Column order is kept as: title, Username, question_id,
# 'Answer Post Date (Cleo)', 'Question Post Date'.
df_CleoAnswers['Answer Post Date (Cleo)'] = df_CleoAnswers['question_id'].map(cleo_answer_dates)
# df_questions' creation_date is the date the *question* was posted.
df_CleoAnswers['Question Post Date'] = pd.to_datetime(df_CleoAnswers['creation_date'], unit='s').dt.strftime('%d-%m-%Y %H:%M')
df_CleoAnswers = df_CleoAnswers.drop(columns=['creation_date'])

# Replace the numeric question id with a link to the question.
df_CleoAnswers['question_id'] = 'https://math.stackexchange.com/questions/' + df_CleoAnswers['question_id'].astype(str)


# Add a column to df_OPall with the number of each user's questions Cleo answered.
# Naming the counts explicitly avoids relying on the default 'count' name, and
# dropping any existing copy of the column keeps the cell safe to re-run.
# The merge is an inner join on 'Username', so users without a count are dropped.
times_cleo_answered = df_CleoAnswers.value_counts('Username').rename('Times Cleo Answered')
df_OPall = pd.merge(
    df_OPall.drop(columns=['Times Cleo Answered'], errors='ignore'),
    times_cleo_answered,
    on='Username',
)

In [24]:
df_OPall

Unnamed: 0,Username,Reputation,ID,Profile Link,Times Cleo Answered
0,Lucian,48582,93448,https://math.stackexchange.com/users/93448/lucian,1
1,Vladimir Reshetnikov,47382,19661,https://math.stackexchange.com/users/19661/vladimir-reshetnikov,4
2,Tunk-Fey,24979,123277,https://math.stackexchange.com/users/123277/tunk-fey,1
3,Anastasiya-Romanova 秀,19415,133248,https://math.stackexchange.com/users/133248/anastasiya-romanova-%e7%a7%80,1
4,Laila Podlesny,13245,76878,https://math.stackexchange.com/users/76878/laila-podlesny,6
5,user178256,5527,178256,https://math.stackexchange.com/users/178256/user178256,2
6,Oksana Gimmel,5342,75621,https://math.stackexchange.com/users/75621/oksana-gimmel,3
7,user1001001,5215,157130,https://math.stackexchange.com/users/157130/user1001001,1
8,Marty Colos,3330,77543,https://math.stackexchange.com/users/77543/marty-colos,2
9,Zakharia Stanley,2793,75613,https://math.stackexchange.com/users/75613/zakharia-stanley,1


# Exporting

In [25]:
# Write each table to a CSV file in the working directory, without the index column.
csv_exports = [
    ('df_OPanswers.csv', df_OPanswers),
    ('df_OPquestions.csv', df_OPquestions),
    ('df_OPall.csv', df_OPall),
    ('df_CleoAnswers.csv', df_CleoAnswers),
]
for csv_path, frame in csv_exports:
    frame.to_csv(csv_path, index=False)

## Cleo's Comments

In [4]:
# Fetch the first page (up to 100 items) of comments posted by user 97378 (Cleo).
# NOTE: this rebinds `cleo_json` and `cleo_answers`; from here on `cleo_answers`
# holds *comment* records, not answers. The names are kept because the cells
# below read `cleo_answers`.
cleo_json = requests.get("https://api.stackexchange.com/2.2/users/97378/comments?pagesize=100&site=mathematics")
# Fail here with a clear HTTP error (e.g. when throttled) instead of later with
# an opaque failure on the missing "items" field.
cleo_json.raise_for_status()
cleo_answers = pd.read_json(StringIO(cleo_json.text))
# Flatten the nested comment records into one row per comment.
cleo_answers = pd.json_normalize(cleo_answers.loc[:, "items"])

In [7]:
# Ids of Cleo's comments (`cleo_answers` holds comment records at this point)
cleo_answers.loc[:, 'comment_id'].tolist()

[1997423, 1922145, 1230173]

In [11]:
# Ids of the posts those comments were left on
cleo_answers.loc[:, 'post_id'].tolist()

[972413, 930011, 577107]