In [2]:
from google.cloud import bigquery as bq
from google.cloud import bigquery_storage as bq_storage
import plotly.express as px
import pandas as pd
import numpy as np

# create clients
client = bq.Client.from_service_account_json("key.json")
storage_client = bq_storage.BigQueryReadClient.from_service_account_json("key.json")

bq_dataset_ref = client.dataset("stackoverflow", project="bigquery-public-data")
bq_dataset = client.get_dataset(bq_dataset_ref)

In [6]:
#GATHER COUNTS

quest_query = '''
    select count(distinct questions.owner_user_id) as askers
    from bigquery-public-data.stackoverflow.posts_questions as questions
    left join `bigquery-public-data.stackoverflow.posts_answers` as answers
    on answers.owner_user_id = questions.owner_user_id
    where answers.owner_user_id is null
    '''

answer_query = '''
    select count(distinct answers.owner_user_id) as responder
    from bigquery-public-data.stackoverflow.posts_answers as answers
    left join bigquery-public-data.stackoverflow.posts_questions as questions
    on answers.owner_user_id = questions.owner_user_id
    where questions.owner_user_id is null
    '''

both_query = '''
    select count(distinct questions.owner_user_id) as responder
    from bigquery-public-data.stackoverflow.posts_answers as answers
    inner join bigquery-public-data.stackoverflow.posts_questions as questions
    on answers.owner_user_id = questions.owner_user_id
    '''

lazy_query = '''
    with posts as (
        select distinct owner_user_id from bigquery-public-data.stackoverflow.posts_answers
        union all
        select distinct owner_user_id from bigquery-public-data.stackoverflow.posts_questions
    )
    select count(user.id) as lazy
    from bigquery-public-data.stackoverflow.users as user 
    left join posts
    on posts.owner_user_id = user.id
    where posts.owner_user_id is null
    '''

askers = (
    client.query(quest_query)
    .result()
    .to_dataframe(bqstorage_client=storage_client)
    .iat[0,0]
)

responder = (
    client.query(answer_query)
    .result()
    .to_dataframe(bqstorage_client=storage_client)
    .iat[0,0]
)

lazy = (
    client.query(lazy_query)
    .result()
    .to_dataframe(bqstorage_client=storage_client)
    .iat[0,0]
)

both = (
    client.query(both_query)
    .result()
    .to_dataframe(bqstorage_client=storage_client)
    .iat[0,0]
)



In [30]:
total = askers + responder + lazy + both
ask_perc = askers / total * 100
resp_perc = responder / total * 100
both_perc = both / total * 100
lazy_perc = lazy / total * 100

ask_perc, resp_perc, both_perc, lazy_perc

fig = px.bar(
    x=["askers", "responders", "both", "lazy"], 
    y=[ask_perc, resp_perc, both_perc, lazy_perc],
    title="Users Categories on Stack Overflow",
    labels={"x":"User Categories", "y":"% of users"},
    category_orders={"x":["lazy", "askers",  "responders", "both"]},
    color=["#ef553b", "#636efa", "#ffa15a", "#00cc96"],
    color_discrete_map="identity"
)

fig.update_yaxes(tickprefix="%", showgrid=True)

fig.add_annotation(
    text=">60% inactive users!", x="lazy", y=65, arrowhead=1, showarrow=True
)

fig.show()

print('As the chart states, more than the 80% of the user does not contribute to the community.\n How can the community survive with such ratio of inactive users?')

As the chart states, more than the 80% of the user does not contribute to the community.
 How can the community survive with such ratio of inactive users?


In [37]:
## Number of answers posted for each active user

"""
Considering 'responders' and 'both' as active users, 
We can retrieve the mean number of answers that such users post to keep the communtity alive.
We espect the mean number of answers to be relatively high, to compensate inactive users presence.
"""

questions_query = '''
    select count(*) from bigquery-public-data.stackoverflow.posts_questions
'''

answers_query = '''
    select count(*) from bigquery-public-data.stackoverflow.posts_answers
'''

questions_num = (
    client.query(questions_query)
    .result()
    .to_dataframe(bqstorage_client=storage_client)
    .iat[0,0]
)

answers_num = (
    client.query(answers_query)
    .result()
    .to_dataframe(bqstorage_client=storage_client)
    .iat[0,0]
)

quest_per_inactive_user = questions_num / (askers+both)
answ_per_active_user = answers_num / (responder+both)

"""
Here is why. Users that are likely to answer questions tends to respond to a lot of questions, 
relatively to the number of questions made by each users, which is significantly low.
"""

print("Mean questions made by 'askers':", quest_per_inactive_user)
print("Mean answers made by 'answerers':", answ_per_active_user)


px.bar(y=[quest_per_inactive_user, answ_per_active_user], 
             x=["Questioners (N° of questions)","Contributors (N° of answers)"],
             title="How does stackoverflow survive?",
             labels={"x":"User type", "y":"Mean Posts"},
             color=['#ef553b','#636efa'],
             color_discrete_map="identity"
            )

Mean questions made by 'askers': 5.081663323674332
Mean answers made by 'answerers': 12.24306262216082
