In [68]:
from google.cloud import bigquery as bq
from google.cloud import bigquery_storage as bq_storage
import plotly.express as px
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Create the BigQuery client and the BigQuery Storage read client from the
# same service-account key file.
client = bq.Client.from_service_account_json("key.json")
storage_client = bq_storage.BigQueryReadClient.from_service_account_json("key.json")

# Build the dataset reference directly: Client.dataset() is deprecated in
# google-cloud-bigquery in favour of the DatasetReference constructor.
bq_dataset_ref = bq.DatasetReference("bigquery-public-data", "stackoverflow")
bq_dataset = client.get_dataset(bq_dataset_ref)

In [23]:
# Row limit used by the disabled draft query below; the active query in this
# cell does not reference it.
# NOTE(review): the previous inline note read "100 k", but the value is
# 6,000,000 -- confirm which one is intended.
sample_size = 6_000_000

# Disabled draft: per-user question/answer counts, capped at sample_size rows.
# The same SELECT is embedded as the inner subquery of aggregate_query.
#
# question_answer_view = '''
#                 SELECT id AS user_id, reputation, asked, answered
#                         FROM `bigquery-public-data.stackoverflow.users` users
#                         LEFT JOIN(
#                             SELECT owner_user_id AS user_id, COUNT(*) AS asked
#                             FROM `bigquery-public-data.stackoverflow.posts_questions`
#                             GROUP BY user_id
#                         ) questions ON users.id = questions.user_id
#                         LEFT JOIN(
#                             SELECT owner_user_id AS user_id, COUNT(*) AS answered
#                             FROM `bigquery-public-data.stackoverflow.posts_answers`
#                             GROUP BY user_id
#                         ) answers ON users.id = answers.user_id
#                 limit ''' + str(sample_size)


# For every distinct reputation value: how many users have it, and how many
# questions / answers those users posted in total. The inner SELECT LEFT JOINs
# per-user question and answer counts onto the users table, so users without
# posts carry NULL counts.
aggregate_query = '''
            SELECT
                reputation AS reputation,
                COUNT(*) AS users,
                SUM(asked) AS questions,
                SUM(answered) AS answers
            FROM(
                SELECT id AS user_id, reputation, asked, answered
                            FROM `bigquery-public-data.stackoverflow.users` users
                            LEFT JOIN(
                                SELECT owner_user_id AS user_id, COUNT(*) AS asked
                                FROM `bigquery-public-data.stackoverflow.posts_questions`
                                GROUP BY user_id
                            ) questions ON users.id = questions.user_id
                            LEFT JOIN(
                                SELECT owner_user_id AS user_id, COUNT(*) AS answered
                                FROM `bigquery-public-data.stackoverflow.posts_answers`
                                GROUP BY user_id
                            ) answers ON users.id = answers.user_id
            )
 group by reputation
 '''


# Run the query and download the result through the BigQuery Storage client.
# fillna(0) returns a new frame, so it has to be part of the assignment:
# calling it on its own line only changed what the cell displayed and left the
# missing sums in `aggregate`.
aggregate = (
    client.query(aggregate_query)
    .result()
    .to_dataframe(bqstorage_client=storage_client)
    .fillna(0)
)

aggregate

Unnamed: 0,reputation,users,questions,answers
0,51,56579,116514.0,74880.0
1,769,331,5016.0,5067.0
2,126,3768,6586.0,20044.0
3,212,470,3092.0,4065.0
4,279,965,9681.0,5054.0
...,...,...,...,...
25291,4024,16,290.0,1005.0
25292,3568,16,280.0,1062.0
25293,4407,16,525.0,1428.0
25294,4241,16,750.0,1261.0


In [24]:

def reputationToCategory(x):
    """Map a numeric reputation score to its category label.

    Tiers (upper bounds inclusive):
        x <= 100             -> "Usurpers"
        100 < x <= 1000      -> "Slaves"
        1000 < x <= 10000    -> "Lords"
        10000 < x <= 100000  -> "Grandmasters"
        x > 100000           -> "Gods"

    The checks run from the top tier downwards so that only values strictly
    greater than 100000 are labelled "Gods". Anything that is not greater
    than 100 -- including values below 1 and NaN, for which every comparison
    is false -- lands in the lowest tier instead of falling through to the
    top one.
    """
    if x > 100000:
        return "Gods"
    if x > 10000:
        return "Grandmasters"
    if x > 1000:
        return "Lords"
    if x > 100:
        return "Slaves"
    return "Usurpers"
        
    

# Replace each numeric reputation with its category label. The column is
# overwritten in place, so only map while it is still numeric: re-running this
# cell would otherwise pass the labels themselves to reputationToCategory.
if pd.api.types.is_numeric_dtype(aggregate['reputation']):
    aggregate['reputation'] = aggregate['reputation'].map(reputationToCategory)
aggregate

Unnamed: 0,reputation,users,questions,answers
0,Usurpers,56579,116514.0,74880.0
1,Slaves,331,5016.0,5067.0
2,Slaves,3768,6586.0,20044.0
3,Slaves,470,3092.0,4065.0
4,Slaves,965,9681.0,5054.0
...,...,...,...,...
25291,Lords,16,290.0,1005.0
25292,Lords,16,280.0,1062.0
25293,Lords,16,525.0,1428.0
25294,Lords,16,750.0,1261.0


In [91]:
# Collapse the rows into one row per reputation category by summing the
# remaining columns, then write the grouped totals to disk.
df = aggregate.groupby(by=['reputation']).sum()

df.to_csv(path_or_buf='reputation.csv')

In [40]:
# Display the grouped frame (one row per reputation category).
df

Unnamed: 0_level_0,users,questions,answers
reputation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Gods,945,82720.0,3126525.0
Grandmasters,21352,1543117.0,8071486.0
Lords,197310,5279592.0,10234116.0
Slaves,813758,6212260.0,5907889.0
Usurpers,12533410,7027377.0,3047965.0


In [46]:
def _percentOfColumnTotal(x, column):
    """Return x as a percentage of the current sum of df[column].

    Reads the notebook-level `df` each time it is called.
    """
    columnTotal = df[column].sum()
    return (x * 100) / columnTotal


def normalizeUsersCount(x):
    """Express a user count as a percentage of all users in `df`."""
    return _percentOfColumnTotal(x, 'users')


def normalizeQuestionsCount(x):
    """Express a question count as a percentage of all questions in `df`."""
    return _percentOfColumnTotal(x, 'questions')


def normalizeAnswersCount(x):
    """Express an answer count as a percentage of all answers in `df`."""
    return _percentOfColumnTotal(x, 'answers')

In [47]:
# Convert each count column of df into percentages of that column's total,
# pairing every column with its dedicated normalizer.
for column, normalizer in (
    ('users', normalizeUsersCount),
    ('questions', normalizeQuestionsCount),
    ('answers', normalizeAnswersCount),
):
    df[column] = df[column].map(normalizer)

df

Unnamed: 0_level_0,users,questions,answers
reputation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Gods,0.006966,0.410622,10.288689
Grandmasters,0.157384,7.660025,26.561442
Lords,1.454362,26.207866,33.67817
Slaves,5.998168,30.837625,19.441532
Usurpers,92.38312,34.883862,10.030166


In [90]:
# Disabled alternative using plotly express:
# fig = px.pie(df, values="users", names=df.index, color_discrete_sequence=px.colors.sequential.RdBu)

# Pull the "Usurpers" slice out of the pie. The offsets are derived from the
# labels instead of a fixed five-element list, so the highlighted slice stays
# correct if a category is missing or the row order changes.
slice_pull = [0.35 if label == "Usurpers" else 0 for label in df.index]

fig = go.Figure(data=[go.Pie(labels=df.index, values=df["users"], pull=slice_pull,
                             title="Users of Stack Overflow based on reputation categories")])

fig.show()