In [1]:
"""
This is a skeleton of Assignment 1 for DS5110/CS5501 Spring 2024 (UVA).
NOTE you will need to change the .ipynb file name by following the naming convention.
Code should be commented well. 
Feel free to import any missing packages or add extra cells as you see fit. 
"""
import dask
from dask.distributed import Client
import dask.dataframe as dd
import time
import json
import numpy as np

In [2]:
client = Client('172.31.4.232:8786')
client = client.restart()
print(client)

None


In [3]:
%%time

# You should have already extracted the two .CSV files from stackoverflow.zip
# Load the .CSV files into the Dask dataframes
questions = dd.read_csv('/home/ubuntu/questions.csv', sample=2**30, assume_missing=True)
question_tags = dd.read_csv('/home/ubuntu/question_tags.csv', sample=2**30, assume_missing=True)

CPU times: user 661 ms, sys: 2.49 s, total: 3.15 s
Wall time: 3.14 s


In [4]:
%%time
# Task 1: Get the percentage of missing values for all the columns in the questions table and the question_tags table.

# Compute and print missing value percentage for all columns of questions
print("Questions Missing Value %:\n", (questions.isnull().sum()/len(questions)*100).compute())

# Compute and print missing value percentage for all columns of question_tags
print("Question_Tags Missing Value %:\n", (question_tags.isnull().sum()/len(question_tags)*100).compute())

Questions Missing Value %:
 Id               0.000000
CreationDate     0.000000
ClosedDate      89.973578
DeletionDate    76.962907
Score            0.000000
OwnerUserId     23.920885
AnswerCount     10.114036
dtype: float64
Question_Tags Missing Value %:
 Id     0.000000
Tag    0.027501
dtype: float64
CPU times: user 91.8 ms, sys: 10.2 ms, total: 102 ms
Wall time: 44.4 s


In [5]:
%%time
# Task 2: Get mean, standard deviation, medium, min, and max of the Score column in the questions table.

print("Score Mean:", (questions["Score"].mean()).compute()) # Print mean
print("Score Standard Deviation:", (questions["Score"].std()).compute()) # Print standard deviation
print("Score Medium:", (questions["Score"].median_approximate()).compute()) # Print medium
print("Score Max:", (questions["Score"].max()).compute()) # Print max
print("Score Min:", (questions["Score"].min()).compute()) # Print min

Score Mean: 1.2041671665555287
Score Standard Deviation: 15.64573552737141
Score Medium: 1.0
Score Max: 16902.0
Score Min: -154.0
CPU times: user 164 ms, sys: 52.2 ms, total: 216 ms
Wall time: 52.8 s


In [6]:
%%time
# Task 3: Get the top 5 tags that have the highest number of questions.

# Group question_tags by tag, count and print 5 highest counts
print("Top 5 Tags by # of Questions:\n", question_tags.groupby("Tag")["Id"].count().compute().nlargest(5))

Top 5 Tags by # of Questions:
 Tag
javascript    1649631
java          1563820
php           1365600
c#            1321027
android       1288338
Name: Id, dtype: int64
CPU times: user 58.4 ms, sys: 18.3 ms, total: 76.8 ms
Wall time: 13.3 s


In [7]:
%%time
# Task 4: Check if there are any dangling references to the question Id field from the questions table to question_tags table. 
# Return 1 if there are dangling references; return 0 otherwise.

# Create left merge on id
question_merge = question_tags.merge(questions, on='Id', how='left', indicator=True)

# Print 1 if merge has rows with question_tags columns but not questions columns, 0 otherwise
print(1 if (len(question_merge[question_merge['_merge'] == 'left_only']) > 0) else 0)

0
CPU times: user 223 ms, sys: 6.56 ms, total: 229 ms
Wall time: 32.2 s


In [8]:
%%time
# Task 5: Create a new owner user table based on the questions table grouped by the OwnerUserId field.

# Create new owner user table based on questions grouped by OwnerUserId
question_owner_users = questions.groupby("OwnerUserId").agg( # ID of the question’s owner user
        AverageScore = ("Score", "mean"), # Average (mean) score across all questions posted by this user
        NumQuestions = ("Id", "count"), # Number of questions (count) posted by this user
        NumAnswers = ("AnswerCount", "sum") # Number of answers (sum) received by all the questions posted by this user
    )

# Print top 5 owner users who asked the most questions
print("Top 5 Owner Users by # of Questions:\n", question_owner_users.compute().nlargest(5, columns="NumQuestions"))

Top 5 Owner Users by # of Questions:
              AverageScore  NumQuestions  NumAnswers
OwnerUserId                                        
875317.0         1.198206          2230      3499.0
39677.0          6.607613          2128      5176.0
4653.0           6.883095          1822      5696.0
34537.0          5.213690          1680      4525.0
179736.0         7.344987          1516      3982.0
CPU times: user 192 ms, sys: 217 ms, total: 409 ms
Wall time: 15.2 s


In [9]:
%%time
# Task 6: Create a new table by merging the questions table and the question_tags table using Id as the index. 
# Then group the new table by the Tag field, with aggregated fields similar to Task 5.

# Merge questions and question_tags on Id
question_merge = questions.merge(question_tags, on='Id')

# Create new tags table based on merged table grouped by Tag
question_tags_detail = question_merge.groupby("Tag").agg( # ID of the question
        AverageScore = ("Score", "mean"), # Average (mean) score across all questions marked with this tag
        NumAnswers = ("AnswerCount", "sum"), # Number of answers (sum) received by all questions marked with this tag
        NumQuestions = ("Id", "count"), # Number of questions (count) marked with this tag
        NumOwners = ("OwnerUserId", "count") # Number of users (count) asking questions marked with this tag
    )
# Print top 5 tags with the highest number of questions
print("Top 5 Tags by # of Questions:\n", question_tags_detail.compute().nlargest(5, columns="NumQuestions"))

# Print top 5 tags with the highest number of answers
print("Top 5 Tags by # of Answers:\n", question_tags_detail.compute().nlargest(5, columns="NumAnswers"))

Top 5 Tags by # of Questions:
             AverageScore  NumAnswers  NumQuestions  NumOwners
Tag                                                          
javascript      1.134249   2252471.0       1649631    1296260
java            1.056542   2194354.0       1563820    1189238
php             0.375525   1855546.0       1365600    1017270
c#              1.340325   1937822.0       1321027    1039994
android         1.004450   1420669.0       1288338     936349
Top 5 Tags by # of Answers:
             AverageScore  NumAnswers  NumQuestions  NumOwners
Tag                                                          
javascript      1.134249   2252471.0       1649631    1296260
java            1.056542   2194354.0       1563820    1189238
c#              1.340325   1937822.0       1321027    1039994
php             0.375525   1855546.0       1365600    1017270
jquery          0.881797   1462029.0       1011324     801493
CPU times: user 308 ms, sys: 35.5 ms, total: 343 ms
Wall time: 1min 35s


In [10]:
%%time
# Task 7: kill the third and fourth worker and repeat Task 5 with two workers.

# Create new owner user table based on questions grouped by OwnerUserId
question_owner_users = questions.groupby("OwnerUserId").agg( # ID of the question’s owner user
        AverageScore = ("Score", "mean"), # Average (mean) score across all questions posted by this user
        NumQuestions = ("Id", "count"), # Number of questions (count) posted by this user
        NumAnswers = ("AnswerCount", "sum") # Number of answers (sum) received by all the questions posted by this user
    )

# Print top 5 owner users who asked the most questions
print("Top 5 Owner Users by # of Questions:\n", question_owner_users.compute().nlargest(5, columns="NumQuestions"))

Top 5 Owner Users by # of Questions:
              AverageScore  NumQuestions  NumAnswers
OwnerUserId                                        
875317.0         1.198206          2230      3499.0
39677.0          6.607613          2128      5176.0
4653.0           6.883095          1822      5696.0
34537.0          5.213690          1680      4525.0
179736.0         7.344987          1516      3982.0
CPU times: user 190 ms, sys: 177 ms, total: 367 ms
Wall time: 29.1 s


# Task 7 Report

**As we can see from the time report, Task 7 takes about twice as long to accomplish as Task 5. This is to be expected, as Task 7 was executed using half as many Dask workers (2) as Task 5 was (4). With only half as many workers dividing the processing required for the task, the amount of work required for each worker doubles, and so naturally the time to complete the work doubles too.**

In [11]:
%%time
# Task 8: kill the third and fourth worker and repeat Task 6 with two workers.

# Merge questions and question_tags on Id
question_merge = questions.merge(question_tags, on='Id')

# Create new tags table based on merged table grouped by Tag
question_tags_detail = question_merge.groupby("Tag").agg( # ID of the question
        AverageScore = ("Score", "mean"), # Average (mean) score across all questions marked with this tag
        NumAnswers = ("AnswerCount", "sum"), # Number of answers (sum) received by all questions marked with this tag
        NumQuestions = ("Id", "count"), # Number of questions (count) marked with this tag
        NumOwners = ("OwnerUserId", "count") # Number of users (count) asking questions marked with this tag
    )
# Print top 5 tags with the highest number of questions
print("Top 5 Tags by # of Questions:\n", question_tags_detail.compute().nlargest(5, columns="NumQuestions"))

# Print top 5 tags with the highest number of answers
print("Top 5 Tags by # of Answers:\n", question_tags_detail.compute().nlargest(5, columns="NumAnswers"))

Top 5 Tags by # of Questions:
             AverageScore  NumAnswers  NumQuestions  NumOwners
Tag                                                          
javascript      1.134249   2252471.0       1649631    1296260
java            1.056542   2194354.0       1563820    1189238
php             0.375525   1855546.0       1365600    1017270
c#              1.340325   1937822.0       1321027    1039994
android         1.004450   1420669.0       1288338     936349
Top 5 Tags by # of Answers:
             AverageScore  NumAnswers  NumQuestions  NumOwners
Tag                                                          
javascript      1.134249   2252471.0       1649631    1296260
java            1.056542   2194354.0       1563820    1189238
c#              1.340325   1937822.0       1321027    1039994
php             0.375525   1855546.0       1365600    1017270
jquery          0.881797   1462029.0       1011324     801493
CPU times: user 314 ms, sys: 18.2 ms, total: 333 ms
Wall time: 3min


# Task 8 Report

**As with Task 7 and Task 5, the time report indicates that Task 8 takes about twice as long to accomplish as Task 6, for the same reasons (only having half as many workers doubles the time).**