# Stackoverflow Dataset 
https://bigquery.cloud.google.com/table/bigquery-public-data:stackoverflow.posts_questions

# Problem 1
- Supposed we want to predict using the Stackoverflow dataset the probability a question will be acceptably answered. Without doing the ML, create a ML dataset (training, testing, validation) that we can later use for classification.

In [1]:
import google.datalab.bigquery as bq

In [None]:
#Bigquery query

# SELECT
#   id,
#   accepted_answer_id
# FROM
#   [bigquery-public-data:stackoverflow.posts_questions]
# WHERE
#   ABS(HASH(creation_date)) % 10 < 8

In [2]:
training_set = """
SELECT
  id,
  accepted_answer_id
FROM
  `bigquery-public-data.stackoverflow.posts_questions`
WHERE
   MOD(ABS(farm_fingerprint(CAST(creation_date AS STRING))), 10) < 8
"""

training_set = bq.Query(training_set).execute().result().to_dataframe()
training_set.head()

Unnamed: 0,id,accepted_answer_id
0,5269923,5269967.0
1,42635640,42697080.0
2,26900738,26900792.0
3,25045557,25049067.0
4,33221645,33222770.0


In [None]:
testing_set = """
SELECT
  id,
  accepted_answer_id
FROM
  `bigquery-public-data.stackoverflow.posts_questions`
WHERE
   MOD(ABS(farm_fingerprint(CAST(creation_date AS STRING))), 10) > 8
"""

testing_set = bq.Query(testing_set).execute().result().to_dataframe()
testing_set.head()

In [None]:
validation_set = """
SELECT
  id,
  accepted_answer_id
FROM
  `bigquery-public-data.stackoverflow.posts_questions`
WHERE
   MOD(ABS(farm_fingerprint(CAST(creation_date AS STRING))), 10) = 9
"""

validation_set = bq.Query(validation_set).execute().result().to_dataframe()
validation_set.head()

# Problem 2
- Say we now needed to predict whether a question will be acceptably answered within a 2 day window. How would you create the dataset for Classification? Write the query below.

In [63]:
training_set_2 = """
SELECT
  questions.id,
  questions.accepted_answer_id,
  questions.creation_date AS creation_date_questions,
  answers.creation_date AS creation_date_acc_answers,
  TIMESTAMP_DIFF(answers.creation_date, questions.creation_date, DAY) AS duration,
  if((TIMESTAMP_DIFF(answers.creation_date, questions.creation_date, DAY) <= 2), 1, 0)
  
FROM
  `bigquery-public-data.stackoverflow.posts_questions` AS questions LEFT JOIN 
  `bigquery-public-data.stackoverflow.posts_answers` AS answers
   ON questions.accepted_answer_id = answers.id
   
WHERE
   MOD(ABS(farm_fingerprint(CAST(questions.creation_date AS STRING))), 10) < 8
   
LIMIT 100
"""

training_set_2 = bq.Query(training_set_2).execute().result().to_dataframe()
training_set_2

Unnamed: 0,id,accepted_answer_id,creation_date_questions,creation_date_acc_answers,duration,f0_
0,20435990,,2013-12-07 00:32:14.723,NaT,,0
1,52110638,,2018-08-31 07:35:10.177,NaT,,0
2,39160834,,2016-08-26 07:35:21.020,NaT,,0
3,34955835,,2016-01-22 20:57:53.113,NaT,,0
4,26749501,,2014-11-05 04:02:51.187,NaT,,0
5,28411299,,2015-02-09 13:51:34.800,NaT,,0
6,18842261,,2013-09-17 05:57:51.540,NaT,,0
7,51189039,,2018-07-05 10:30:51.513,NaT,,0
8,38026044,,2016-06-25 07:18:10.670,NaT,,0
9,15849838,,2013-04-06 10:37:18.560,NaT,,0


In [64]:
testing_set_2 = """
SELECT
  questions.id,
  questions.accepted_answer_id,
  questions.creation_date AS creation_date_questions,
  answers.creation_date AS creation_date_acc_answers,
  TIMESTAMP_DIFF(answers.creation_date, questions.creation_date, DAY) AS duration,
  if((TIMESTAMP_DIFF(answers.creation_date, questions.creation_date, DAY) <= 2), 1, 0)
  
FROM
  `bigquery-public-data.stackoverflow.posts_questions` AS questions LEFT JOIN 
  `bigquery-public-data.stackoverflow.posts_answers` AS answers
   ON questions.accepted_answer_id = answers.id
   
WHERE
   MOD(ABS(farm_fingerprint(CAST(questions.creation_date AS STRING))), 10) > 8
   
LIMIT 100
"""

testing_set_2 = bq.Query(testing_set_2).execute().result().to_dataframe()
testing_set_2

Unnamed: 0,id,accepted_answer_id,creation_date_questions,creation_date_acc_answers,duration,f0_
0,27006973,,2014-11-19 00:43:26.317,NaT,,0
1,39128266,,2016-08-24 16:10:50.483,NaT,,0
2,47484070,,2017-11-25 08:39:41.293,NaT,,0
3,28671775,,2015-02-23 10:36:52.677,NaT,,0
4,9415715,,2012-02-23 15:14:32.910,NaT,,0
5,45232370,,2017-07-21 08:08:12.240,NaT,,0
6,11954396,,2012-08-14 14:22:11.803,NaT,,0
7,18944143,,2013-09-22 13:06:44.763,NaT,,0
8,49398074,,2018-03-21 04:14:10.750,NaT,,0
9,48979631,,2018-02-25 23:33:34.840,NaT,,0


In [65]:
validation_set_2 = """
SELECT
  questions.id,
  questions.accepted_answer_id,
  questions.creation_date AS creation_date_questions,
  answers.creation_date AS creation_date_acc_answers,
  TIMESTAMP_DIFF(answers.creation_date, questions.creation_date, DAY) AS duration,
  if((TIMESTAMP_DIFF(answers.creation_date, questions.creation_date, DAY) <= 2), 1, 0)
  
FROM
  `bigquery-public-data.stackoverflow.posts_questions` AS questions LEFT JOIN 
  `bigquery-public-data.stackoverflow.posts_answers` AS answers
   ON questions.accepted_answer_id = answers.id
   
WHERE
   MOD(ABS(farm_fingerprint(CAST(questions.creation_date AS STRING))), 10) = 9
   
LIMIT 100
"""

validation_set_2 = bq.Query(validation_set_2).execute().result().to_dataframe()
validation_set_2

Unnamed: 0,id,accepted_answer_id,creation_date_questions,creation_date_acc_answers,duration,f0_
0,48110945,,2018-01-05 09:47:39.930,NaT,,0
1,30105468,,2015-05-07 15:26:42.750,NaT,,0
2,35818446,,2016-03-05 19:00:23.423,NaT,,0
3,11449191,,2012-07-12 09:47:08.660,NaT,,0
4,25178576,,2014-08-07 09:14:03.667,NaT,,0
5,50867621,,2018-06-15 00:20:49.007,NaT,,0
6,36241822,,2016-03-26 22:52:33.423,NaT,,0
7,30654700,,2015-06-04 21:42:36.950,NaT,,0
8,49905748,,2018-04-18 17:33:02.490,NaT,,0
9,24829512,,2014-07-18 16:11:09.050,NaT,,0
