MathNation Data Exploration
===
Exploring a sample of anonymized MathNation data.

In [1]:
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Root of the project's data tree, resolved relative to the notebook's working directory.
data_dir = Path("..") / "data"
# Stop here if the notebook is being run from an unexpected location.
assert data_dir.exists()

In [4]:
# Load the raw MathNation discussion export and report its (rows, columns) shape.
math_nation_raw_dir = data_dir / "raw" / "math_nation"
mn_discussion_filepath = math_nation_raw_dir / "mn_discussion_20230914.csv"
assert mn_discussion_filepath.exists()
mn_df = pd.read_csv(mn_discussion_filepath)
mn_df.shape

(152844, 9)

In [5]:
# Peek at three random rows to get a feel for the columns.
mn_df.sample(n=3)

Unnamed: 0,reply_id,reply_user,reply_content,reply_ts_created,post_id,subject_name,post_content,post_user,post_ts_created
89529,2468990,1412702,Examples,2018-03-13 17:18:45,2468982,Algebra 1,degree and terms of a polynomial?\n,7395540,2018-03-13 17:15:02
145191,974764,667659,MAFS Section 1 video 6,2015-10-13 21:25:03,974748,Algebra 1,Is there any videos on multiplying and dividin...,2252406,2015-10-13 21:23:07
62185,1172038,681091,"Jack, lets see! Lets multiply those to see if ...",2016-01-30 21:49:40,1172013,Algebra 1,Help! solve each equation and check for soluti...,3584741,2016-01-30 21:40:17


In [7]:
# Count posts per subject: take the first row of every post_id group so each post is counted once.
mn_df.groupby("post_id")["subject_name"].head(1).value_counts()

subject_name
Algebra 1         15943
Geometry            133
Pre-Algebra          24
8th Grade Math        1
6th Grade Math        1
Name: count, dtype: int64

In [8]:
# Kept for reference: uncomment to print the content of each distinct Pre-Algebra post.
# for row in mn_df[mn_df.subject_name == "Pre-Algebra"].drop_duplicates(subset="post_id", keep="first").itertuples():
#    print(row.post_content)

In [9]:
# Student queries copied by hand from the data because they are relevant for testing.
# NOTE(review): presumably picked from the Pre-Algebra posts (see the variable name) — confirm.
prealgebra_student_queries = [
    "What is the quotient rule??",
    "How do I multiply fractions???????",
    "How do you multiply fractions?!?!?",
]

In [9]:
# Keep only the earliest reply of each post (rows ordered by post, then by reply timestamp).
replies_in_order = mn_df.sort_values(by=["post_id", "reply_ts_created"])
first_reply_df = replies_in_order.drop_duplicates(subset="post_id", keep="first")
# A post is "extended" when that earliest reply was written by the post's own author.
is_extended_post = first_reply_df["reply_user"] == first_reply_df["post_user"]
# Fraction of posts that are extended.
is_extended_post.mean()

0.08228791454477705

In [10]:
# Show the posts whose first reply came from the original poster.
first_reply_df.loc[is_extended_post]

Unnamed: 0,reply_id,reply_user,reply_content,reply_ts_created,post_id,subject_name,post_content,post_user,post_ts_created
84643,1867,945440,someone please help\n,2013-10-22 23:35:31,1865,Algebra 1,,945440,2013-10-22 23:35:14
2633,2065,939329,Can somebody help me please,2013-10-23 01:08:46,2045,Algebra 1,"If 9a+6b+8c=−3 ,\n\nwhat is 54a+48c+36b?",939329,2013-10-23 01:03:49
50640,3277,572466,need help,2013-10-26 17:33:16,3276,Algebra 1,\ny + 2x = −1\n3y − x =,572466,2013-10-26 17:32:44
37363,3487,1032512,Any takers?,2013-10-28 01:49:26,3485,Algebra 1,Challenge problem! Suppose the polynomial:\n\n...,1032512,2013-10-28 01:27:00
6423,3733,524723,how would you solve this?\n,2013-10-28 22:52:32,3732,Algebra 1,,524723,2013-10-28 22:52:14
...,...,...,...,...,...,...,...,...,...
1252,3305954,4599973,*part,2021-09-11 21:17:51,3305953,Geometry,May someone please help me with let b and c,4599973,2021-09-11 21:17:40
26794,3306558,5197522,i think it is but i dont now\n,2021-09-13 20:59:33,3306554,Algebra 1,If the question has a&lt;25 is a&lt;=25 the sa...,5197522,2021-09-13 20:58:48
74748,3308394,5223632,here is the paper,2021-09-15 23:05:46,3308391,Algebra 1,for the first box I got x&gt;0 and x&lt;6,5223632,2021-09-15 23:05:04
36997,3312002,4519295,anybody?,2021-09-21 23:51:27,3311997,Algebra 1,help,4519295,2021-09-21 23:49:09


In [11]:
def _text_or_empty(value):
    """Return ``str(value)``, or an empty string when ``value`` is missing (NaN/None)."""
    return "" if pd.isna(value) else str(value)


# Rebuild every post as a single text: the post content followed by the leading
# replies written by the original poster (OP) before anyone else replied.
posts = []
for post_id, group in mn_df.sort_values(by=["post_id", "reply_ts_created"]).groupby("post_id"):
    # Post-level fields are read from the group's first row.
    first_row = group.iloc[0]
    post_user = first_row.post_user
    # Missing content must not be stringified into the literal text "nan".
    post_content = _text_or_empty(first_row.post_content)
    for row in group.itertuples():
        if row.reply_user != post_user:
            break  # reply from non-OP
        # continuation of the post in a reply
        post_content += "\n[Continued:] " + _text_or_empty(row.reply_content)
    posts.append(
        {
            "post_id": post_id,
            "post_user": post_user,
            "subject_name": first_row.subject_name,
            "post_ts_created": first_row.post_ts_created,
            "post_content": post_content,
        }
    )
len(posts)

16102

In [12]:
# Spot-check three random reconstructed posts.
pd.DataFrame(posts).sample(n=3)

Unnamed: 0,post_id,post_user,subject_name,post_ts_created,post_content
3777,706733,1218779,Algebra 1,2015-04-09 21:51:42,what is 5349 to the third power equal???? HELP...
11824,2261468,3440812,Algebra 1,2017-10-31 13:47:47,=| =] =) =} ^o^ ^0^ ^@^ ^u^ all mojys are not ...
3957,739247,2647012,Algebra 1,2015-04-18 15:51:10,Melissa who's after ya?


In [44]:
# Persist the reconstructed posts (the DataFrame index is written as the first column).
raw_queries_filepath = data_dir / "derived" / "mn_student_queries_raw3.csv"
pd.DataFrame(posts).to_csv(raw_queries_filepath)

The original sample used for annotation was generated like this:

```
sdf = mn_df[mn_df.subject_name.isin(["Algebra 1", "Geometry"])]
sdf = sdf.drop_duplicates(subset="post_id", keep="first")
sdf[["post_id", "subject_name", "post_content", "post_user", "post_ts_created"]].to_csv(data_dir / "derived" / "mn_student_queries_raw.csv")
```

However, I threw those annotations away.

### Loading annotated data

In [46]:
# Load the hand-annotated student queries and report the (rows, columns) shape.
# NOTE(review): this reads the "raw2" annotated file, while the export cell above writes "raw3" —
# confirm which export the annotations were made on.
annotated_queries_filepath = data_dir / "derived" / "mn_student_queries_raw2_annotated.csv"
adf = pd.read_csv(annotated_queries_filepath)
adf.shape

(553, 8)

In [47]:
# Spot-check three random annotated rows.
adf.sample(3)

Unnamed: 0,index,post_id,post_user,subject_name,post_ts_created,post_content,is_respondable_query,notes
2,2,19856,848987,Algebra 1,2013-11-16 23:25:43,how do you a graph function rule?!,general,
462,462,3119582,3073615,Geometry,2020-03-31 19:38:40,is anybody there\n,,
405,405,2966682,8261996,Geometry,2019-09-11 21:16:05,Wouldn't it be the same thing??,,


In [48]:
# Distribution of annotation labels (rows without a label are excluded by value_counts).
adf["is_respondable_query"].value_counts()

is_respondable_query
problem                     62
general                     52
confirm                      8
resource request             4
general,problem              3
advice                       2
wrong but don't know why     1
clarify question             1
stuck                        1
Name: count, dtype: int64

In [53]:
# Cross-tabulate subject against annotation label, including the "All" margin totals.
subject_by_label = pd.crosstab(adf["subject_name"], adf["is_respondable_query"], margins=True)
# Put labels on the rows and list the most frequent ones first.
subject_by_label.T.sort_values(by="All", ascending=False)

subject_name,6th Grade Math,Algebra 1,Geometry,Pre-Algebra,All
is_respondable_query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
All,1,111,19,3,134
problem,1,54,7,0,62
general,0,40,9,3,52
confirm,0,6,2,0,8
resource request,0,3,1,0,4
"general,problem",0,3,0,0,3
advice,0,2,0,0,2
clarify question,0,1,0,0,1
stuck,0,1,0,0,1
wrong but don't know why,0,1,0,0,1


In [57]:
def _has_general_label(label):
    """True when the annotation is present and contains the text "general"."""
    return pd.notna(label) and "general" in label


# Keep the queries annotated as "general" (including combined labels such as "general,problem").
general_mask = adf["is_respondable_query"].map(_has_general_label)
general_df = adf.loc[general_mask, ["post_id", "subject_name", "post_content", "is_respondable_query"]]
general_df.sample(n=3)

Unnamed: 0,post_id,subject_name,post_content,is_respondable_query
29,199138,Algebra 1,Can someone give me a factoring example I don...,general
138,1105388,Pre-Algebra,Hey guys! What is the quotient rule??,general
110,859909,Algebra 1,How do you find the domain and range that are ...,general


In [58]:
# Save the general queries without the DataFrame index column.
general_queries_filepath = data_dir / "derived" / "mn_general_student_queries.csv"
general_df.to_csv(general_queries_filepath, index=False)

In [62]:
# Read the saved general student queries back from the derived-data folder and preview them.
derived_dir = data_dir / "derived"
mn_general_student_queries_filepath = derived_dir / "mn_general_student_queries.csv"
query_df = pd.read_csv(mn_general_student_queries_filepath)
print(f"MathNation queries: {query_df.shape}")
query_df.sample(n=3)

MathNation queries: (55, 4)


Unnamed: 0,post_id,subject_name,post_content,is_respondable_query
38,2672704,Algebra 1,i have a problem is this equation linear?\n7x ...,"general,problem"
41,2690581,Algebra 1,How do you know if a number is a constant?,general
20,1790360,Algebra 1,What is vertex form and how do you solve for it?,general
