# Universal Automated Tutor

## 0: Data Import + Cleaning

In [1]:
import pandas as pd
import numpy as np 
import random

In [44]:
# Read in the CSVs, select variables, combine
raw_data04 = pd.read_csv("2004-WPI-Assistments-Math.csv",low_memory=False)
raw_data056 = pd.read_csv("2005-06-WPI-Assistments-Math.csv",low_memory=False)
raw_data056["problem_id"] = raw_data056["problem_id"] + max(data.problem_id)
raw_data = pd.concat([raw_data056,raw_data04])
data = raw_data[["stud_id","duration","student_response_type","problem_id","step","attempt_num",
             "last_attempt","outcome","input","feedback"]]
data.reset_index(drop=True, inplace=True)

# Create binary variables for hint, correct, incorrect
data.loc[:,"hint"] = np.where(data["outcome"] == "HINT",1,0)
data.loc[:,"correct"] = np.where(data["outcome"] == "CORRECT",1,0)
data.loc[:,"incorrect"] = np.where(data["outcome"] == "INCORRECT",1,0)

# Filter + clean data
data = data[data["duration"] != '.']
data["duration"] = data["duration"].astype(np.float64)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


26460

## 1: Distribution Creation

In [3]:
# Docstring Example

''' Summary or Description of the Function

    Parameters:
    argument1 (int): Description of arg1

    Returns:
    int: Returning value
'''

' Summary or Description of the Function\n\n    Parameters:\n    argument1 (int): Description of arg1\n\n    Returns:\n    int: Returning value\n'

In [45]:
def chi2lower(x):
    ''' Calculates the chi-squared distribution lower bound

    Parameters:
    x (float64): A # of incorrect, # of hints, or a duration

    Returns:
    float64: Returns the lower bound of the chi-squared distribution
    '''
    return np.mean(x) - np.std(x)/4


def chi2upper(x):
    ''' Calculates the chi-squared distribution upper bound

    Parameters:
    x (float64): A # of incorrect, # of hints, or a duration

    Returns:
    float64: Returns the upper bound of the chi-squared distribution
    '''
    return np.mean(x) + np.std(x)/2

In [46]:
# group the problems and students together to get the sum of each variable for each question
df = data[["problem_id", "duration", "hint", "stud_id","incorrect"]].groupby(["problem_id","stud_id"]).sum()
problemDistsLower = df.groupby(["problem_id"]).agg(chi2lower)
problemDistsUpper = df.groupby(["problem_id"]).agg(chi2upper)
problemDistsLower.rename(columns={'duration': 'durationLower', 'hint': 'hintLower', 'incorrect': 'incorrectLower'}, inplace=True)
problemDistsUpper.rename(columns={'duration': 'durationUpper', 'hint': 'hintUpper', 'incorrect': 'incorrectUpper'}, inplace=True)
problemDists = pd.concat([problemDistsLower, problemDistsUpper], axis=1)

# Create given student's mean
studentEV = df.groupby(["stud_id"]).mean()

## 2: Student Class

In [49]:
class Student:
    
    
    def __init__(self, data):
        self.data = data
        self.means = 0
        self.problem = ""

        
    def updateStudent(self, lastQ):
        self.data = self.data.append(lastQ)
        self.means = self.data.mean()
        self.duration = self.means.duration
        self.incorrect = self.means.incorrect
        self.hint = self.means.hint
        
    def nextQ(self):
        # Determines a viable next question
        EVdur = self.duration
        EVinc = self.incorrect
        EVhint = self.hint
        viableProbs = problemDists[(EVdur < problemDists["durationUpper"])  & (EVhint < problemDists["hintUpper"]) & \
                                   (problemDists["incorrectLower"] > EVinc)]
        problemNum = random.choice(viableProbs.index.values.tolist())
        self.problem = df[df["problem_id"]==problemNum].iloc[0][0].split(':')[-1].split("?")[0] + "?"

## 3: Example Run Through

Pseudo Code
studentData = Pretest data
Pretest data will include entries with "step", "problem_id", "duration", "hint", "incorrect", "correct"
student = Student(studentData)

In [50]:
# Example
stud = "Stu_fe96fe63d83aa63c4ec667167fc7f1ce"
df = data[["step", "problem_id", "stud_id","duration", "hint", "incorrect", "correct"]]
newdf = df[df["stud_id"] == stud]
studentData = newdf.iloc[0:10,:]

studentData

Unnamed: 0,step,problem_id,stud_id,duration,hint,incorrect,correct
1266120,Step0:3761:Harry measured all but one angle of...,3761,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,90.0,1,0,0
1266121,Step1:3762:How many angles are there in a pent...,3761,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,37.0,0,1,0
1266122,Step1:3762:How many angles are there in a pent...,3761,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,4.0,0,0,1
1266123,Step0:3761:Harry measured all but one angle of...,3761,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,246.0,1,0,0
1266124,Step1:3762:How many angles are there in a pent...,3761,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,6.0,0,0,1
1266125,Step3:3764:What is the measure of the only ang...,3761,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,29.0,0,0,1
1266126,Step0:193:(0.2)(0.2)(0.2) is equal to which of...,193,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,37.0,0,0,1
1266127,Step0:113:Figure ABCD is translated so that th...,113,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,65.0,1,0,0
1266128,Step1:114:Look at the figure above. Let s find...,113,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,88.0,1,0,0
1266129,Step1:114:Look at the figure above. Let s find...,113,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,21.0,1,0,0


In [51]:
# Initializing a student
student = Student(studentData)

# questionData represents their next action from the GUI (we will be passed this)
questionData = newdf.iloc[11:13,:]

# If the person gets the problem right, then send the questionData to the student to update and choose next Q
if questionData.correct.isin([1]).any():
    student.updateStudent(questionData)
    student.nextQ()

# Question that is sent to the interface next:
print(student.problem)

In April of each year, a 26.2-mile race is held in Boston. The 100th Boston Marathon was run in 1996. In the table above are facts about the Marathon. Which is the best estimate of Moses Tanui's average speed for the entire marathon in miles pe SYSTEM?


# What is left to finish model?

- Need to find correct answers
- Need to find images
- Need to figure out how to get through steps
- Need to format answer box to accept all close answers

In [5]:
correct = data[data['outcome'] == "CORRECT"]

In [7]:
correct.problem_id.unique()

array([8635, 7674, 1298, ..., 2275, 2274, 3678], dtype=int64)

In [58]:
info = data.groupby('outcome')['problem_id'].unique()['CORRECT']

In [63]:
truth_series = data.problem_id.isin(info)

In [69]:
g = data[truth_series]
g

Unnamed: 0,stud_id,duration,student_response_type,problem_id,step,attempt_num,last_attempt,outcome,input,feedback,hint,correct,incorrect
0,Stu_0012ae7d5d1993619f0bad91630b7c79,576.0,HINT_REQUEST,22018,"Step0:8788:Lines m and n are parallel, what is...",1.0,1.0,HINT,,System displays first scaffold,1,0,0
1,Stu_0012ae7d5d1993619f0bad91630b7c79,75.0,HINT_REQUEST,22018,Step1:8792:What is the measure of angle 2? SYSTEM,1.0,0.0,HINT,,Lines m and n are parallel and intersected by ...,1,0,0
2,Stu_0012ae7d5d1993619f0bad91630b7c79,37.0,ATTEMPT,22018,Step1:8792:What is the measure of angle 2? SYSTEM,2.0,1.0,INCORRECT,55,That is not correct. Try again.,0,0,1
3,Stu_0012ae7d5d1993619f0bad91630b7c79,166.0,ATTEMPT,21865,Step0:8635:Jacqueline is a runner who hopes to...,1.0,1.0,CORRECT,9 miles,,0,1,0
4,Stu_00176143fc2991346939888160a9b1ce,14.0,HINT_REQUEST,20904,Step0:7674:Lee correctly answered 26 out of 51...,1.0,1.0,HINT,,System displays first scaffold,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266395,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,99.0,HINT_REQUEST,566,"Step2:568:Good, there are 4 rectangles of size...",3.0,1.0,HINT,,The same rectangle of size 1 can be used in di...,1,0,0
1266396,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,28.0,ATTEMPT,698,Step0:698:The table above shows the annual sal...,1.0,0.0,INCORRECT,"C. $27,000",The difference has to be positive.,0,0,1
1266397,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,6.0,ATTEMPT,698,Step0:698:The table above shows the annual sal...,2.0,1.0,INCORRECT,"A. $25,500",,0,0,1
1266398,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,103.0,ATTEMPT,566,Step0:566:How many rectangles are there in Fig...,1.0,1.0,INCORRECT,9,,0,0,1


In [56]:
df[df["problem_id"]==7].step

689070     Step0:7:Figure ABCD is translated so that the ...
689071     Step1:8:First let s find how point A is relate...
689150     Step0:7:Figure ABCD is translated so that the ...
706712     Step0:7:Figure ABCD is translated so that the ...
706713     Step1:8:First let s find how point A is relate...
                                 ...                        
1239510    Step3:10:Point B is related to the image of po...
1239511    Step3:10:Point B is related to the image of po...
1239512    Step3:10:Point B is related to the image of po...
1239513    Step3:10:Point B is related to the image of po...
1239514    Step4:11:What is the y-coordinate of the image...
Name: step, Length: 616, dtype: object