Tools for grading and analyzing student journals

This is the non-public project notebook; for personal use

In [None]:
# Dynamically sets location so do not have to manually toggle between dropbox and documents
import os
path = "/".join([a for a in os.path.abspath("").split('/') if a not in ['Notebooks', 'personal']])
%cd $path

#Plotting 
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

from CanvasHacks import environment

from CanvasHacks.Api.RequestTools import get_all_course_assignments, get_assignments_needing_grading
from CanvasHacks.Api.UploadGradeTools import make_upload_button

from CanvasHacks.Configuration import InteractiveConfiguration
from CanvasHacks.Widgets.InputFields import make_course_ids_input, make_canvas_token_input, make_canvas_url_input, make_general_reset_button
# import CanvasHacks.GradingTools as GT
# import CanvasHacks.DownloadProcessingTools as PT
from CanvasHacks.Repositories.DataManagement import DataStore, DataStoreNew


# These aren't used in the non-saving version
# from CanvasHacks.Files.FileTools import makeDataFileIterator, create_folder
# from CanvasHacks.JournalsFileTools import get_journal_folders, make_folder_list, calculate_journal_counts
# from CanvasHacks.JournalsFileTools import journal_folder_name

from CanvasHacks.Widgets.AssignmentSelection import make_assignment_chooser, view_selected_assignments, view_ungraded_assignments
from CanvasHacks.Widgets.ConsolidatedTextOutput import make_assignment_header, make_consolidated_text_fields
from CanvasHacks.Widgets.LiveSelection import make_test_selector

from CanvasHacks.Definitions.journal import Journal

# Import the Canvas class
from canvasapi import Canvas
# Newfangled repos
from CanvasHacks.Repositories.students import StudentRepository

# Grading
from CanvasHacks.GradingHandlers.journal import JournalGrader


In [None]:
# NOTE(review): presumably renders the test/live selection widget -- confirm in
# CanvasHacks.Widgets.LiveSelection
make_test_selector()

# force into testing harness
# The course is pinned unconditionally to the hard-coded id below and stored
# on the shared configuration object, where the later cells read it back.
# NOTE(review): 85210 is presumably the sandbox course -- confirm before reuse
test_course_id = 85210
# API client built from the url and token held on environment.CONFIG
canvas = Canvas(environment.CONFIG.canvas_url_base, environment.CONFIG.canvas_token)
course = canvas.get_course(test_course_id)
environment.CONFIG.course = course 

# What needs grading

In [None]:
make_assignment_chooser(Journal)

In [None]:
# Work with the single course currently stored on the shared configuration
course = environment.CONFIG.course
# StudentRepository is handed a list of courses, so wrap the one course
courses = [course]

# Initialize repositories
studentRepo = StudentRepository(courses)
# NOTE(review): presumably fetches the student roster from Canvas -- confirm
# in StudentRepository
studentRepo.download()

# Todo

It seems that Canvas doesn't allow partial credit on this sort of assignment.

It appears that content is not being extracted from docx submissions.

 # Download and process all ungraded journal submissions

    GET /api/v1/courses/:course_id/assignments/:assignment_id/submissions/:user_id 
    
 
 This version saves the downloaded data rather than holding in memory!
 
 
 ToDo
 
     5/12/20: Had to set the penalizer to no penalty, since credit/no-credit grading doesn't allow a 50% penalty

In [None]:
# as of CAN-40
# When True, submissions whose grade is already 'complete' are removed from
# the repository before grading, so only the remaining ones are graded.
GRADING_LATE = True

from CanvasHacks.Repositories.submissions import SubmissionRepository
# One DataStoreNew per selected assignment; read by the review and upload
# cells further down
results = []

# a[0] of every entry in environment.CONFIG.assignments is used as the
# Canvas assignment id
for a in environment.CONFIG.assignments:
    print(int(a[0]))
    # canvas api object
    assignment = course.get_assignment(int(a[0]))
    # activity object to define the features 
    # NOTE(review): the attic version of this cell passes assignment.attributes
    # instead of assignment.__dict__ -- confirm which one Journal expects
    journal = Journal(**assignment.__dict__)
    # Download submissions
    subRepo = SubmissionRepository(assignment)
    if GRADING_LATE:
        # parse out already graded submissions
        subRepo.data =[j for j in subRepo.data if j.grade != 'complete']

    # shove the activity onto a sub repo so it will resemble
    # a quizrepo for the grader
    subRepo.activity = journal
    # Initialize the package for results
    store = DataStoreNew(journal)
    # provisionally determine credit
    grader = JournalGrader(subRepo)
    store.results = grader.grade()

    results.append(store)

# Read student work and check that properly categorized

In [None]:
# For every store produced by the grading cell, show the assignment header
# followed by the consolidated text fields for review.
# NOTE(review): studentRepo is presumably used to attach student details to
# each submission -- confirm in make_consolidated_text_fields
for s in results:
    make_assignment_header(s)
    make_consolidated_text_fields(s, studentRepo)

# Make consolidated text file

In [None]:
# Todo: CAN-6 Fix this so only writes the files in results. currently writes from json storage
# get_journal_folders has to be imported in this cell: its import in the first
# cell is commented out and the next live import only appears in the text
# analysis section, so on a fresh kernel (Restart & Run All) the call below
# would otherwise fail with NameError.
from CanvasHacks.JournalsFileTools import get_journal_folders
from CanvasHacks.Widgets.ConsolidatedTextOutput import make_consolidated_text_file
journal_folders = get_journal_folders()
# Folder to consolidate, picked by its hard-coded position in the folder list
f = journal_folders[4]
# make_consolidated_text_file(f, 'compiled-text.txt', studentRepo)

# Upload grades 

 PUT /api/v1/courses/:course_id/assignments/:assignment_id/submissions/:user_id 
 
 TODO: Add a class identifier if there are multiple classes
 
 TODO: Showing multiple copies of buttons

In [None]:
# Call make_upload_button once for each store produced by the grading cell
for store in results:
    make_upload_button(store)

# DON'T FORGET TO GO THROUGH GRADES ON CANVAS AND GIVE CREDIT WHERE MISSING!

# Some light text analysis

In [None]:
# Week bounds for the text analysis; START_WEEK seeds the counter that labels
# the cumulative frequency plots in the ngram cells
START_WEEK = 2
STOP_WEEK = 17
# File handed to load_words_to_ignore when the word bags are built
IGNORE_FILE = "{}/ignore.csv".format(environment.DATA_FOLDER)

In [None]:
from CanvasHacks.TextProcessing import make_wordbag
from CanvasHacks.TextProcessing import WordFreq

# Get all subfolder paths
from CanvasHacks.JournalsFileTools import get_journal_folders, make_folder_list, calculate_journal_counts
from CanvasHacks.JournalsFileTools import load_words_to_ignore, week_key_gen
from CanvasHacks.JournalsTextTools import filter_out_terms


## Load data

In [None]:
from CanvasHacks.DataManagement import BagStore

In [None]:
# Folders holding the stored journal submissions
journal_folders = get_journal_folders()

# Counts calculated from those folders; the bare name on the last line makes
# the notebook display them as the cell output
wcnt = calculate_journal_counts(journal_folders)
wcnt   

In [None]:
def load_all_submissions_as_wordbags(filepath):
    """
    Returns a list containing lists of words in each student's stored
    submission

    The json module is imported inside the function because no cell of the
    notebook imports it; without the import the call fails with NameError.

    filepath: should be to a file with the structure of all-submissions.json,
        i.e. a JSON array of objects which each have a 'body' entry
    return: list holding one make_wordbag() result per stored submission
    """
    import json

    with open(filepath, 'r') as f:
        rows = json.load(f)
    # str() coerces a non-string body before it is handed to make_wordbag
    return [make_wordbag(str(row['body'])) for row in rows]


# Build the filtered word bags of every journal folder and keep them in a BagStore
journal_folders = get_journal_folders()
exclude_list = load_words_to_ignore(IGNORE_FILE)

store = BagStore()

for folder_name in journal_folders:
    # The last component of the folder path serves as the assignment name
    assignment_name = folder_name.split('/')[-1]
    fp = "{}/all-submissions.json".format(folder_name)
    print("loading {}".format(assignment_name))
    # One bag per stored submission, with the ignored terms removed
    bags = [
        filter_out_terms(bag, exclude_list)
        for bag in load_all_submissions_as_wordbags(fp)
    ]
    store.add_assignment_bags(assignment_name, bags)


## Unigram visualizations

In [None]:
import wordcloud
from CanvasHacks.VisualizationTools import draw_cloud, draw_cloud_from_freqs, draw_cumulative_freq, clearplot_function

In [None]:
# For each assignment in the store, draw the cumulative frequency plot of its
# terms and then the word cloud built from the same frequency distribution
for name in store.assignment_names:
    f = store.get_assignment_frequencies(name)
    draw_cumulative_freq(f.freqDist, name)
    # plot the wordcloud
    draw_cloud_from_freqs(f.freqDist, name)

## Ngram tools

In [None]:
"""
Previously in TextProcessingTools
Created by adam on 11/11/15
"""
__author__ = 'adam'

import nltk

# This idiom is necessary. See https://github.com/nltk/nltk/issues/1516
from nltk.metrics import association



class NgramError(Exception):
    """
    Raised when a step of the ngram processing fails.

    Derives from Exception rather than BaseException so that ordinary
    `except Exception` handlers see it and it is not grouped with
    KeyboardInterrupt / SystemExit. Handlers written against BaseException
    keep working.

    Attributes:
        kind: Constant label of the error family ('NgramProcessing')
        identifier_type: Constant label ('String content')
        step: String description of where error arose
    """

    def __init__(self, processing_step):
        """
        Arguments:
            :param processing_step: String description of where error arose
        :return:
        """
        # Hand the step to Exception so that str(error) and tracebacks show it
        super().__init__(processing_step)
        self.kind = 'NgramProcessing'
        self.identifier_type = 'String content'
        self.step = processing_step

class NgramGetter(object):
    """
    Abstract parent class for extracting ngrams.

    A subclass has to set `measurement_tool` before it calls
    NgramGetter.__init__ and has to implement `process`.

    Attributes:
        collocation_finder: One of the nltk's collocation finder tools (e.g., BigramCollocationFinder)
        top_likelihood_ratio: Variable number of n-grams with the highest likelihood ratio
        measurement_tool: One of nltk's measurement tools (e.g., nltk.collocations.BigramAssocMeasures)
        modifiers: Tools with a process(word) method for modifying the text before calculating ngrams
        ngram_filters: Filters handed to the collocation finder by apply_filters
        ngrams: List of ngrams
        raw_freq: List of (ngram, score) pairs scored with the raw frequency measure
        sorted_ngrams: List of the ngrams in raw_freq, in the same order
        top_pmi: Variable number of n-grams with the highest Pointwise Mutual Information (i.e., which occur together
        more often than would be expected). Also stored under the older name topPMI.
        word_bag: List of text to run
    """

    def __init__(self):
        self.modifiers = []
        self.ngram_filters = []
        self.word_bag = []
        self.ngrams = []
        # getattr with a default: a subclass which did not set the tool gets
        # the intended NotImplementedError instead of an AttributeError
        if not getattr(self, 'measurement_tool', None):
            raise NotImplementedError("subclasses must set measurement_tool before calling NgramGetter.__init__")

    def add_modifier(self, iModifier):
        """
        Registers a tool which is run on every term before ngrams are calculated

        The check is on the process method itself because no IModifier class
        is defined or imported in this notebook.

        :param iModifier: Object with a process(word) method returning the modified word
        :raises TypeError: If iModifier has no callable process attribute
        """
        if not callable(getattr(iModifier, 'process', None)):
            raise TypeError("modifier must provide a process(word) method")
        self.modifiers.append(iModifier)

    def _run_modifiers(self):
        """
        Calls the modifiers in sequence and stores the results back in word_bag
        """
        for modifier in self.modifiers:
            self.word_bag = [modifier.process(w) for w in self.word_bag]

    def add_filter(self, iNgramFilter):
        """
        Adds a filter to be run after the ngrams are created
        :param iNgramFilter: Passed unchanged to the collocation finder's apply_ngram_filter
        :return:
        """
        self.ngram_filters.append(iNgramFilter)

    def apply_filters(self):
        """
        Hands every registered filter to self.collocation_finder.
        Requires that the collocation finder has already been created.
        """
        for ftr in self.ngram_filters:
            self.collocation_finder.apply_ngram_filter(ftr)

    def process(self, word_bag, min_freq=3, get_top=10, **kwargs):
        """
        Runs any modifiers (stemmers, lemmatizers, etc) on the list of terms and
        then extracts the ngrams

        Args:
            get_top: The cut off for ngrams to get stats for
            min_freq: Integer of minimum number of appearances of ngram to extract
            word_bag: List of strings to extract ngrams from. Should already be filtered.
        """
        raise NotImplementedError

    def _calculate_statistics(self, get_top=10, **kwargs):
        """
        Scores the ngrams held by self.collocation_finder and stores the
        results on the instance.

        The arguments to measure functions are marginals of a contingency table,
        in the bigram case (n_ii, (n_ix, n_xi), n_xx):
                w1    ~w1
             ------ ------
         w2 | n_ii | n_oi | = n_xi
             ------ ------
        ~w2 | n_io | n_oo |
             ------ ------
             = n_ix        TOTAL = n_xx

        Sets:
            top_pmi / topPMI: nbest() result for the pmi measure
            raw_freq: score_ngrams() result for the raw_freq measure
            sorted_ngrams: The ngrams of raw_freq without their scores
            top_likelihood_ratio: nbest() result for the likelihood_ratio measure

        Arguments:
            get_top: The cut off for ngrams to get stats for
        """
        self.top_pmi = self.collocation_finder.nbest(self.measurement_tool.pmi, get_top)
        # Older name, still read by existing code such as print_top_PMI
        self.topPMI = self.top_pmi
        self.raw_freq = self.collocation_finder.score_ngrams(self.measurement_tool.raw_freq)
        self.sorted_ngrams = [ngram for ngram, score in self.raw_freq]
        self.top_likelihood_ratio = self.collocation_finder.nbest(self.measurement_tool.likelihood_ratio, get_top)


class BigramGetter(NgramGetter):
    """
    Extracts 2-grams from a word bag and calculates statistics
    Attributes:
        topPMI: Variable number of n-grams with the highest Pointwise Mutual Information (i.e., which occur together
        more often than would be expected)
        top_likelihood_ratio: Variable number of n-grams with the highest likelihood ratio
        raw_freq: List of (ngram, score) pairs scored with the raw frequency measure
        sorted_ngrams: List of the ngrams in raw_freq, in the same order
    """

    def __init__(self):
        # Has to be set before the parent constructor runs, since it checks for it
        self.measurement_tool = association.BigramAssocMeasures()
        NgramGetter.__init__(self)

    def process(self, word_bag, min_freq=3, get_top=10, **kwargs):
        """
        Runs the registered modifiers on the terms, extracts the bigrams and
        calculates their statistics.

        Arguments:
            word_bag: List of strings
            min_freq: Integer of minimum number of appearances of ngram to extract
            get_top: The cut off for ngrams to get stats for
        """
        assert isinstance(word_bag, list), "word_bag must be a list of strings"
        # Keep the terms on the instance: _run_modifiers reads and rewrites
        # self.word_bag, so registered modifiers would otherwise be ignored
        self.word_bag = word_bag
        self._run_modifiers()
        self.collocation_finder = nltk.collocations.BigramCollocationFinder.from_words(self.word_bag)
        self.collocation_finder.apply_freq_filter(min_freq)
        # Filters registered through add_filter have to act before the scoring
        self.apply_filters()
        self._calculate_statistics(get_top)

class TrigramGetter(NgramGetter):
    """
    Extracts 3-grams from a word bag and calculates statistics
    """

    def __init__(self):
        # Has to be set before the parent constructor runs, since it checks for it
        self.measurement_tool = association.TrigramAssocMeasures()
        NgramGetter.__init__(self)

    def process(self, word_bag, min_freq=3, get_top=10, **kwargs):
        """
        Runs the registered modifiers on the terms, extracts the trigrams and
        calculates their statistics.

        Arguments:
            word_bag: List of strings
            min_freq: Integer of minimum number of appearances of ngram to extract
            get_top: The cut off for ngrams to get stats for
        """
        assert isinstance(word_bag, list), "word_bag must be a list of strings"
        # The terms have to be on the instance before the modifiers run;
        # otherwise they work on the empty default list and their output is
        # never used
        self.word_bag = word_bag
        self._run_modifiers()
        self.collocation_finder = nltk.collocations.TrigramCollocationFinder.from_words(self.word_bag)
        self.collocation_finder.apply_freq_filter(min_freq)
        # Filters registered through add_filter have to act before the scoring
        self.apply_filters()
        self._calculate_statistics(get_top)


### Find bigrams

In [None]:
# Frequency objects of all assignments, in the order of store.assignment_names
combined = [store.get_assignment_frequencies(name) for name in store.assignment_names]

In [None]:
from nltk.probability import FreqDist

# Processed BigramGetter of each assignment, keyed by the week key drawn for it
bigrams = {}
week_iter = week_key_gen()
# Counter handed to draw_cumulative_freq as its second argument
i = START_WEEK

for name in store.assignment_names:
    f = store.get_assignment_frequencies(name)
    # Assignments are paired with week keys purely by iteration order
    w = next(week_iter)

    bg = BigramGetter()
    bg.process(f.data)
    #store the bigram object in our dict
    bigrams[w] = bg

    # reshape into a usable form for plotting
    fs = FreqDist()
    # raw_freq holds (ngram, score) pairs
    for s, freq in bg.raw_freq:
        # make the string key and set the frequency
        fs["%s %s" % (s[0], s[1])] = freq

    # plot the cumulative frequencies
    draw_cumulative_freq(fs, i)

    i += 1

In [None]:
clearplot_function()

### Likelihood ratio

In [None]:
# Print top likelihood ratio
def print_top_likelihood_ratios(ngrams):
    """
    Prints, week by week, the ngrams with the highest likelihood ratio.

    :param ngrams: Dictionary mapping the keys produced by week_key_gen to
        processed getter objects which have a top_likelihood_ratio attribute
    """
    # Looping over the generator directly ends cleanly once it is exhausted
    for week in week_key_gen():
        # A week without an entry has nothing to report; skip it rather
        # than fail with KeyError part way through the output
        if week not in ngrams:
            continue
        print(week)
        for ngram in ngrams[week].top_likelihood_ratio:
            print(ngram)

def print_top_PMI(ngrams):
    """
    Prints, week by week, the ngrams with the highest Pointwise Mutual
    Information, below a "Top PMI" heading.

    :param ngrams: Dictionary mapping the keys produced by week_key_gen to
        processed getter objects which have a topPMI attribute
    """
    print("Top PMI")

    # Looping over the generator directly ends cleanly once it is exhausted
    for week in week_key_gen():
        # A week without an entry has nothing to report; skip it rather
        # than fail with KeyError part way through the output
        if week not in ngrams:
            continue
        print(week)
        for ngram in ngrams[week].topPMI:
            print(ngram)


In [None]:
print_top_likelihood_ratios(bigrams)

In [None]:
print_top_PMI(bigrams)

### Find trigrams

In [None]:
# Processed TrigramGetter of each assignment, keyed by the week key drawn for it.
# FreqDist is not imported in this cell; it comes from the import in the
# bigram cell, which therefore has to be run first.
trigrams = {}
week_iter = week_key_gen()
# Counter handed to draw_cumulative_freq as its second argument
i = START_WEEK


for name in store.assignment_names:
    f = store.get_assignment_frequencies(name)
    # Assignments are paired with week keys purely by iteration order
    w = next(week_iter)

    # bg holds a TrigramGetter here; the name is carried over from the bigram cell
    bg = TrigramGetter()

    bg.process(f.data)
    # store the trigram object in our dict
    trigrams[w] = bg

    # reshape into a usable form for plotting
    fs = FreqDist()
    # raw_freq holds (ngram, score) pairs
    for s, freq in bg.raw_freq:
        # make the string key and set the frequency
        fs["%s %s %s" % (s[0], s[1], s[2])] = freq

    # plot the cumulative frequencies
    draw_cumulative_freq(fs, i)

    i += 1

# while True:
#     try:
#         w = next(week_iter)
        
#         bg = TrigramGetter()
#         bg.process(combined[w])
#         #store the trigram object in our dict
#         trigrams[w] = bg

#         # reshape into a usable form for plotting
#         fs = FreqDist()
#         for s, freq in bg.raw_freq:
#             # make the string key and set the frequency
#             fs["%s %s %s" % (s[0], s[1], s[2])] = freq

#         # plot the cumulative frequencies
#         draw_cumulative_freq({w : fs}, i)
#         i += 1
#     except StopIteration:
#         break

In [None]:
clearplot_function()

In [None]:
print_top_likelihood_ratios(trigrams)

In [None]:
print_top_PMI(trigrams)

# Attic

In [None]:
# Attic: earlier version of the grading cell, kept for reference.
# NOTE(review): GT is not imported anywhere in the visible notebook (its
# import in the first cell is commented out), so running this cell as is
# stops with NameError at the new_determine_journal_credit call.
GRADING_LATE = True

from CanvasHacks.Repositories.submissions import SubmissionRepository
results = []

for a in environment.CONFIG.assignments:
    # canvas api object
    assignment = course.get_assignment(int(a[0]))
    # activity object to define the features 
    journal = Journal(**assignment.attributes)
    # Download submissions
    subRepo = SubmissionRepository(assignment)
    store = DataStoreNew(journal)
    # provisionally determine credit
    store.results = GT.new_determine_journal_credit(journal, subRepo)
    if GRADING_LATE:
        # Drop the results whose first element already has the grade 'complete'
        store.results = [j for j in store.results if j[0].grade != 'complete']

    results.append(store)


In [None]:

# Master: This handles downloading all assignments which need grading and 
# determines whether to give them credit. It does not yet upload grades
# NOTE(review): PT, GT, journal_folder_name and create_folder are not imported
# anywhere in the visible notebook (their imports in the first cell are
# commented out), so this attic cell cannot run as is.
results = []

for course_id in environment.CONFIG.course_ids:
    print('course', course_id)
    # Each entry of environment.CONFIG.assignments is unpacked as (id, name)
    for assignment_id, name in environment.CONFIG.assignments:
        store = DataStore(assignment_id=assignment_id, assignment_name=name, course_id=course_id)

        print("Processing {}".format(name))
        # make folder to save data
        folder = journal_folder_name(name, course_id)
        create_folder(folder)

        # download student submissions 
        response = PT.get_submissions(course_id, assignment_id)
        print("{} responses received".format(len(response)))
#         print(response)
        store.submissions = PT.process_response(response, folder)

        # save a copy 
        PT.save_submission_json(store.submissions, folder)

        # give credit for non-empty submissions
        c = GT.determine_credit(store.submissions)

        # stow the results in our data object
        # (determine_credit's result is read through its 'credit' and 'nocredit' keys)
        store.credit = c['credit']
        store.no_credit = c['nocredit']
        store.print_counts()

        # Add the now full-of-data object to our results list
        results.append(store)


In [None]:
import pandas as pd

# Scratch check: iterrows() yields each row as a Series, so a column value
# can be read as an attribute of the row
d = pd.DataFrame({'a': [2, 12], 'b': [3, 13]})
for i, r in d.iterrows():
    assert isinstance(r, pd.Series)
    print(r.a)