In [None]:
%cd ~/Dropbox/CanvasHacks

import pandas as pd
pd.options.display.max_rows = 999

from collections import namedtuple

#Plotting 
%matplotlib inline
from matplotlib import pyplot as plt
#http://blog.rtwilson.com/how-to-get-nice-vector-graphics-in-your-exported-pdf-ipython-notebooks/
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf')

import seaborn as sns
sns.set(style="whitegrid")
sns.set_palette(sns.color_palette('plasma'))

from IPython.display import display
from IPython.display import Latex

from CanvasHacks import environment
from CanvasHacks.Api.RequestTools import get_all_course_assignments, get_assignments_with_submissions
from CanvasHacks.Definitions.journal import Journal

# Import the Canvas class
from canvasapi import Canvas

# files
from CanvasHacks.Files.FileTools import makeDataFileIterator, makeDataFileList

# repos
from CanvasHacks.Repositories.students import StudentRepository
from CanvasHacks.Repositories.submissions import SubmissionRepository
# text tools
from CanvasHacks.Text.process import WordbagMaker, TokenFiltrationMixin
from CanvasHacks.Text.cleaners import TextCleaner
from CanvasHacks.Text.stats import WordFreq
import json
from afinn import Afinn
afinn = Afinn()

In [None]:
# This will be used to cut off the historical data
CURRENT_WEEK = 13

# Label of the term treated as 'current'; every other term is plotted as historical
CURRENT_TERM = 'S20'

# Where downloaded journal text (content) and tokenized text (bags) are stored
JOURNALS_FOLDER = "{}/journals".format(environment.LOG_FOLDER)
CONTENT_FOLDER ="{}/content".format(JOURNALS_FOLDER)  
BAG_FOLDER = "{}/bags".format(JOURNALS_FOLDER) 

course_ids = environment.CONFIG.course_ids
# Canvas course ids keyed by term label
class_ids = {
    'F19': [62657, 67473, 62660],
    'F18': [41179, 41180, 41181],
    'S19': [67531],
    'S20': environment.CONFIG.course_ids
}

In [None]:

def make_content_filepath(term, week_num, course_id=None, folder=CONTENT_FOLDER, **kwargs):
    """
    Builds the path of the json file holding the cleaned journal
    text for one course in one week of one term.

    Any extra keyword arguments are accepted and ignored, so a journal
    dictionary can be unpacked straight into the call.
    """
    filename = "{}-{}-week{}-content.json".format(term, course_id, week_num)
    return "{}/{}".format(folder, filename)

def make_bag_filepath(term, week_num, course_id=None, folder=BAG_FOLDER, **kwargs):
    """
    Builds the path of the json file holding the wordbags
    for one course in one week of one term.

    Any extra keyword arguments are accepted and ignored, so a journal
    dictionary can be unpacked straight into the call.
    """
    filename = "{}-{}-week{}-bag.json".format(term, course_id, week_num)
    return "{}/{}".format(folder, filename)


def make_week_iterator(start=7, stop=CURRENT_WEEK):
    """Yields every week number from start through stop, inclusive."""
    yield from range(start, stop + 1)


# acquire, clean, and store text

In [None]:

def get_all_journal_assigns_for_class(course_id, term):
    """
    Looks up the journal assignments of a course.

    Returns a list of dictionaries, one per journal assignment, with
    the keys term, course_id, id (the assignment's id) and week_num.
    """
    # Reduce every assignment in the course to its id and trimmed name
    id_name_pairs = []
    for assignment in get_all_course_assignments(course_id):
        id_name_pairs.append((assignment['id'], assignment['name'].strip()))

    journals = []
    for assignment_id, name in id_name_pairs:
        # Only assignments which Journal recognizes by name are kept
        if not Journal.is_activity_type(name):
            continue
        # The week number is read from the last space-separated piece
        # of the name after dropping that piece's final character
        last_piece = name.split(' ')[-1]
        journals.append({'term': term,
                         'course_id': course_id,
                         'id': assignment_id,
                         'week_num': int(last_piece[:-1])})
    return journals



def store_course_journals(course_id, term, start_week=None):
    """
    Downloads the journal submissions for a course and saves the
    cleaned text of each journal assignment to its own json file.

    If start_week is given, journals from earlier weeks are not
    downloaded (their filepaths are still printed).
    """
    course = environment.CONFIG.canvas.get_course(course_id)
    journals = get_all_journal_assigns_for_class(course_id, term)
    cleaner = TextCleaner()

    for journal in journals:
        # The destination is worked out and printed for every journal
        fp = make_content_filepath(**journal)
        print(fp)

        too_early = start_week is not None and journal['week_num'] < start_week
        if too_early:
            continue

        # Download submissions
        assignment = course.get_assignment(journal['id'])
        print("Downloading {} {}".format(journal['term'], journal['week_num']))
        subRepo = SubmissionRepository(assignment)
        print("{} journals downloaded".format(len(subRepo.data)))

        # Submissions with no body are left out; the rest are cleaned
        cleaned = []
        for submission in subRepo.data:
            if submission.body is not None:
                cleaned.append({'sid': submission.user_id,
                                'body': cleaner.clean(submission.body)})
        journal['content'] = cleaned

        with open(fp, 'w') as f:
            json.dump(journal, f)


In [None]:
# Get enrollments for courses
# Builds a dictionary mapping each term label to the summed user
# count of all that term's courses
enrollments = {}

for term, ids in class_ids.items():
    cnt = 0
    for cid in ids:
        course = environment.CONFIG.canvas.get_course(cid)
        students = [u for u in course.get_users()]
        # NOTE(review): one user per course is left out of the count,
        # presumably the instructor -- confirm
        cnt += len(students) - 1
    enrollments[term] = cnt
enrollments

## Download and store journals

In [None]:
# Fill in with course ids and uncomment the loop below to fetch another term
courses_to_get = [  ]

# for cid in courses_to_get:
#     store_course_journals(cid, 'S19')


# Fetch the journals of the configured courses, skipping weeks before 12
# NOTE(review): the term label is written out here rather than read
# from CURRENT_TERM -- keep the two in step
for cid in environment.CONFIG.course_ids:
    store_course_journals(cid, 'S20', start_week=12)

## Make and store wordbags

In [None]:
def process_journal_entries(journal_entries, existing=None):
    """
    Tokenizes and lightly filters a list of journal entries
    before saving to json

    :param journal_entries: Dictionary for one journal assignment. It is
        unpacked into make_bag_filepath to name the output file and its
        'content' list holds one dictionary per entry with a 'body' string.
    :param existing: Collection of bag filepaths which should not be
        processed again. Defaults to nothing.
    :return: journal_entries. When it was processed, every entry with a
        non-empty body has gained a 'bag' key; entries with an empty
        body are left without one.
    """
    # A None default avoids sharing one list object between calls
    if existing is None:
        existing = []

    fp = make_bag_filepath(**journal_entries)
    print(fp)

    if fp not in existing:
        bagmaker = WordbagMaker(keep_stopwords=True)

        for entry in journal_entries['content']:
            if len(entry['body']) > 0:
                entry['bag'] = bagmaker.process(entry['body'])

        with open(fp, 'w') as f:
            json.dump(journal_entries, f)

    return journal_entries

In [None]:
fiter = makeDataFileIterator(CONTENT_FOLDER)
existing = makeDataFileList(BAG_FOLDER)

# Turn every stored content file into a bag file
while True:
    # next() raises StopIteration once the iterator is used up; stopping
    # the loop there lets the cell finish without an exception
    try:
        content_path = next(fiter)
    except StopIteration:
        break

    with open(content_path, 'r') as f:
        print("Processing ", f.name.split('/')[-1:])
        entries = json.load(f)
        process_journal_entries(entries, existing)


# Load bags

In [None]:
# A journal assignment which was done by one class on one week in one term
# Thus on one week in one term there may be multiple CourseJournals
# The typename matches the variable so that reprs name the right type
CourseJournal = namedtuple('CourseJournal', ['term',
                            # The canvas course id for the journal
                            # 'course_id',

                             # The canvas assignment id
                             'id',

                             'week_num',

                             # A list of dictionaries containing student
                             # journal entries.
                             # Each dictionary has the keys:
                             #    sid
                             #    body: The sanitized text of the journal
                             #    bag: List of word tokens, including stopwords
                             'content'])

class JournalAssignment(TokenFiltrationMixin):
    """
    A journal assignment which was done by one class on one week in one term
    Thus on one week in one term there may be multiple JournalAssignments
    This holds the data and handles most calculations via properties

    Every keyword argument given to the constructor becomes an attribute.
    The properties read `content`: a list of dictionaries, one per
    stored journal entry, each holding a 'bag' list of word tokens
    (a missing 'bag' is replaced with an empty list on construction).
    """
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)
        self._fix_errors()

    @property
    def bags(self):
        """Returns a list of all the student wordbags"""
        return [s['bag'] for s in self.content]

    @property
    def combo_bag(self):
        """A wordbag comprising every word submitted by students.

        The tokens are returned as stored; nothing is filtered here.
        """
        return [word for bag in self.bags for word in bag]

    @property
    def no_stops_bag(self):
        """A wordbag comprising every word submitted by students sans stopwords"""
        # keep() and clean_punctuation() are not defined on this class;
        # presumably they come from TokenFiltrationMixin
        return [self.clean_punctuation(w) for w in self.combo_bag
                if self.keep(w, keep_stopwords=False)]

    @property
    def total_sentiment(self):
        """The sentiment score of all the submitted words taken together"""
        return self.calc_sentiment(self.combo_bag)

    @property
    def average_sentiment(self):
        """The total sentiment divided by the word count.

        Returns 0.0 when no words were submitted rather than
        dividing by zero.
        """
        word_count = self.word_count
        if word_count == 0:
            return 0.0
        return self.total_sentiment / word_count

    @property
    def word_count(self):
        """The number of tokens in the combined wordbag"""
        return len(self.combo_bag)

    @property
    def num_empty(self):
        """The number of entries whose wordbag is empty"""
        return len([b for b in self.bags if len(b) == 0])

    @property
    def num_students(self):
        """The number of stored journal entries"""
        return len(self.content)

    @property
    def student_sentiments(self):
        """The total sentiment score of each entry's wordbag"""
        return [self.calc_sentiment(bag) for bag in self.bags]

    @property
    def student_avg_sentiments(self):
        """The sentiment score per word of each non-empty wordbag.

        Entries with an empty wordbag are left out, so this list may
        be shorter than `bags`.
        """
        return [self.calc_sentiment(bag) / len(bag) for bag in self.bags if len(bag) > 0]

    def calc_sentiment(self, bag):
        """
        Calculates a total sentiment score of items in the bag
        """
        txt = ' '.join(bag)
        return afinn.score(txt)

    def _fix_errors(self):
        """Makes sure every entry has a 'bag' so the properties can rely on it"""
        for c in self.content:
            # Not sure why but one entry from f19 has no bag.
            # maybe was the instructor or test student
            c.setdefault('bag', [])
                
        

class TermWeekStore(TokenFiltrationMixin):
    """Represents a particular week in a particular term
    Handles combining multiple classes into one data store
    """

    def __init__(self, term, week, all_journals):
        """
        :param term: The term label to select
        :param week: The week number to select
        :param all_journals: Objects with term, week_num and bags
            attributes; those matching term and week are kept
        """
        self.term = term
        self.week = week
        self.journals = [d for d in all_journals
                         if d.term == self.term and d.week_num == self.week]

    @property
    def bags(self):
        """Returns a list of all the student wordbags"""
        return [bag for journal in self.journals for bag in journal.bags]

    @property
    def combo_bag(self):
        """A wordbag comprising every word submitted by students """
        return [word for bag in self.bags for word in bag]

    @property
    def no_stops_bag(self):
        """A wordbag comprising every word submitted by students sans stopwords"""
        # keep() and clean_punctuation() are not defined on this class;
        # presumably they come from TokenFiltrationMixin
        return [self.clean_punctuation(w) for w in self.combo_bag
                if self.keep(w, keep_stopwords=False)]

    @property
    def total_sentiment(self):
        """The sentiment score of all the submitted words taken together"""
        return self.calc_sentiment(self.combo_bag)

    @property
    def average_sentiment(self):
        """The total sentiment divided by the word count.

        Returns 0.0 when no words were submitted rather than
        dividing by zero.
        """
        word_count = self.word_count
        if word_count == 0:
            return 0.0
        return self.total_sentiment / word_count

    @property
    def word_count(self):
        """The number of tokens in the combined wordbag"""
        return len(self.combo_bag)

    @property
    def week_num(self):
        """Alias of week, matching the attribute name used on journals"""
        return self.week

    def calc_sentiment(self, bag):
        """
        Calculates a total sentiment score of items in the bag
        """
        # total_sentiment needs this method and the class did not define
        # it; scoring is done the same way as in JournalAssignment
        txt = ' '.join(bag)
        return afinn.score(txt)

In [None]:
# Load every stored bag file into a JournalAssignment
fiter = makeDataFileIterator(BAG_FOLDER)
data = []

try:
    while True:
        with open(next(fiter), 'r') as f:
            d = json.load(f)
            o = JournalAssignment(**d)
            data.append(o)

# next() signals the end of the files by raising StopIteration
except StopIteration:
    print("Loaded {} files".format(len(data)))

# The distinct term labels and week numbers found in the loaded data
terms = list(set([e.term for e in data]))
weeks = list(set([e.week_num for e in data]))

# One store per term / week combination, whether or not any journal matches it
week_stores = []
for t in terms:
    for w in weeks:
        week_stores.append(TermWeekStore(t, w, data))

len(week_stores)

In [None]:
# Reload every stored bag file into a fresh list of JournalAssignments
fiter = makeDataFileIterator(BAG_FOLDER)
data = []

try:
    while True:
        with open(next(fiter), 'r') as f:
            d = json.load(f)
            o = JournalAssignment(**d)
            data.append(o)

# A try needs a handler to be valid; next() signals the end of the
# files by raising StopIteration
except StopIteration:
    print("Loaded {} files".format(len(data)))

In [None]:
# Extract statistics 

# An empty list for every term / week combination
store = { t : { w : [] for w in weeks } for t in terms }
# store

Stat = namedtuple('Stat', [
    'term', 'week', 'total_sentiment', 
    
    # Total number of words submitted by all students for that week
    'word_count', 
    
    # The number of stored journal entries whose wordbag is empty
    'num_empty', 
    
    # The number of stored journal entries for the week (summed over classes).
    # Be careful using this because varies across terms
    'num_students'
])

NoStopStat = namedtuple('NoStopStat', Stat._fields)

stats_data = []

# Sum the figures of all classes which did the journal in the same term and week
for t in terms:
    for w in weeks:
        try:
            week_journals = [ d for d in data if d.term == t and d.week_num == w]
#             print(len(week_journals))
            
            # Term / week combinations with no journals get no row
            if len(week_journals) > 0:
                s = Stat(term=t,
                         week=w,
                         total_sentiment=sum([j.total_sentiment for j in week_journals]),
                         word_count=sum([j.word_count for j in week_journals]),
                         num_empty=sum([j.num_empty for j in week_journals]),
                         num_students=sum([j.num_students for j in week_journals]))
                stats_data.append(s)
            
        except IndexError:
            print('index error ', t, w)

# One row per term / week, with the Stat fields as columns
stats_data = pd.DataFrame(stats_data)

# Calculate what percentage of enrolled students turned the assignment in
# Note that we are not going to subtract out the num_empty because those may 
# be students who turned in a file whose contents we were unable to extract.
def calc_pct_comp(row, enrollments=enrollments):
    """
    Returns the number of students on the row divided by the
    enrollment recorded for the row's term (a fraction, not 0-100).
    """
    enrolled = enrollments.get(row.term)
    return row.num_students / enrolled

stats_data['pct_completion'] = stats_data.apply(calc_pct_comp, axis=1)

# make working copy
# Both filters are applied before copying so that the new column below
# is added to an independent frame rather than to a slice of one
in_window = (stats_data.week <= CURRENT_WEEK) & (stats_data.word_count > 0)
stats = stats_data[in_window].copy(deep=True)

# Calculate the total sentiment divided by word count, so we can compare
# (rows with no words were removed above, so there is no division by zero)
stats['sentiment'] = stats.total_sentiment / stats.word_count


current = stats[stats.term == CURRENT_TERM]

# CURRENT_WEEK = current.week.max()
# stats is already cut off at CURRENT_WEEK, so no second week filter is needed
historical = stats[stats.term != CURRENT_TERM]

# Background

In every class I teach, students are required to submit a weekly journal. The assignment is extremely low stakes (this semester, each journal is worth 0.33% of the total course grade).

Journal assignments have no prescribed topic. The only requirement is that it be 'something related to class'. I tell them the point is to prompt them to reflect a bit on the class every week and to use the assignment in whatever way is useful for them. 

Most students use the journal to summarize recent materials. Some give me feedback on how the class is going. Many relate course topics to things in their lives and talk about their lives and how things are going generally.

Most students turn in the journals almost every week; most students miss a couple of them. Students report missing journals because of other commitments (e.g., exams) and disruptions in their personal life (e.g., changes of work schedule, feeling overwhelmed by some crisis). Missing 2 consecutive journals is a reliable sign that I should reach out to the student. (Indeed, it's reliable enough that I've written a script to automatically send a message gently inquiring about what's going on after 2 consecutive missed journals). 

Thus it seems possible that trends in journal submission are somewhat sensitive (but not specific) measures of student engagement with my class and perhaps their overall well-being.

## Population
All data here is for my Philosophy 305 Business ethics course. Most of my students are juniors/seniors. Most have majors or premajors in COBAE, though there's a good smattering of majors across the colleges. Total enrollments: {'F18': 115, 'S19': 124, 'F19': 167, 'S20': 159}

One major complication is course modality. The classes in F18 and S19 were all face-to-face. F19 was a mix of hybrid and face-to-face (approx 50-50). All classes in S20 were scheduled as fully online.

# Engagement

In [None]:
# Completion by week: historical terms against the current term, on shared axes
fig, axes = plt.subplots() #figsize=(9,3))
# historical holds a row per past term for each week, so several values share an x
sns.lineplot(x="week", y="pct_completion", data=historical, label='historical', ax=axes)
g = sns.lineplot(x='week', y="pct_completion", color='red', label='current', data=current, ax=axes)
g.set_title("Students submitting / Enrollment")
# Marker at week 9
g.axvline(x=9, label='S20 spring break', color='green', linestyle='--')
fig.tight_layout()

Purple line is the mean completion percentage for all previous semesters. 

Shaded region is bootstrapped 95% CI from historical data. (I'm not sure why it breaks down in weeks 5-7.)

Vertical dashed line is Spring 20's spring break. Note that historical data combines fall and spring semesters.


### Interpretation
It looks clear that between the week 7 and 8 journals, the percentage of students completing journals fell substantially and has remained below historical average trend.


### Notes
With one exception, journals are due at the end of the named week. Thus the week 7 journal was due at 11.59 pm March 8. However, there is a 1 week grace period, so the last opportunity to turn in the week 7 journal was March 15. The vast majority of those who turn in a journal do so before the grace period. Unfortunately, the one exception is this semester's week 8 journal. I accidentally had it due (3/22) after spring break. That may explain the large drop --I need to look more closely. Though it doesn't explain the below trend rates for weeks 10 and 11.


I've been moving the class away from its semi-synchronous design to a mostly asynchronous model. We made the final step in that direction last week. The journals are the one assignment which still must be done on a set schedule by everyone. I really, really hope the recent precipitous drop is due to students being confused by the changes.... 

Interestingly, the drop which historically occurs in week 5 consists almost entirely of students who otherwise do well on the exams and complete most higher stakes assignments. It's also not made up of the same students every week --there's not a big group that stops doing them completely, but there is a group which starts doing them more sporadically. Why week 5? My guess is that that's when the first accounting and business gateway midterms hit.

Since this is the first semester I've taught fully-online and the structure of the course is very different, I'm unsure how to interpret the trends this semester, even before week 8 when it was becoming clear that the university would go online.

### Tables (including sentiment)

In [None]:
# Current-term statistics as a LaTeX table, without the num_empty column
df = current.drop(['num_empty'], axis=1).set_index(['term', 'week'])
display(Latex(df.to_latex(caption="Current semester")))

In [None]:
# Past-term statistics as a LaTeX table, without the num_empty column
df = historical.drop(['num_empty'], axis=1).set_index(['term', 'week'])
display(Latex(df.to_latex(caption="Past semesters")))

# Sentiment score

## What this is
To get an extremely rough sense of what students are feeling, I've repurposed some tools I use to analyze pain patient narratives and patient twitter use. 


## Method
What I'm calling the 'sentiment score' is based on a wordlist approach that's validated on twitter 
data. 

For each week of each semester, I have combined all the text submitted by students in all classes. Thus all analysis is happening on a week in a term and lumps together multiple classes. I have removed the most common English words (stopwords) and common words like 'swenson', 'business', 'journals', et cetera.

I then used the Afinn wordlist to assign each word a score between -5 and 5 based on whether it tends to be used negatively or positively. These scores are totaled and divided by the word count to allow comparison between collections of different length. 

## Limitations
Please do not read anything into the scores. They are NOT measures of student satisfaction with the class. They simply show the relative amounts of words which tend to be used positively and words which tend to be used negatively.

I am extremely suspicious of using a tool validated on twitter data for other uses, though I am unaware of any clear evidence justifying that suspicion.

Even if the sentiment scores are valid, there are a lot of confounding factors in interpreting them. For example, week 2-4 is probably so negative because that's when we are discussing prisoners' dilemmas and tragedies of the commons. The journals are full of words like 'tragedy', 'disaster', 'prison', 'cheat', 'deceit', et cetera.

Thus the absolute scores are pretty useless. However, there could be some value in comparing scores across semesters for the same week. There are still lots of limitations here. Inter alia, the topics don't exactly align, dates of breaks and exams are different, and the structure of the present semester is completely different. For example, we will not be talking about harm and the harm principle this semester, though it was previously a main topic. 

With all those caveats firmly in mind...

## Current semester vs. past mean

In [None]:
# Sentiment per word by week: historical terms against the current term
fig, axes = plt.subplots() #figsize=(9,3))
# historical holds a row per past term for each week, so several values share an x
sns.lineplot(x="week", y="sentiment", data=historical, label='historical', ax=axes)
g = sns.lineplot(x='week', y="sentiment", color='red', label='current', data=current, ax=axes)
g.set_title("Total sentiment score / word count by week")
# Marker at week 9
g.axvline(x=9, label='S20 spring break', color='green', linestyle='--')
fig.tight_layout()

### Interpretation

Bearing in mind the extremely tenuous nature of this analysis, it looks like we have the reverse of what we saw above in the percentage of students completing journals. It appears that in week 7, the language used shifted to tend more positive than in the past. A similar jump occurs in week 12. 

The fact that the big drops in submissions in week 7 and week 12 accompany increases in the sentiment score is interesting. This suggests that students who are more prone to use negative-tending words stopped submitting journals. 

Insofar as the sentiment score reflects what students are feeling, it may be that the students who feel worse are disengaging with my class. That's extremely troubling.

### Limitations

This needs to be taken with a truckload of salt. 

#### What is the sentiment score tracking?
To reiterate, it is far from clear what the sentiment score tracks. 

Indeed, I'm reluctant to try formally testing any hypothesis here without a sentiment scoring tool that's validated on student journals. I've refrained from examining/presenting statistical measures for this reason. I have sporadically worked on a machine learning based tool for this in the past; but I don't expect it to ever really work.

#### Misalignment of topics
Again, it is possible that some of the difference between the present and past semesters is due to a misalignment of class topics. Indeed, the topics discussed around week 12 in past semesters include the concept of harm and harms related to information security / data privacy. I've removed those topics this semester because the discussion is often very personal and raw --students often volunteer difficult experiences including identity theft affecting family members, stalking, and undocumented students' concerns about state surveillance. I'm not confident in my ability to manage that discussion when I can't see the student in the corner tearing up and looking anxiously at the door. The topics will come back into alignment around week 13. Though, even then, our move to a more asynchronous model may raise the same problem.     

## Semesters disaggregated

(Not sure if there's anything interesting here or how best to visualize, hence the multiple plots.)

In [None]:
# Sentiment per word for each week, one bar per term
fig, axes = plt.subplots(figsize=(9, 3))
# Fixes the order in which terms appear; also used by the following cell
order = ['F18', 'S19', 'F19', 'S20']
sns.barplot(x="week", y="sentiment", hue="term", data=stats, hue_order=order, ax=axes)
fig.tight_layout()

In [None]:
# Sentiment per word for each week, one line per term
# Relies on `order` having been set by the bar plot cell
fig, axes = plt.subplots(figsize=(9, 3))
sns.lineplot(x="week", y="sentiment", hue="term", data=stats, hue_order=order, ax=axes)
fig.tight_layout()

# Distribution of sentiment scores

Above, we were looking at sentiment scores applied to the entire corpus of text submitted by students. The following scores each student's journal entry separately and displays the distributions of those scores. 

NB, looking at individual sentiment scores without a properly validated assessment tool may magnify the issues discussed above. But let's see...

In [None]:
# NOTE(review): individ_sentiments is not read again in the cells visible here
individ_sentiments = { w : [] for w in weeks } 
# One record per entry with a non-empty wordbag: its week, its sentiment
# score per word, and whether it belongs to the current term
inds = []
for d in data:
    g = 'current' if d.term == CURRENT_TERM else 'historic' 
    inds.extend( [ {'week' : d.week_num, 'student_sent': s, 'g': g} for s in d.student_avg_sentiments])

inds = pd.DataFrame(inds)
# Cut off at the current week
inds = inds[inds.week <= CURRENT_WEEK]

In [None]:
# Distribution of per-entry sentiment by week, current against historic:
# split violins on top, box plots underneath
fig, axes = plt.subplots(nrows=2, figsize=(10,8))
sns.violinplot(x="week", y="student_sent", hue="g", split=True, palette='deep',  data=inds, ax=axes[0]);
sns.boxplot(x="week", y="student_sent", hue='g', palette='deep', data=inds, ax=axes[1]);
# One y-range per panel
# NOTE(review): assumes the wider range was meant for the box plot -- confirm
axes[0].set_ylim((-0.25, 0.25))
axes[1].set_ylim((-0.4, 0.4))
fig.tight_layout()

# Ngrams (Current semester only)

What are students actually talking about...

In [None]:
from CanvasHacks.Text.ngrams import BigramGetter, TrigramGetter

In [None]:
# Bigram and trigram getters for the current term, keyed by week number
bgrams = {}

tgrams = {}

for ws in week_stores:
    if ws.term == CURRENT_TERM:
        # no_stops_bag is rebuilt on every access, so it is computed twice per week here
        bgrams[ws.week] = BigramGetter()
        bgrams[ws.week].process(ws.no_stops_bag, min_freq=3, get_top=10)
        tgrams[ws.week] = TrigramGetter()
        tgrams[ws.week].process(ws.no_stops_bag, min_freq=3, get_top=10)
    

## bigrams

### Top PMI
Returns top 10 with highest Pointwise Mutual Information (i.e., which occur together more often than would be expected)

In [None]:
# Print the topPMI bigrams for each week from 7 through CURRENT_WEEK
for w in range(7, CURRENT_WEEK +1):
    print("========== week {} ==========".format(w))
    for b in bgrams[w].topPMI:
        print(b)

### Top likelihood ratio
Likelihood ratio is similar to tf-idf. It relates the frequency of word1, frequency of word2, and frequency of word1 word2 in the corpus.

In [None]:
# Print the top_likelihood_ratio bigrams for each week from 7 through CURRENT_WEEK
for w in range(7, CURRENT_WEEK +1):
    print("========== week {} ==========".format(w))
    for b in bgrams[w].top_likelihood_ratio:
        print(b)

### Raw frequencies (top 20)

In [None]:
# Print the first 20 raw_freq bigram items for each week from 7 through CURRENT_WEEK
for w in range(7, CURRENT_WEEK +1):
    print("========== week {} ==========".format(w))
    for b in bgrams[w].raw_freq[:20]:
        print(b)

In [None]:
# Distribution of the bigram frequencies of week 8 (the week is fixed here)
# raw_freq items are unpacked as (gram, frequency) pairs
d = pd.Series([f for gram, f in bgrams[8].raw_freq])
g = sns.distplot(d)
g.set_title('distribution of frequencies (wk 8)')

## Trigrams
### Top PMI

In [None]:
# Print the topPMI trigrams for each week from 7 through CURRENT_WEEK
for w in range(7, CURRENT_WEEK +1):
    print("========== week {} ==========".format(w))
    for t in tgrams[w].topPMI:
        print(t)

### Likelihood ratio

In [None]:
# Print the top_likelihood_ratio trigrams for each week from 7 through CURRENT_WEEK
for w in range(7, CURRENT_WEEK +1):
    print("========== week {} ==========".format(w))
    for t in tgrams[w].top_likelihood_ratio:
        print(t)

### Raw frequencies (top 20)

In [None]:
# Print the first 20 raw_freq trigram items for each week from 7 through CURRENT_WEEK
for w in range(7, CURRENT_WEEK +1):
    print("========== week {} ==========".format(w))
    for t in tgrams[w].raw_freq[:20]:
        print(t)

# Individual word freqs

In [None]:
from CanvasHacks.Text.stats import WordFreq

## Calculate and clean

SLOW: ~10 min

In [None]:
# How many rows to keep at each end of the sorted changes
CHANGES_CUTOFF = 30

freqs = []

# One record per word per term / week store
for d in week_stores:
    fq = WordFreq(d.no_stops_bag)
    for dct in fq.word_freq_dicts:
        ddd = {
            'term':  d.term,
            'week' :  d.week_num,
            'word': dct['word'],
            'freq': dct['count']
            }
        freqs.append(ddd)
freqs = pd.DataFrame(freqs)

# Make data for term frequency
past_sem = freqs[freqs.term != CURRENT_TERM]
# unnecessary for the past sem since we get the counts in a different way
cur_sem = freqs[freqs.term == CURRENT_TERM].drop(['term'], axis=1)

# Mean count of each word in each week over the past-term rows holding that word
# NOTE(review): a past term contributes to a word's mean only if it has
# a row for that word -- confirm that is the intended average
avg_word_counts = []
for g, v in past_sem.groupby(['week', 'word']):
    avg_word_counts.append({
        'week': g[0], 
        'word': g[1], 
        'avg_count' : v['freq'].mean()
    })

avg_word_counts = pd.DataFrame(avg_word_counts)
# put in one frame
a = avg_word_counts.set_index(['week', 'word'])
b = cur_sem.set_index(['week', 'word'])
comb = pd.concat([a, b], axis=1)
comb.reset_index(inplace=True)
# A word missing from one side gets a count of 0 there
comb.fillna(0, inplace=True)

def calc_delta(row):
    """
    Returns the row's freq minus its avg_count, counting
    a null freq as 0.
    """
    current_count = 0 if pd.isnull(row.freq) else row.freq
    return current_count - row.avg_count

# Drops rows with a null freq (none remain once nulls have been filled with 0)
comb = comb[~pd.isnull(comb.freq)]
# Current count minus the past mean count
comb['cnt_delta'] = comb.apply(lambda x: calc_delta(x), axis=1)

top_changes = []

# For every week keep both ends of the rows sorted by change in count
for w in weeks:
    week_frame = comb[comb.week == w].copy(deep=True)
    sorted_week = week_frame.sort_values('cnt_delta', ascending=False)
    r = { 'week': w, 
         'top_increased':  sorted_week[ : CHANGES_CUTOFF],
         'top_decreased': sorted_week[-CHANGES_CUTOFF : ]
        }
    top_changes.append(r)
# top_changes

## List changed frequencies

In [None]:
from IPython.display import display
from IPython.display import Latex

def display_increased_freq(frame, week):
    """
    Displays, as LaTeX, the 'top_increased' table of the first
    entry in frame whose 'week' equals the given week.
    """
    matches = [entry for entry in frame if entry['week'] == week]
    table = matches[0]['top_increased']
    caption = "Top increased freq (week {})".format(week)
    display(Latex(table.to_latex(caption=caption)))

def display_decreased_freq(frame, week):
    """
    Displays, as LaTeX, the 'top_decreased' table of the first
    entry in frame whose 'week' equals the given week.
    """
    matches = [entry for entry in frame if entry['week'] == week]
    table = matches[0]['top_decreased']
    caption = "Top decreased freq (week {})".format(week)
    display(Latex(table.to_latex(caption=caption)))

# Show both tables for each week from 7 through CURRENT_WEEK, leaving out week 9
for w in range(7, CURRENT_WEEK +1):
    if w != 9: #spring break
        display_increased_freq(top_changes, w)
        display_decreased_freq(top_changes, w)

# Wordclouds (term use and change from past)

In [None]:
import wordcloud
from CanvasHacks.Text.VisualizationTools import draw_cloud, draw_cloud_from_freqs, draw_cumulative_freq, clearplot_function

In [None]:
def draw_wordclouds(frame, week):
    """
    Draws three wordclouds for the given week: the current counts,
    the words whose count is above the past average, and the words
    whose count is below it.

    frame needs the columns week, word, avg_count, freq and cnt_delta.
    """
    week_rows = frame[frame.week == week].copy(deep=True)

    # make back into freqdist-like objects
    past_counts, current_counts = {}, {}
    gained, lost = {}, {}

    for _, row in week_rows.iterrows():
        word = row.word
        delta = row.cnt_delta
        past_counts[word] = row.avg_count
        current_counts[word] = row.freq
        if delta > 0:
            gained[word] = delta
        elif delta < 0:
            # stored as a positive size
            lost[word] = -delta

    draw_cloud_from_freqs(current_counts, title="Current semester frequencies (week {})".format(week))
    draw_cloud_from_freqs(gained, title="Increased frequency over past (week {})".format(week))
    draw_cloud_from_freqs(lost, title="Decreased frequency over past (week {})".format(week))

In [None]:
# Draw the wordclouds for each week from 7 through CURRENT_WEEK, leaving out week 9
for w in range(7, CURRENT_WEEK +1):
    if w != 9: #spring break
        draw_wordclouds(comb, w)

In [None]:
clearplot_function()

# Attic

In [None]:
# Collect (week, largest cnt_delta) pairs
j = []
for w, v in comb.groupby('week'):
    z = v.cnt_delta.max()
#     z = v.sort_values('cnt_delta', axis=1)[:5]
    j.append((w, z))
len(j)

In [None]:
j[0]

In [None]:
# All the stored bags may have been cleaned in different ways
# when stored. Thus we clean again to make sure standard
token_filter = TokenFiltrationMixin()

def filter_on_regex(word, rx=token_filter.to_remove_inc_stops_regex):
    """
    Returns the word when the pattern does not match at its start;
    returns None when it does.
    """
    if rx.match(word) is not None:
        return None
    return word

freqs.word = freqs.apply(lambda x: filter_on_regex(x.word), axis=1)
freqs.dropna(inplace=True)
len(freqs)

journal entries

    term: S/F year (S18, F20)
   
    course_id: Canvas course id
   
    id: canvas id of the journal assignment 
    
    week_num: Integer of the week of the journal 
    
    content: List of journal entries

journal entry

     sid: Author's canvas id
        
     body: Text after removing html tags. NB., may be blank if the student never turned it in. Keeping this since it can be a proxy for engagement
        
     bag: Wordbag including stopwords

In [None]:
# Aggregated by week
# For every week, the mean over past terms of the word count and of
# the sentiment per word

week_stats = []
hg = historical.groupby('week')
for week, h in hg:
    s = {'week' : week,
         'mean_word_count' : h.word_count.mean(),
         # the per-word sentiment column of the stats frames is named 'sentiment'
         'avg_word_sentiment' : h.sentiment.mean()}
    week_stats.append(s)
week_stats = pd.DataFrame(week_stats)

# week_stats

In [None]:
def make_datelist(words):
    """
    Returns, in their original order, the items of words which
    pd.to_datetime accepts without a TypeError or ValueError.
    """
    parseable = []
    for candidate in words:
        try:
            pd.to_datetime(candidate)
        except (TypeError, ValueError):
            continue
        parseable.append(candidate)
    return parseable

# NOTE(review): the loops in the cells visible here bind `w` to week
# numbers, which have no `.word`; presumably a frame with a word
# column (such as freqs) was meant -- confirm
datelist = make_datelist(w.word.tolist())
len(datelist)
        


In [None]:
# NOTE(review): `jj` is not assigned in any cell visible here, and the
# result of set_index is not stored -- confirm what this was meant to look up
jj.set_index(['week', 'word'])
jj.iloc[(1, "'*")]

In [None]:
len(freqs)

In [None]:
fq.plot(20)

## Make for terms


    term: S/F year (S18, F20)
   
    week_num
    
    bag: Concatenation of all bags for the week

In [None]:
# Print the term, week and word count of every loaded journal
for d in data:
    print(d.term, d.week_num)
    print(d.word_count)
    assert(d.word_count is not None)

In [None]:
def calc_sentiment(bag):
    """
    Total AFINN sentiment score of a wordbag.

    bag: List of word strings

    Returns None for an empty bag; otherwise the score afinn gives to
    the words joined by single spaces.
    """
    if not len(bag):
        return None
    return afinn.score(' '.join(bag))

# Quick smoke test of calc_sentiment on a tiny hand-made bag
t =['do',
 'it',
 'that',
 'bad']
calc_sentiment(t)

In [None]:
# Scratch cell: builds a single Stat by hand.
# NOTE(review): `out`, `term`, `week_num`, `num_empty` and `num_students`
# are only assigned inside make_combo_bag in the visible cells, so this
# cell presumably fails on a fresh kernel — confirm or delete.
Stat(
            term=term,
            week=week_num,
            total_sentiment=calc_sentiment(out['combo_bag']),
            word_count=len(out['combo_bag']),
            num_empty=num_empty,
            num_students=num_students)

In [None]:
def make_combo_bag(term, week_num, course_journals):
    """
    Combines all the wordbags for the week
    and computes various stats along the way.

    term: Term label (e.g. 'S20'); copied onto the Stat
    week_num: Week number; copied onto the Stat
    course_journals: List of 0 or more journal records. Each record is
        a dict whose 'content' is a list of entry dicts, and each
        entry dict has a 'bag' list of words.

    Returns a dictionary with
        'combo_bag': concatenation of every entry's bag across all the
            records in course_journals ([] when there are none)
        'stat': a Stat describing the combined bag, or None when
            course_journals is empty
    """
    out = {'combo_bag': [], 'stat': None}

    if len(course_journals) == 0:
        return out

    # todo Add a parallel creation of no stop stat objects once
    # we have a stored no_stop_bag
    num_empty = 0
    num_students = 0

    # Make the combo bag. Read from the argument (not a notebook-level
    # variable) and include every record, so a term with several
    # courses contributes all of its entries.
    for journal in course_journals:
        for entry in journal['content']:
            # One entry per author; empty bags are counted separately
            num_students += 1
            if len(entry['bag']) == 0:
                num_empty += 1
            out['combo_bag'] += entry['bag']

    # Create statistics
    out['stat'] = Stat(
        term=term,
        week=week_num,
        total_sentiment=calc_sentiment(out['combo_bag']),
        word_count=len(out['combo_bag']),
        num_empty=num_empty,
        num_students=num_students)

    return out
    

In [None]:
# Build the combined bag and the Stat for every term/week pair.
# `terms`, `weeks`, `data` and `store` are defined outside this cell.
stats_data = []

for t in terms:
    for w in weeks:
        try:
            # Journal records stored for this term and week
            entries = [d for d in data if d['term'] == t and d['week_num'] == w]
            print(len(entries))

            r = make_combo_bag(t, w, entries)
            store[t][w] = r['combo_bag']
            # stat is None when there were no journals for this
            # term/week; keep only real Stat objects in the list
            if r['stat'] is not None:
                stats_data.append(r['stat'])

        except IndexError:
            print('index error ', t, w)

        except KeyError as err:
            # Still best-effort (move on to the next week), but report
            # the skipped term/week instead of dropping it silently
            print('key error ', t, w, err)


In [None]:
# Display everything currently in `data` (output can be long)
data

stat

    term
    
    week
    
    total_sentiment
    
    word_count
    
    num_empty
    
    num_entries

In [None]:
# NOTE(review): in the visible cells `combo_bag` is only assigned inside
# commented-out code, so this presumably relies on leftover kernel state —
# confirm.
len(combo_bag)

In [None]:
# Spot check: how many journal records are stored for term 'F18', week 10
entries = [ d for d in data if d['term'] == 'F18' and d['week_num'] == 10]

len(entries)

In [None]:
# Compare number of non submissions

In [None]:
# Fetch the first configured assignment once. The previous loop over
# `journals` never used its loop variables: it repeated this identical
# API call for every journal and overwrote the notebook-level `j`.
# NOTE(review): if a per-course fetch was intended, it still needs writing.
assignment = environment.CONFIG.course.get_assignment(int(assignments[0][0]))
# Download submissions
subRepo = SubmissionRepository(assignment)

# Text body of every downloaded submission
bodies = [d.body for d in subRepo.data]

In [None]:
# Clean each non-missing body and turn it into a wordbag. `bagmaker` is
# created outside the visible cells — presumably a WordbagMaker; confirm.
bags = [bagmaker.process(TextCleaner.clean(b)) for b in bodies if b is not None]
len(bags)

In [None]:
# AFINN score of each non-missing body; this scores the body text
# directly, not the cleaned wordbag
scores = [afinn.score(b) for b in bodies if b is not None]
scores

In [None]:
# Try the bag maker on a sample sentence
bagmaker.process('I love dogs! Dogs are the best!')

In [None]:
# AFINN score of a sample sentence with positive wording
afinn.score('I love dogs! Dogs are the best!')

In [None]:
# AFINN score of a sample sentence with negative wording
afinn.score('I hate you Die! Die! Die! Die!')

In [None]:

# Grade every configured journal assignment and collect one result store
# per assignment.
# NOTE(review): `course`, `GRADING_LATE`, `DataStoreNew` and `JournalGrader`
# are not imported or assigned in the visible head of the notebook —
# confirm they are defined before this cell runs.
results = []

for a in environment.CONFIG.assignments:
    # canvas api object (a[0] is used as the assignment id)
    assignment = course.get_assignment(int(a[0]))
    # activity object to define the features 
    journal = Journal(**assignment.attributes)
    # Download submissions
    subRepo = SubmissionRepository(assignment)
    if GRADING_LATE:
        # parse out already graded submissions: keep only those whose
        # grade is not 'complete'
        subRepo.data =[j for j in subRepo.data if j.grade != 'complete']

    # shove the activity onto a sub repo so it will resemble
    # a quizrepo for the grader
    subRepo.activity = journal
    # Initialize the package for results
    # NOTE(review): this rebinds `store`, which an earlier cell indexes as
    # store[t][w]; re-running that cell after this one will not work.
    store = DataStoreNew(journal)
    # provisionally determine credit
    grader = JournalGrader(subRepo)
    store.results = grader.grade()

    results.append(store)

# Sentiment

ideas

Calculate the sentiment score for each student's bag and display overlapping KDEs (kernel density estimates) for each week

Calculate a global class sentiment score for each week

weight each word sentiment score by its tf-idf in the journal entry

In [None]:
def calc_average_sentiment(bag):
    """
    Average AFINN sentiment per item of `bag`.

    bag: Wordbag (list of word strings). A plain string is still
        accepted and scored as-is; the divisor is then len() of the
        string, as before.

    Returns None for an empty bag (the same convention calc_sentiment
    uses) instead of dividing by zero; otherwise the total score
    divided by len(bag).
    """
    if len(bag) == 0:
        return None
    # afinn.score works on text, so join a list of words into one
    # string first, the same way calc_sentiment does
    txt = bag if isinstance(bag, str) else ' '.join(bag)
    return afinn.score(txt) / len(bag)

In [None]:
# aggregate 