In [1]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import pickle
import copy
from matplotlib import pyplot as plt

import nltk
from nltk.parse import CoreNLPParser
import statistics

from word_freq import preprocess

from collections import Counter

from sklearn import preprocessing

In [2]:
# Grade level whose subtest files are processed; interpolated into the text-file names below.
grade = "3"

# NOTE(review): nothing is interpolated into this path, so the file name is literally ".csv".
# A grade-specific name may have been intended — confirm before using it.
target_path = "../../data/question_correctness/.csv"

# Item-level score workbook (its columns cover several grades).
scores_path = "../../Gates.ReadComp_By-Item_Gr3-5(CM).xlsx"

# Passage and question text files for the selected grade.
corpus_path = "../../subtest_txt/gr{}_paragraphs.txt".format(grade)
questions_path = "../../subtest_txt/gr{}_questions.txt".format(grade)

### Subtest Embedding

In [3]:
# Every line of the corpus file is kept as one subtest passage (trailing newline included).
with open(corpus_path,'r') as fp:
    sub_tests = fp.readlines()

# Item-level scores; everything after the first column is treated as a score column.
df = pd.read_excel(scores_path)
sub_test_number = df.columns[1:]

print(sub_test_number)

# Inclusive (first question, last question) numbers for each subtest, in passage order.
questions_ranges = [
    (1,5),
    (6,8),
    (9,13),
    (14,16),
    (17,21),
    (22,27),
    (28,30),
    (31,35),
    (36,40),
    (41,43),
    (44,48),
]



Index(['Gr3.RC.Gates_01', 'Gr3.RC.Gates_02', 'Gr3.RC.Gates_03',
       'Gr3.RC.Gates_04', 'Gr3.RC.Gates_05', 'Gr3.RC.Gates_06',
       'Gr3.RC.Gates_07', 'Gr3.RC.Gates_08', 'Gr3.RC.Gates_09',
       'Gr3.RC.Gates_10',
       ...
       'Gr5.RC.Gates_45', 'Gr5.RC.Gates_46', 'Gr5.RC.Gates_47',
       'Gr5.RC.Gates_48', 'Gr5.RC.Gates_RawScore', 'Gr5.RC.Gates_GradeEquiv',
       'Gr5.RC.Gates_NCE', 'Gr5.RC.Gates_NPR', 'Gr5.RC.Gates_NS',
       'Gr5.RC.Gates_ESS'],
      dtype='object', length=162)


In [7]:
# Sentence-embedding model; this same instance is reused later to encode questions and answers.
model = SentenceTransformer('paraphrase-mpnet-base-v2')

# Set the model's maximum sequence length to 500
# (presumably so that long passages are not cut short — TODO confirm the model's limit).
model.max_seq_length = 500

# One embedding row per entry of sub_tests.
sub_tests_embed = model.encode(sub_tests, show_progress_bar=True)

print(sub_tests_embed.shape)

Batches: 100%|██████████| 1/1 [00:05<00:00,  5.08s/it]

(11, 768)





### Subtest rare words

In [8]:
# Per-subtest rare-word ratios, filled by the loop that follows this definition.
sub_tests_rare = []
def get_rare_words_perc(text,wordlist):
    """Return the fraction of tokens in ``text`` that are absent from ``wordlist``.

    Args:
        text: Raw text; it is tokenised with ``preprocess``.
        wordlist: Container of known words, queried with ``in``.

    Returns:
        A float in [0, 1]. Text that yields no tokens returns 0.0 instead of
        raising ZeroDivisionError.
    """
    tokens = preprocess(text)
    # Guard the division: an empty or fully filtered-out text has no tokens.
    if len(tokens) == 0:
        return 0.0
    rare_count = sum(1 for t in tokens if t not in wordlist)
    return rare_count/len(tokens)

# Known-word list used as the reference vocabulary.
# NOTE: pickle.load can execute arbitrary code, so only load a trusted file.
with open("wordlist.pkl",'rb') as fp:
    wordlist = pickle.load(fp)

# One rare-word ratio per subtest passage, in passage order.
sub_tests_rare.extend(get_rare_words_perc(st, wordlist) for st in sub_tests)

print(sub_tests_rare)
# Average ratio across all passages.
print(sum(sub_tests_rare)/len(sub_tests_rare))

[0.4358974358974359, 0.4146341463414634, 0.41818181818181815, 0.5161290322580645, 0.5, 0.55, 0.42424242424242425, 0.5660377358490566, 0.45098039215686275, 0.47368421052631576, 0.4583333333333333]
0.4734655026169795


In [9]:

# Mean, standard deviation, minimum and maximum of the rare-word ratios, in that order.
for summarize in (np.mean, np.std, np.min, np.max):
    print(summarize(sub_tests_rare, axis=0))

0.4734655026169795
0.05048645170960405
0.4146341463414634
0.5660377358490566


In [None]:
# Read awl.txt as one long string, tokenise it and count every token.
with open("awl.txt",'r') as file:
    corpus = file.read().replace('\n', ' ')
tokens = preprocess(corpus)
freq = Counter(tokens)
print(freq)
# Distinct tokens of awl.txt; membership in this set is what the loop below reports.
rare_list = freq.keys()

# For each passage, print its index, every token found in rare_list, and the hit count.
for i, passage in enumerate(sub_tests):
    print(i)
    tokens = preprocess(passage)
    hits = [t for t in tokens if t in rare_list]
    for t in hits:
        print(t)
    counter = len(hits)
    print('total:', counter)

### Calculate parse tree depth

In [13]:
def get_average_depth(parser,text):
    """Return the mean parse-tree height over the sentences of ``text``.

    ``text`` is split with ``nltk.sent_tokenize``; for each sentence the first
    tree yielded by ``parser.raw_parse`` is taken and its ``height()`` recorded.

    Raises:
        statistics.StatisticsError: if ``text`` contains no sentences.
    """
    heights = [
        next(parser.raw_parse(sentence)).height()
        for sentence in nltk.sent_tokenize(text)
    ]
    return statistics.mean(heights)

def get_depth_stats(parser,text):
    """Return ``[mean, stdev, max]`` of the parse-tree heights of the sentences in ``text``.

    ``text`` is split with ``nltk.sent_tokenize``; for each sentence the first
    tree yielded by ``parser.raw_parse`` is taken and its ``height()`` recorded.

    Args:
        parser: Object whose ``raw_parse(sentence)`` returns an iterator of trees.
        text: Text to analyse.

    Returns:
        A three-element list ``[mean height, sample standard deviation, max height]``.
        A single-sentence text reports a standard deviation of 0.0, because
        ``statistics.stdev`` requires at least two data points and would raise.

    Raises:
        statistics.StatisticsError: if ``text`` contains no sentences.
    """
    depths = [next(parser.raw_parse(s)).height() for s in nltk.sent_tokenize(text)]

    # A lone sentence has no spread; avoid StatisticsError from stdev.
    spread = statistics.stdev(depths) if len(depths) > 1 else 0.0

    return [statistics.mean(depths), spread, max(depths)]

# Parser client pointed at a CoreNLP endpoint on localhost:9000.
# NOTE(review): presumably that server must already be running before any depth cell is executed — confirm.
parser = CoreNLPParser(url='http://localhost:9000')

# parse = next(parser.raw_parse(nltk.sent_tokenize('Snow turns blue when blue iceworms live in it.')[0]))
# print(parse.height())
# parse.draw()



### Questions and Answers Embedding

In [10]:
# The questions file alternates lines: question, answer, question, answer, ...
# Exactly 48 pairs (96 lines) are read; trailing newlines are kept.
with open(questions_path,'r') as fp:
    qa_lines = [fp.readline() for _ in range(2 * 48)]

questions = qa_lines[0::2]
answers = qa_lines[1::2]

print(len(questions))
print(len(answers))



48
48


In [11]:
# Encode questions and answers with the same model used for the passages;
# each call yields one embedding row per input string.
questions_embed = model.encode(questions, show_progress_bar=True)
answers_embed = model.encode(answers, show_progress_bar=True)

print(questions_embed.shape)
print(answers_embed.shape)

Batches: 100%|██████████| 2/2 [00:01<00:00,  1.33it/s]
Batches: 100%|██████████| 2/2 [00:01<00:00,  1.72it/s]

(48, 768)
(48, 768)





### Sentence Complexity

In [14]:
# [mean, stdev, max] parse-tree height for every subtest passage.
depths = [get_depth_stats(parser,text) for text in sub_tests]

# Mean parse-tree height for every question.
questions_depth = [get_average_depth(parser,q) for q in questions]

# Declared but not filled in this cell.
answer_depth = []


In [32]:
# One [mean, stdev, max] entry per subtest, as returned by get_depth_stats.
print(depths)

[[9, 2.0, 12], [8.75, 3.1959796173138706, 15], [8, 1.3416407864998738, 11], [7.375, 1.9226098333849673, 11], [8.235294117647058, 2.305683514836378, 12], [10.333333333333334, 3.9619401430321606, 16], [8, 3.2145502536643185, 13], [9.571428571428571, 2.14919697074224, 14], [8.071428571428571, 2.234839031005476, 11], [8.75, 1.9086270308410553, 11], [7.181818181818182, 2.238970078627004, 12]]


In [15]:
# One row per question: the depth stats of its passage followed by the question's own depth.
complexity = []
for index, q_range in enumerate(questions_ranges):
    entry = list(depths[index])
    first_q, last_q = q_range
    # Question numbers are 1-based and the range is inclusive.
    for j in range(first_q, last_q + 1):
        q_depth = questions_depth[j-1]
        new_entry = entry + [q_depth]
        print(q_depth)
        print(new_entry)

        complexity.append(new_entry)

7
[9, 2.0, 12, 7]
6
[9, 2.0, 12, 6]
7
[9, 2.0, 12, 7]
6
[9, 2.0, 12, 6]
6
[9, 2.0, 12, 6]
7
[8.75, 3.1959796173138706, 15, 7]
9
[8.75, 3.1959796173138706, 15, 9]
6
[8.75, 3.1959796173138706, 15, 6]
7
[8, 1.3416407864998738, 11, 7]
9
[8, 1.3416407864998738, 11, 9]
8
[8, 1.3416407864998738, 11, 8]
8
[8, 1.3416407864998738, 11, 8]
10
[8, 1.3416407864998738, 11, 10]
9
[7.375, 1.9226098333849673, 11, 9]
7
[7.375, 1.9226098333849673, 11, 7]
10
[7.375, 1.9226098333849673, 11, 10]
10
[8.235294117647058, 2.305683514836378, 12, 10]
6
[8.235294117647058, 2.305683514836378, 12, 6]
7
[8.235294117647058, 2.305683514836378, 12, 7]
8
[8.235294117647058, 2.305683514836378, 12, 8]
8
[8.235294117647058, 2.305683514836378, 12, 8]
8
[10.333333333333334, 3.9619401430321606, 16, 8]
10
[10.333333333333334, 3.9619401430321606, 16, 10]
11
[10.333333333333334, 3.9619401430321606, 16, 11]
7
[10.333333333333334, 3.9619401430321606, 16, 7]
9
[10.333333333333334, 3.9619401430321606, 16, 9]
11
[10.333333333333334, 3.

In [35]:
# EDA, no need to run
# Rebinds `complexity` from a list of rows to a 2-D array so its columns can be sliced.
complexity = np.asarray(complexity)
complexity.shape

(48, 4)

In [40]:
# EDA, no need to run
# Import from the public scipy.stats namespace; scipy.stats.stats is a deprecated private alias.
from scipy.stats import pearsonr
# Convert here so the cell works whether `complexity` is still a list of rows or already an array.
complexity_arr = np.asarray(complexity)
# Correlation between the passage's max depth (column 2) and the question depth (column 3).
print(pearsonr(complexity_arr[:,2],complexity_arr[:,3]))

(0.11015698339985273, 0.4560630509727088)


In [13]:
# EDA, no need to run
# Import from the public scipy.stats namespace; scipy.stats.stats is a deprecated private alias.
from scipy.stats import pearsonr
# Use a separate name: rebinding `depths` to an array would break the later
# list concatenation `skills + depths[index] + ...` in the feature-assembly cell.
depths_eda = np.asarray(depths)
# Correlation between the mean (column 0) and max (column 2) depth of each passage.
print(pearsonr(depths_eda[:,0],depths_eda[:,2]))

(0.7155376100784874, 0.013292661910529524)

In [23]:
#interactive plotting in separate window
%matplotlib qt 

depths_arr = np.asarray(depths)[:,0]
print(depths_arr)
print(len(sub_tests))
plt.plot(np.arange(1,12),depths_arr,'--',label='Mean',color='#757559')
plt.fill_between(np.arange(1,12), depths_arr - np.asarray(depths)[:,1],
                 depths_arr + np.asarray(depths)[:,1], color="#d6d6a3")#F5F5BA
plt.plot(np.arange(1,12),np.asarray(depths)[:,2],color='#757559', label='Max')
plt.xticks(np.arange(1, 12, step=1))
plt.yticks(np.arange(3, 17, step=1))
plt.xlabel('Text Number')
plt.ylabel('Sentence Complexity')
plt.tight_layout()
# plt.title('Subtest-wise Sentence Complexity Distribution')
plt.ylim(3,17)
# plt.xlim(1,11)
plt.legend()
plt.show()

[ 9.          8.75        8.          7.375       8.23529412 10.33333333
  8.          9.57142857  8.07142857  8.75        7.18181818]
11


In [52]:
# Array copy of the passage depth stats (not used in this cell).
new_depth = np.asarray(depths)

# Mean, standard deviation, minimum and maximum of the question depths, in that order.
for summarize in (np.mean, np.std, np.min, np.max):
    print(summarize(questions_depth, axis=0))

8.375
1.943203969393503
6
13


### Lexical Surprisal

In [16]:
# Load the pre-computed surprisal table; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code, so only load a trusted file.
with open('surprisal.pkl','rb') as fp:
    surprisal = pickle.load(fp)
# Force a float array so the normalized values are never truncated to integers.
surprisal = np.asarray(surprisal, dtype=float)
print(surprisal.shape)

# Min-max normalize every column to [0, 1].
# A constant column has zero range and yields NaN.
col_min = surprisal.min(axis=0)
col_range = surprisal.max(axis=0) - col_min
surprisal = (surprisal - col_min) / col_range

print(np.mean(surprisal,axis=0))
print(np.std(surprisal,axis=0))
print(np.min(surprisal,axis=0))
print(np.max(surprisal,axis=0))

# Back to nested lists so rows can be concatenated with other feature lists later.
surprisal = surprisal.tolist()

(12, 3)
[0.44681111 0.49253705 0.56750447]
[0.30261789 0.26272334 0.30222353]
[0. 0. 0.]
[1. 1. 1.]


In [30]:
# Load the raw (un-normalized) surprisal table for plotting only.
# It is bound to its own name so the normalized `surprisal` list used by the
# feature-assembly cell is not overwritten when the notebook runs top to bottom.
# NOTE: pickle.load can execute arbitrary code, so only load a trusted file.
with open('surprisal.pkl','rb') as fp:
    surprisal_raw = np.asarray(pickle.load(fp))

# Row 0 is skipped; rows 1..11 are plotted as texts 1..11.
# Columns are plotted as mean, stdev and max respectively.
surprisal_mean = surprisal_raw[1:,0]
surprisal_std = surprisal_raw[1:,1]
surprisal_max = surprisal_raw[1:,2]

text_numbers = np.arange(1,12)

# Dashed mean line with a +/- one-stdev band, plus the per-text maximum.
plt.plot(text_numbers,surprisal_mean,'--',label='Mean',color='#757559')
plt.fill_between(text_numbers, surprisal_mean - surprisal_std,
                 surprisal_mean + surprisal_std, color="#d6d6a3")
plt.plot(text_numbers,surprisal_max,color='#757559', label='Max')
plt.xticks(np.arange(1, 12, step=1))
plt.xlabel('Text Number')
plt.ylabel('Lexical Surprisal')
# plt.title('Subtest-wise Lexical Surprisal Distribution')
plt.ylim(0,0.140)
plt.tight_layout()
plt.legend()
plt.show()

### Feature names

In [17]:
# Workbook holding the per-student skill scores (the trailing comment keeps an alternative path).
feature_path = "../../SoR_Alberta.Shared.Data.and.Codebook.xlsx"#"../data/gr3/gr3_features.xlsx"
# Columns used as skill features; their order fixes the order of values in each feature vector.
feature_names = ['G3.PPVT.Vocab.raw',
                 'G3.Elision.PA.raw',
                 'G3.Syn.GramCorrect.raw',
                 'G3.TOWRE.PDE.raw',
                 'G3.DigitSpan.raw',
                 'G3.WordID.raw',
                ]

# Loaded with pandas defaults (no sheet or index column specified).
df2 = pd.read_excel(feature_path)


In [18]:
# remove non-full entries
# A row is dropped when any selected feature column holds a negative value.
# NaN compares False against `< 0`, so rows with NaN are kept.
has_negative = (df2[feature_names] < 0).any(axis=1)
df2.drop(df2.index[has_negative], inplace=True)

In [19]:
# add towre scores
# New column = SWE raw score + PDE raw score.
# NOTE(review): 'G3.TOWRE.SWE.raw' is not in feature_names, so the negative-value filter did not
# screen it, and the new 'G3.TOWRE.raw' column is not in feature_names either — confirm intent.
df2['G3.TOWRE.raw'] = df2['G3.TOWRE.SWE.raw'] + df2['G3.TOWRE.PDE.raw']

In [20]:
# Summary statistics of each selected feature before normalization.
for name in feature_names:
    print(name)
    print(df2[name].describe())

# normalize all feature using min-max normalization
# Applied to every column of df2, not only the ones in feature_names.
col_min = df2.min()
col_range = df2.max() - col_min
normalized_df2 = (df2 - col_min) / col_range


G3.PPVT.Vocab.raw
count    138.000000
mean      31.949275
std        4.930647
min       19.000000
25%       29.000000
50%       32.000000
75%       36.000000
max       46.000000
Name: G3.PPVT.Vocab.raw, dtype: float64
G3.Elision.PA.raw
count    138.000000
mean      23.014493
std        6.097727
min        8.000000
25%       17.250000
50%       25.000000
75%       28.000000
max       33.000000
Name: G3.Elision.PA.raw, dtype: float64
G3.Syn.GramCorrect.raw
count    138.000000
mean       9.565217
std        3.266634
min        2.000000
25%        7.000000
50%       10.000000
75%       12.000000
max       16.000000
Name: G3.Syn.GramCorrect.raw, dtype: float64
G3.TOWRE.PDE.raw
count    138.000000
mean      24.471014
std       12.803508
min        0.000000
25%       15.000000
50%       24.500000
75%       34.000000
max       51.000000
Name: G3.TOWRE.PDE.raw, dtype: float64
G3.DigitSpan.raw
count    138.000000
mean      13.173913
std        2.308072
min        8.000000
25%       12.000000
50%

### Assemble all features into one list

In [21]:
# Build one record per (student, question) pair. Record layout:
#   [feature vector, subtest embedding, question embedding, label, (subtest index, question number)]
# The feature vector is: skill scores, passage depth stats, passage surprisal stats,
# passage rare-word ratio, question depth.
data = []


combined = []

for i in normalized_df2.index:

    # Normalized skill scores of this student, in feature_names order.
    skills = [normalized_df2.loc[i, name] for name in feature_names]

    for index, q_range in enumerate(questions_ranges):
        # list(...) keeps this a plain list concatenation even if `depths` or `surprisal`
        # has been rebound to a NumPy array by an EDA/plotting cell.
        # NOTE(review): the surprisal plotting cell skips row 0 (plots rows 1..11 as texts 1..11)
        # while this uses rows 0..10 — confirm which row belongs to which subtest.
        passage_features = (skills + list(depths[index]) + list(surprisal[index])
                            + [sub_tests_rare[index]])

        for j in range(q_range[0], q_range[1] + 1):
            # Rows of `df` are looked up by the same index label as the feature table.
            # NOTE(review): assumes both workbooks list students in the same row order — confirm.
            response = df.loc[i, f"Gr{grade}.RC.Gates_{j:02d}"]

            if response == 1:
                label = 1
            elif response in (2, 0):
                label = 0
            else:
                # Without a label the record would be one element short and the
                # (subtest, question) tuple would sit in the label slot, so skip it.
                print(f"DNE: row {i}, question {j}, response {response!r}; record skipped")
                continue

            features = np.asarray(passage_features + [questions_depth[j-1]])

            # skill features, subtest embedding (own copy), question embedding, label,
            # (reading number, question number)
            data.append([
                features,
                np.array(sub_tests_embed[index]),
                questions_embed[j-1],
                label,
                (index, j),
            ])

print(data[0][0])
print(data[400][4])
    

[ 0.51851852  0.72        0.21428571  0.43137255  0.25        0.7
  9.          2.         12.          0.47353246  0.46016297  0.44980291
  0.43589744  7.        ]
(4, 17)


## Save pre-computed features

In [56]:
# Persist the assembled records so later runs can skip the feature computation.
with open("data.pkl",'wb') as fp:
    pickle.dump(data, fp)


In [57]:
# Read the saved records back into a separate name, leaving `data` untouched.
# NOTE: pickle.load can execute arbitrary code, so only load a trusted file.
with open("data.pkl",'rb') as fp:
    new_data = pickle.load(fp)