In [31]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import pickle
import copy

import nltk
from nltk.parse import CoreNLPParser
import statistics

from sklearn import preprocessing

In [15]:
# Grade level being processed. It is interpolated into the passage/question
# file names below and into the score column names when responses are read.
grade = "3"
# NOTE(review): this f-string has no placeholder and the file name is a bare
# ".csv" -- it looks like part of the name was dropped; confirm the intended
# path. The variable is not used by any cell shown in this notebook.
target_path = f"../../data/question_correctness/.csv"
# Excel sheet loaded with pd.read_excel into `df` (per-item Gates columns).
scores_path = "../../Gates.ReadComp_By-Item_Gr3-5(CM).xlsx"
# Text file read line by line; each line becomes one sub-test passage.
corpus_path = f"../../subtest_txt/gr{grade}_paragraphs.txt"
# Text file read as alternating question / answer lines.
questions_path = f"../../subtest_txt/gr{grade}_questions.txt"

### Subtest Embedding

In [16]:
# Every line of the corpus file is one sub-test passage; lines are kept
# exactly as read (including the trailing newline).
with open(corpus_path, 'r') as fp:
    sub_tests = fp.readlines()

# Score sheet; all columns after the first are shown for reference.
df = pd.read_excel(scores_path)
sub_test_number = df.columns[1:]
print(sub_test_number)

# Inclusive (first, last) question numbers, one pair per passage in passage
# order: the position of a pair in this list is used as the passage index.
# NOTE(review): the saved output reports 12 embedded passages but only 11
# ranges are listed here -- confirm which passage has no questions.
questions_ranges = [
    (1, 5), (6, 8), (9, 13), (14, 16), (17, 21), (22, 27), (28, 30),
    (31, 35), (36, 40), (41, 43), (44, 48),
]

model = SentenceTransformer('paraphrase-mpnet-base-v2')

# Set the encoder's maximum sequence length to 500.
model.max_seq_length = 500

sub_tests_embed = model.encode(sub_tests, show_progress_bar=True)
print(sub_tests_embed.shape)

Index(['Gr3.RC.Gates_01', 'Gr3.RC.Gates_02', 'Gr3.RC.Gates_03',
       'Gr3.RC.Gates_04', 'Gr3.RC.Gates_05', 'Gr3.RC.Gates_06',
       'Gr3.RC.Gates_07', 'Gr3.RC.Gates_08', 'Gr3.RC.Gates_09',
       'Gr3.RC.Gates_10',
       ...
       'Gr5.RC.Gates_45', 'Gr5.RC.Gates_46', 'Gr5.RC.Gates_47',
       'Gr5.RC.Gates_48', 'Gr5.RC.Gates_RawScore', 'Gr5.RC.Gates_GradeEquiv',
       'Gr5.RC.Gates_NCE', 'Gr5.RC.Gates_NPR', 'Gr5.RC.Gates_NS',
       'Gr5.RC.Gates_ESS'],
      dtype='object', length=162)


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))


(12, 768)


### Calculate parse tree depth

In [41]:
def get_average_depth(parser,text):
    """Return the mean parse-tree height over the sentences of ``text``.

    ``text`` is split with ``nltk.sent_tokenize``; each sentence is passed to
    ``parser.raw_parse`` and the height of the first tree it yields is
    recorded.

    Raises ``statistics.StatisticsError`` when ``text`` yields no sentences.
    """
    heights = [
        next(parser.raw_parse(sentence)).height()
        for sentence in nltk.sent_tokenize(text)
    ]
    return statistics.mean(heights)

def get_depth_stats(parser,text):
    """Summarise parse-tree heights over the sentences of ``text``.

    ``text`` is split with ``nltk.sent_tokenize``; each sentence is passed to
    ``parser.raw_parse`` and the height of the first tree it yields is
    recorded.

    Returns a list ``[mean, stdev, max]`` of those heights. When ``text``
    contains a single sentence the standard deviation is reported as ``0.0``
    (``statistics.stdev`` is undefined for fewer than two data points and
    would otherwise raise).

    Raises ``statistics.StatisticsError`` when ``text`` yields no sentences.
    """
    depths = [
        next(parser.raw_parse(sentence)).height()
        for sentence in nltk.sent_tokenize(text)
    ]

    # statistics.stdev needs at least two values; a lone sentence has no spread.
    spread = statistics.stdev(depths) if len(depths) > 1 else 0.0

    return [statistics.mean(depths), spread, max(depths)]

# Shared parser handle passed to get_average_depth / get_depth_stats.
# It targets http://localhost:9000, so a CoreNLP server is presumably expected
# to be listening there before any depth helper is called -- confirm setup.
parser = CoreNLPParser(url='http://localhost:9000')

# parse = next(parser.raw_parse(nltk.sent_tokenize(sub_tests[7])[4]))
# print(parse.height())
# parse.draw()



### Questions and Answers Embedding

In [18]:
# The questions file alternates lines: question, answer, question, answer...
# Read the first 48 pairs (96 lines). readline() returns '' past end-of-file,
# so a short file yields empty strings rather than an error.
with open(questions_path, 'r') as fp:
    qa_lines = [fp.readline() for _ in range(2 * 48)]

questions = qa_lines[0::2]
answers = qa_lines[1::2]

print(len(questions))
print(len(answers))

# Embed questions and answers separately with the same sentence encoder.
questions_embed = model.encode(questions, show_progress_bar=True)
answers_embed = model.encode(answers, show_progress_bar=True)

print(questions_embed.shape)
print(answers_embed.shape)

48
48


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=2.0), HTML(value='')))


(48, 768)
(48, 768)


### Sentence Complexity

In [28]:
# Per passage: [mean, stdev, max] of sentence parse-tree heights.
depths = [get_depth_stats(parser, passage) for passage in sub_tests]

# Per question / per answer: mean sentence parse-tree height.
questions_depth = [get_average_depth(parser, question) for question in questions]
answer_depth = [get_average_depth(parser, answer) for answer in answers]

### Word Frequency

### Feature names

In [32]:
# Excel workbook loaded with pd.read_excel when the features are assembled.
feature_path = "../../SoR_Alberta.Shared.Data.and.Codebook.xlsx"#"../data/gr3/gr3_features.xlsx"
# Columns copied as-is into each student's skill vector.
# NOTE(review): the "G3." prefix is hard-coded while `grade` is configurable;
# confirm these names if the notebook is run for another grade.
feature_names = ['G3.PPVT.Vocab.raw',
                 'G3.Elision.PA.raw',
                 'G3.Syn.GramCorrect.raw',
                 'G3.OL.Spell.Total',
                 'G3.DigitSpan.raw',]
# Columns that are max-normalised and summed into one combined
# word-recognition score per student.
combined_feature_names = [
                 'G3.TOWRE.SWE.raw',
                 'G3.TOWRE.PDE.raw',
                 'G3.WordID.raw',
                 ]

### Assemble all features into one list

In [40]:
# Build one training record per (student, question):
#   [feature vector, passage embedding, question embedding, answer embedding, label]
# NOTE(review): rows of `df` (scores) and `df2` (skill features) are matched
# purely by row label -- this assumes both sheets list students in the same
# order; confirm.
data = []
df2 = pd.read_excel(feature_path)

# --- Combined word-recognition score -------------------------------------
# A negative value in any combined column is treated as "not available" and
# the row is left out. `combined_rows` records which df row each retained
# entry came from so the score can later be looked up by row label instead of
# by position (positions drift as soon as one row is skipped).
combined = []
combined_rows = []
for i in df.index:
    values = [df2[name][i] for name in combined_feature_names]
    if any(value < 0 for value in values):
        continue
    combined.append(values)
    combined_rows.append(i)

combined = np.asarray(combined)

print(combined.shape)

# Scale every column by its maximum absolute value, then sum across columns.
combined = preprocessing.normalize(combined, norm='max',axis=0)

word_recog = np.sum(combined, axis = 1)
print(word_recog.shape)

# Row label -> combined score, for rows that passed the filter above.
word_recog_by_row = dict(zip(combined_rows, word_recog))

# --- Per-student, per-question records -----------------------------------
for i in df.index:
    # Without a combined score the feature vector would be incomplete.
    if i not in word_recog_by_row:
        continue

    # Retrieve all skill features; a negative value marks the row unavailable.
    is_available = True
    skills = []
    for name in feature_names:
        value = df2[name][i]
        if value < 0:
            is_available = False
            break
        skills.append(value)
    if not is_available:
        continue
    skills.append(word_recog_by_row[i])

    for index, q_range in enumerate(questions_ranges):
        # Student skills followed by this passage's [mean, stdev, max] depth.
        passage_features = skills + depths[index]

        for j in range(q_range[0], q_range[1] + 1):
            response = df[f"Gr{grade}.RC.Gates_"+"{:02d}".format(j)][i]
            if response == 1:
                label = 1
            elif response in (2, 0):
                label = 0
            else:
                # Unrecognised response: report it and skip, so every record
                # in `data` carries a label and has the same length.
                print('DNE')
                continue

            # Append the question/answer parse depths to the shared features.
            features = np.asarray(
                passage_features + [questions_depth[j-1], answer_depth[j-1]]
            )
            data.append([
                features,
                copy.deepcopy(sub_tests_embed[index]),
                questions_embed[j-1],
                answers_embed[j-1],
                label,
            ])

print(data[0][0])
print(data[48][0])
    

(139, 3)
(139,)
[33.         26.          5.          8.         11.          1.83187109
  7.66666667  1.3662601  10.          7.          4.        ]
[29.         26.          7.          8.         11.          1.71471281
  7.66666667  1.3662601  10.          7.          4.        ]


In [22]:
# Serialise the assembled `data` list to data.pkl in the current working directory.
with open("data.pkl",'wb') as fp:
    pickle.dump(data, fp)


In [23]:
# Read the pickle back into `new_data` as a round-trip check.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that were produced by this notebook or another trusted source.
with open("data.pkl",'rb') as fp:
    new_data = pickle.load(fp)

In [24]:
# Length of the feature vector in the second reloaded record.
# NOTE(review): the saved output shows 13, while the vectors printed by the
# assembly cell have 11 values, and the execution counts are out of order --
# the pickle was presumably written by an earlier run; re-run to confirm.
print(len(new_data[1][0]))
# print(len(new_data))

13
