In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import HDBSCAN

from pathlib import Path
from typing import Optional

## 1 - Data loading

In [2]:
data_dir = Path('../data')

sample_submission = data_dir / 'sample_submission.csv'
summaries_train = data_dir / 'summaries_train.csv'
summaries_test = data_dir / 'summaries_test.csv'
prompts_train = data_dir / 'prompts_train.csv'
prompts_test = data_dir / 'prompts_test.csv'


In [3]:
backend ='pyarrow'

prompts_df = pd.read_csv(prompts_train, dtype_backend=backend)
summaries_df = pd.read_csv(summaries_train, dtype_backend=backend)

In [4]:
prompts_df

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [5]:
summaries_df

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757
...,...,...,...,...,...
7160,ff7c7e70df07,ebad26,They used all sorts of chemical concoctions to...,0.205683,0.380538
7161,ffc34d056498,3b9047,The lowest classes are slaves and farmers slav...,-0.308448,0.048171
7162,ffd1576d2e1b,3b9047,they sorta made people start workin...,-1.408180,-0.493603
7163,ffe4a98093b2,39c16e,An ideal tragety has three elements that make ...,-0.393310,0.627128


In [6]:
df_train = pd.merge(summaries_df, prompts_df, how='inner', on='prompt_id')
assert len(df_train) == len(summaries_df) # Make sure we're not loosing rows on the join

In [7]:
df_train

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...
...,...,...,...,...,...,...,...,...
7160,fef3e85236e5,39c16e,"It has to be made on a complex storyline, with...",-0.981265,-1.548900,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...
7161,ff0f65eecf02,39c16e,Aristotle descirbes an ideal tradgedy as being...,-0.511077,-1.589115,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...
7162,ff186473ea0a,39c16e,A tragedy should have a complex plan not a sim...,-0.834946,-0.593749,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...
7163,ff5e9e6068da,39c16e,Aristotle believed that the ideal tradegy shou...,-0.157460,-0.165811,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...


In [8]:
del prompts_df, summaries_df, backend

In [9]:
def make_split(summaries_path: Path, prompts_path: Path, dtype_backend: Optional[str] = 'pyarrow') -> pd.DataFrame:
    summaries_df = pd.read_csv(summaries_path, dtype_backend=dtype_backend)
    prompts_df = pd.read_csv(prompts_path, dtype_backend=dtype_backend)
    df = pd.merge(summaries_df, prompts_df, how='inner', on='prompt_id')

    if len(df) != len(summaries_df):
        raise AssertionError('Could not match all prompt ids to a prompt')
    
    return df

In [10]:
train_df = make_split(summaries_train, prompts_train)

In [11]:
train_df

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background The Third Wave experiment took pl...
...,...,...,...,...,...,...,...,...
7160,fef3e85236e5,39c16e,"It has to be made on a complex storyline, with...",-0.981265,-1.548900,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...
7161,ff0f65eecf02,39c16e,Aristotle descirbes an ideal tradgedy as being...,-0.511077,-1.589115,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...
7162,ff186473ea0a,39c16e,A tragedy should have a complex plan not a sim...,-0.834946,-0.593749,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...
7163,ff5e9e6068da,39c16e,Aristotle believed that the ideal tradegy shou...,-0.157460,-0.165811,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 As the sequel to what has already...


## 2 - Distribution of target columns

In [12]:
fig = px.violin(train_df, x=['content', 'wording'], title='Distributions')
fig.show()

fig = px.scatter(train_df, x='content', y='wording')
fig.show()

train_df[['content', 'wording']].corr()

Unnamed: 0,content,wording
content,1.0,0.75138
wording,0.75138,1.0


Medium / strong positive correlation between content and wording scores, but lots of weird noise + seemingly clustered points

In [13]:
df_train['diff'] = df_train['content'] - df_train['wording']

fig = px.histogram(df_train, x='diff', title='Difference between content and wording scores')
fig.update_layout(xaxis_title='content - wording')
fig.show()


fig = px.violin(df_train, x='diff', title='Distribution of difference between content and wording scrores')
fig.update_layout(xaxis_title='content - wording', yaxis_title='frequency')
fig.show()
(df_train['content'] - df_train['wording']).describe()

count    7165.000000
mean        0.048219
std         0.733257
min        -2.365703
25%        -0.414440
50%        -0.052624
75%         0.465765
max         3.395007
dtype: double[pyarrow]

In [14]:
hdb = HDBSCAN(min_cluster_size=18, cluster_selection_epsilon=0.15, cluster_selection_method='leaf', algorithm='brute')
hdb.fit(df_train[['content', 'wording']])

In [15]:
train_df['cluster'] = hdb.labels_
train_df.cluster = train_df.cluster.astype(str)
train_df['cluster_probs'] = hdb.probabilities_

In [16]:
# Plot clusters in feature space (already 2d so very easy)
fig = px.scatter(train_df, x='content', y='wording', color='cluster', title='Scoring clusters')#, size='cluster_probs', size_max=14)
fig.update_layout(showlegend=False)
fig.show()

# Cluster memberships
fig = px.bar(train_df.cluster.value_counts().reset_index(), x='cluster', y='count', title='Cluster sizes')
fig.show()

# Quality of clusters
train_df.cluster_probs.describe()

count    7165.000000
mean        0.832116
std         0.371548
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: cluster_probs, dtype: float64

In [17]:
def view_text_column(column: str) -> None:
    count_column = f'{column}_wordcount'
    df_train[count_column] = df_train[column].str.split(' ').apply(len)

    fig = px.violin(df_train, x=count_column, title='Distribution of word count')
    fig.update_layout(xaxis_title=column)
    fig.show()

text_columns = ['prompt_title', 'prompt_question', 'prompt_text', 'text']
for col in text_columns:
    view_text_column(col)

The prompt texts can be quite long, up to ~1000 words, and as expected student summarisations tend to be much shorter, arount 50 - 100 words.

In [18]:
fig = px.scatter(df_train, x='text_wordcount', y=['content', 'wording'])
fig.show()

wordcount_cols = [column for column in df_train.columns if 'wordcount' in column]
df_train[[*wordcount_cols, 'content', 'wording']].corr()

Unnamed: 0,prompt_title_wordcount,prompt_question_wordcount,prompt_text_wordcount,text_wordcount,content,wording
prompt_title_wordcount,1.0,0.360163,0.777516,0.115794,0.003555,-0.055743
prompt_question_wordcount,0.360163,1.0,-0.26147,0.142355,0.054521,0.033859
prompt_text_wordcount,0.777516,-0.26147,1.0,0.031264,-0.04701,-0.135258
text_wordcount,0.115794,0.142355,0.031264,1.0,0.785671,0.53154
content,0.003555,0.054521,-0.04701,0.785671,1.0,0.75138
wording,-0.055743,0.033859,-0.135258,0.53154,0.75138,1.0


In [19]:
from sentence_transformers import SentenceTransformer

In [20]:
model = SentenceTransformer('all-MiniLM-L6-v2')


The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.



/home/chris/repositories/student-summary-evaluation/.env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [21]:
model.max_seq_length = 512

In [22]:
text_embeddings = model.encode(df_train['text'], batch_size=128, show_progress_bar=True)

Batches:   0%|          | 0/56 [00:00<?, ?it/s]

In [23]:
text_embeddings.dtype

dtype('float32')

In [24]:
df_train['text_embeddings'] = pd.Series(list(text_embeddings))

In [25]:
df_train.prompt_question

0       Summarize how the Third Wave developed over su...
1       Summarize how the Third Wave developed over su...
2       Summarize how the Third Wave developed over su...
3       Summarize how the Third Wave developed over su...
4       Summarize how the Third Wave developed over su...
                              ...                        
7160    Summarize at least 3 elements of an ideal trag...
7161    Summarize at least 3 elements of an ideal trag...
7162    Summarize at least 3 elements of an ideal trag...
7163    Summarize at least 3 elements of an ideal trag...
7164    Summarize at least 3 elements of an ideal trag...
Name: prompt_question, Length: 7165, dtype: string[pyarrow]