In [1]:
import pandas as pd

In [2]:
df_train_prompt = pd.read_csv('commonlit-evaluate-student-summaries/prompts_train.csv')
print("Full prompt train dataset shape is {}".format(df_train_prompt.shape))

df_train_summaries = pd.read_csv('commonlit-evaluate-student-summaries/summaries_train.csv')
print("Full summaries train dataset shape is {}".format(df_train_summaries.shape))

df_train = df_train_summaries.merge(df_train_prompt, on='prompt_id')
print("Full train dataset shape is {}".format(df_train.shape))

Full prompt train dataset shape is (4, 4)
Full summaries train dataset shape is (7165, 5)
Full train dataset shape is (7165, 8)


In [3]:
df_train.sample(0)

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text


In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')


# Assuming df is your DataFrame and 'text' is the column with the text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_text]
    
    return lemmatized_text

df_train['prompt_text'] = df_train['prompt_text'].apply(preprocess_text)
df_train['text'] = df_train['text'].apply(preprocess_text)
df_train['prompt_question'] = df_train['prompt_question'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
df_train.sample()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
6819,d23867fac8c4,39c16e,"[aristotle, first, state, tragedy, arranged, simple, complex, plan, mean, plot, interesting, detailed, paragraph, two, go, state, event, tragedy, excite, reader, let, feel, pity, fear, draw, reader, make, interested, paragraph, two, lastly, main, character, must, evil, person, must, also, perfect, citizen, main, character, error, judgement, frailty, evil, paragraph, two, overall, three, aspect, mentioned, plot, elaborate, plot, cause, excitement, intererest, main, hero, neither, evil, perfect]",2.582323,1.576853,"[summarize, least, 3, element, ideal, tragedy, described, aristotle]",On Tragedy,"[chapter, 13, sequel, already, said, must, proceed, consider, poet, aim, avoid, constructing, plot, mean, specific, effect, tragedy, produced, perfect, tragedy, seen, arranged, simple, complex, plan, moreover, imitate, action, excite, pity, fear, distinctive, mark, tragic, imitation, follows, plainly, first, place, change, fortune, presented, must, spectacle, virtuous, man, brought, prosperity, adversity, move, neither, pity, fear, merely, shock, u, bad, man, passing, adversity, prosperity, nothing, alien, spirit, tragedy, posse, single, tragic, quality, neither, satisfies, moral, sense, call, forth, pity, fear, downfall, utter, villain, exhibited, plot, kind, would, doubtless, satisfy, moral, sense, would, inspire, neither, pity, fear, pity, aroused, unmerited, misfortune, fear, misfortune, man, ...]"


In [6]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# import torch

# # Load the TinyBERT model
# tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
# model = AutoModelForSequenceClassification.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")

# # Check if a GPU is available and if not, use a CPU
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# # Move the model to the GPU
# model = model.to(device)

# # Assuming df is your DataFrame and 'text', 'question', and 'summary' are the columns with the text
# # 'grade' is the column with the grades
# texts = df_train['prompt_text'].tolist()
# questions = df_train['prompt_question'].tolist()
# summaries = df_train['text'].tolist()
# grades = df_train['content'].tolist()

# # Combine the text, question, and summary into a single string
# combined_texts = [f"{t} {q} {s}" for t, q, s in zip(texts, questions, summaries)]

# inputs = tokenizer(combined_texts, return_tensors='pt', truncation=True, padding=True)
# labels = torch.tensor([grades])  # Grades should be integer values

# # Move the inputs and labels to the GPU
# inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
# labels = labels.to(device)

# outputs = model(**inputs, labels=labels)
# loss = outputs.loss
# logits = outputs.logits


In [7]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Load the TinyBERT model
tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
model = AutoModelForSequenceClassification.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Check if a GPU is available and if not, use a CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the GPU
model = model.to(device)

# Assuming df_train is your DataFrame and 'prompt_text', 'prompt_question', and 'text' are the columns with the text
# 'content' is the column with the grades
texts = df_train['prompt_text'].tolist()
questions = df_train['prompt_question'].tolist()
summaries = df_train['text'].tolist()
grades = df_train['content'].tolist()

# Combine the text, question, and summary into a single string
combined_texts = [f"{t} {q} {s}" for t, q, s in zip(texts, questions, summaries)]

# Create a Dataset
dataset = TextDataset(combined_texts, grades, tokenizer, max_length=512)

# Create a DataLoader
dataloader = DataLoader(dataset, batch_size=16)  # Adjust batch size as needed

# Training loop
model.train()
for batch in dataloader:
    inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
    labels = batch['labels'].to(device)
    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()  # Don't forget to initialize and step the optimizer!


Some weights of the model checkpoint at huawei-noah/TinyBERT_General_4L_312D were not used when initializing BertForSequenceClassification: ['fit_denses.1.weight', 'cls.predictions.transform.LayerNorm.bias', 'fit_denses.3.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'fit_denses.0.weight', 'fit_denses.1.bias', 'cls.predictions.bias', 'fit_denses.0.bias', 'fit_denses.4.weight', 'fit_denses.2.bias', 'fit_denses.2.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'fit_denses.3.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'fit_denses.4.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a

RuntimeError: unique_by_key: failed to synchronize: cudaErrorAssert: device-side assert triggered

In [None]:
def jaccard_similarity(list1, list2):
    intersection = len(set(list1).intersection(set(list2)))
    union = len(set(list1)) + len(set(list2)) - intersection
    return float(intersection) / union

In [None]:
df_train['similarity'] = df_train.apply(lambda row: jaccard_similarity(row['text'], row['prompt_text']), axis=1)

In [None]:
df_train['similarity'].hist(bins=50)

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.scatter(df_train['content'], df_train['similarity'])

In [None]:
df_train['content'].corr(df_train['similarity'])