# 4. Feature Engineering

   - Explore potential features from the text, such as summary length, unique word count, etc.
   - Analyze prompt texts to see if they can offer additional features.
   - Discuss and implement feature extraction methods together.


## Previous codes

In [2]:
import pandas as pd
from transformers import BertTokenizer


# Load datasets
prompts_test = pd.read_csv("../data/prompts_test.csv")
prompts_train = pd.read_csv("../data/prompts_train.csv")
summaries_test = pd.read_csv("../data/summaries_test.csv")
summaries_train = pd.read_csv("../data/summaries_train.csv")

# Drop student_id column from summaries_train and summaries_test
summaries_train = summaries_train.drop(columns=['student_id'])
summaries_test = summaries_test.drop(columns=['student_id'])

id_mapping = {id_val: idx for idx, id_val in enumerate(prompts_train['prompt_id'].unique())}

summaries_train['prompt_id'] = summaries_train['prompt_id'].replace(id_mapping)
summaries_test['prompt_id'] = summaries_test['prompt_id'].replace(id_mapping)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the 'text' column
texts = summaries_train['text'].tolist()
tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=128)

In [7]:
summaries_train['summary_length'] = summaries_train['text'].apply(lambda x: len(x.split()))
print(summaries_train['summary_length'].head())

0     61
1     52
2    235
3     25
4    203
Name: summary_length, dtype: int64


In [8]:
summaries_train['unique_word_count'] = summaries_train['text'].apply(lambda x: len(set(x.split())))
print(summaries_train['unique_word_count'].head())

0     51
1     38
2    149
3     19
4    138
Name: unique_word_count, dtype: int64


In [12]:
summaries_train['avg_sentence_length'] = summaries_train['text'].apply(lambda x: sum(len(sentence.split()) for sentence in x.split('.')) / len(x.split('.')))
print(summaries_train['avg_sentence_length'].head())

0    15.250000
1    17.333333
2    18.000000
3     5.600000
4    14.500000
Name: avg_sentence_length, dtype: float64


In [9]:
summaries_train['avg_word_length'] = summaries_train['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))
print(summaries_train['avg_word_length'].head())

0    4.688525
1    3.711538
2    4.834043
3    5.320000
4    5.024631
Name: avg_word_length, dtype: float64


## Correlation Analysis

Strong positive correlation : Length of sentence.

Weak positive correlation : Length of words.

In [16]:
# Loop through each unique prompt_id and calculate/print the correlation matrix
for prompt_id in summaries_train['prompt_id'].unique():
    subset = summaries_train[summaries_train['prompt_id'] == prompt_id]
    correlation_matrix = subset.corr()
    
    print(f"Correlation Matrix for prompt_id: {prompt_id}")
    print(correlation_matrix)
    print("-" * 50)  # a separator line for clarity


Correlation Matrix for prompt_id: 2
                     prompt_id   content   wording  summary_length  \
prompt_id                  NaN       NaN       NaN             NaN   
content                    NaN  1.000000  0.813791        0.832240   
wording                    NaN  0.813791  1.000000        0.615885   
summary_length             NaN  0.832240  0.615885        1.000000   
unique_word_count          NaN  0.856020  0.648615        0.980010   
avg_word_length            NaN  0.241180  0.185428        0.145148   
avg_sentence_length        NaN  0.133283 -0.000374        0.268629   

                     unique_word_count  avg_word_length  avg_sentence_length  
prompt_id                          NaN              NaN                  NaN  
content                       0.856020         0.241180             0.133283  
wording                       0.648615         0.185428            -0.000374  
summary_length                0.980010         0.145148             0.268629  
unique_w

  correlation_matrix = subset.corr()
  correlation_matrix = subset.corr()
  correlation_matrix = subset.corr()
  correlation_matrix = subset.corr()


## Multi-taks candidates

Readability Scores

Grammatical Errors

Dependency Parsing:
Analyze sentence structures to see if certain patterns are more common in high-scoring responses.