In [1]:
import pandas as pd

from transformers import BertTokenizer
from transformers import BertModel

import torch
import torch.nn as nn
from torch.optim import Adam

from multiprocessing import Pool, cpu_count

In [2]:
# 데이터 로드
train_data = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
test_data = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')

# BERT 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 토크나이징 함수 정의
def tokenize_essay(essay):
    return tokenizer(essay, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

# 병렬 토크나이징 함수 정의
def parallel_tokenize(texts, tokenizer, num_workers=cpu_count()):
    with Pool(num_workers) as p:
        tokenized_texts = p.map(tokenize_essay, texts)
    return tokenized_texts

# 학습 데이터 토크나이징
train_essays = train_data['full_text'].tolist()
tokenized_train_essays = parallel_tokenize(train_essays, tokenizer)

# 테스트 데이터 토크나이징
test_essays = test_data['full_text'].tolist()
tokenized_test_essays = parallel_tokenize(test_essays, tokenizer)

# 병렬로 처리된 데이터를 다시 합치기
train_input_ids = torch.cat([x['input_ids'] for x in tokenized_train_essays])
train_attention_mask = torch.cat([x['attention_mask'] for x in tokenized_train_essays])

test_input_ids = torch.cat([x['input_ids'] for x in tokenized_test_essays])
test_attention_mask = torch.cat([x['attention_mask'] for x in tokenized_test_essays])

# 학습 데이터 레이블 변환
train_labels = torch.tensor(train_data['score'].values, dtype=torch.float32).unsqueeze(1)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [3]:
# 학습 데이터 토크나이징 (배치 처리)
train_encodings = tokenizer.batch_encode_plus(
    train_data['full_text'].tolist(),
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

# 테스트 데이터 토크나이징 (배치 처리)
test_encodings = tokenizer.batch_encode_plus(
    test_data['full_text'].tolist(),
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

# 학습 데이터 입력
train_input_ids = train_encodings['input_ids']
train_attention_mask = train_encodings['attention_mask']

# 테스트 데이터 입력
test_input_ids = test_encodings['input_ids']
test_attention_mask = test_encodings['attention_mask']

# 학습 데이터 레이블 변환
train_labels = torch.tensor(train_data['score'].values, dtype=torch.float32).unsqueeze(1)


In [4]:
class EssayScoringModel(nn.Module):
    def __init__(self):
        super(EssayScoringModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(768, 1)  

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return linear_output

In [5]:
model = EssayScoringModel()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

---

### Simple EDA using `PySpark` and `PySpark-SQL-Functions`

In [6]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l- \ | done
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=8a2634fb611bb0d79531a3556534322669011e52d59a45540f9cbba8b8d7be0b
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
[0mInstalling collected packages: pyspark
Successfully installed pyspark-3.5.1


In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import explode
from pyspark.sql.functions import lower, regexp_replace
from pyspark.sql.functions import col, count, when
from pyspark.sql.functions import length

import pandas as pd
import matplotlib.pyplot as plt

In [8]:
# Spark 세션 초기화
spark = SparkSession.builder \
    .appName("something_20240602") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/02 11:25:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
# 데이터 로드
train_df = spark.read.csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv", header=True, inferSchema=True)
test_df = spark.read.csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv", header=True, inferSchema=True)

# 데이터 확인
train_df.show(5)
test_df.show(5)


                                                                                

+---------+--------------------+--------------------+
| essay_id|           full_text|               score|
+---------+--------------------+--------------------+
|  000d118|"Many people have...|but there are onl...|
|  000fe60|"I am a scientist...|                NULL|
|First off| how could it be ...| which means so f...|
|     Next| why it is a land...| and there is als...|
|   Finaly| why you should l...| that the ""face"...|
+---------+--------------------+--------------------+
only showing top 5 rows

+---------+--------------------+
| essay_id|           full_text|
+---------+--------------------+
|  000d118|"Many people have...|
|  000fe60|"I am a scientist...|
|First off| how could it be ...|
|     Next| why it is a land...|
|   Finaly| why you should l...|
+---------+--------------------+
only showing top 5 rows



In [10]:
# 데이터 스키마 확인
train_df.printSchema()
test_df.printSchema()

# 데이터 통계 요약
train_df.describe().show()

root
 |-- essay_id: string (nullable = true)
 |-- full_text: string (nullable = true)
 |-- score: string (nullable = true)

root
 |-- essay_id: string (nullable = true)
 |-- full_text: string (nullable = true)





+-------+--------------------+--------------------+--------------------+
|summary|            essay_id|           full_text|               score|
+-------+--------------------+--------------------+--------------------+
|  count|               85860|               70869|               43099|
|   mean|            Infinity|       49.3587890625|  48.622015581804476|
| stddev|                 NaN|    301.071780828543|   298.6170863855692|
|    min|                 ...|                    |                   "|
|    max|“The facial expre...|” says Garvin. “T...|” states Dr. Huan...|
+-------+--------------------+--------------------+--------------------+



                                                                                

In [11]:
# 결측값 확인
train_df.select([count(when(col(c).isNull(), c)).alias(c) for c in train_df.columns]).show()



+--------+---------+-----+
|essay_id|full_text|score|
+--------+---------+-----+
|      78|    15069|42839|
+--------+---------+-----+



                                                                                

In [12]:
# 점수 분포 확인
train_df.groupBy('score').count().orderBy('score').show()



+--------------------+-----+
|               score|count|
+--------------------+-----+
|                NULL|42839|
|                   "|    1|
|  and also gains ...|    1|
|                  ""|    8|
| "" "" A classroo...|    1|
| "" ""A classroom...|    1|
| "" 'A classroom ...|    1|
| "" 'A classroom ...|    1|
| "" 'Besides help...|    1|
| "" 'Most human c...|    1|
| "" 'We have to i...|    1|
| "" (D'Alto 1). T...|    1|
| "" ...(NASA) has...|    1|
| "" 83 percent happy|    1|
| "" A classroom c...|    1|
| "" A classroom c...|    1|
| "" A classroom c...|    1|
| "" A dispute ove...|    1|
| "" A thick atmos...|    1|
| "" According to ...|    1|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [13]:
# 텍스트 길이 계산
train_df = train_df.withColumn('text_length', length(train_df['full_text']))

# 텍스트 길이 통계 요약
train_df.select('text_length').describe().show()

# 텍스트 길이 분포 확인
train_df.groupBy('text_length').count().orderBy('text_length').show()

                                                                                

+-------+------------------+
|summary|       text_length|
+-------+------------------+
|  count|             70869|
|   mean|172.18649903342788|
| stddev| 227.7261211683433|
|    min|                 1|
|    max|             15896|
+-------+------------------+

+-----------+-----+
|text_length|count|
+-----------+-----+
|       NULL|15069|
|          1| 5001|
|          2|   20|
|          3|   70|
|          4|  247|
|          5|  579|
|          6|  478|
|          7|  445|
|          8|  698|
|          9|  543|
|         10|  525|
|         11|  567|
|         12|  358|
|         13|  594|
|         14|  320|
|         15|  353|
|         16|  337|
|         17|  315|
|         18|  367|
|         19|  554|
+-----------+-----+
only showing top 20 rows



In [14]:
# 텍스트 전처리: 소문자 변환 및 특수문자 제거
train_df = train_df.withColumn('clean_text', lower(regexp_replace('full_text', '[^a-zA-Z0-9\s]', '')))
train_df.select('full_text', 'clean_text').show(5)


+--------------------+--------------------+
|           full_text|          clean_text|
+--------------------+--------------------+
|"Many people have...|many people have ...|
|"I am a scientist...|i am a scientist ...|
| how could it be ...| how could it be ...|
| why it is a land...| why it is a land...|
| why you should l...| why you should l...|
+--------------------+--------------------+
only showing top 5 rows

