In [1]:
from pyspark.sql import SparkSession

# Spark 세션 초기화
spark = SparkSession.builder.appName("Ironman Data Analysis_02").getOrCreate()

# CSV 다시 불러오기
file_path = "file:///home/lab12/src/data/2024_ironman.csv"  # 정확한 경로 입력
df = spark.read.csv(file_path, header=True, inferSchema=True)

# 데이터 확인
df.show(truncate=False)
df.printSchema()

24/12/18 15:01:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

+---+-----------------+-------------+------+--------+-------------+------------+------------+---------+---------+---------+---------+--------+--------+-------------+
|Bib|Name             |Country      |Gender|Division|Division Rank|Overall Time|Overall Rank|Swim Time|Swim Rank|Bike Time|Bike Rank|Run Time|Run Rank|Finish Status|
+---+-----------------+-------------+------+--------+-------------+------------+------------+---------+---------+---------+---------+--------+--------+-------------+
|284|Sergei Khazov    |Kazakhstan   |Male  |M40-44  |1            |8:43:25     |1           |56:20:00 |3        |4:32:10  |3        |3:09:03 |3       |Finisher     |
|6  |Sanghwan Oh      |Korea        |Male  |M50-54  |1            |8:52:37     |2           |1:15:00  |54       |4:20:06  |2        |3:11:19 |4       |Finisher     |
|11 |Brenteson John   |United States|Male  |M30-34  |1            |9:06:06     |3           |1:00:20  |6        |4:43:08  |6        |3:17:23 |6       |Finisher     |
|42 

In [49]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# 시간 문자열을 초 단위로 변환하는 함수
def time_to_seconds(time_str):
    if time_str:
        h, m, s = map(int, time_str.split(":"))
        return h * 3600 + m * 60 + s
    return None

# UDF 등록
time_udf = udf(time_to_seconds, IntegerType())

# 시간 컬럼 변환
time_cols = ["Swim Time", "Bike Time", "Run Time", "Overall Time"]
for col in time_cols:
    df = df.withColumn(col, time_udf(df[col]))

# 변환 결과 확인
df.select(time_cols).show(truncate=False)


In [50]:
from pyspark.ml.feature import StringIndexer

# 문자열 컬럼을 인덱스로 변환
indexers = [
    StringIndexer(inputCol="Gender", outputCol="Gender_Index"),
    StringIndexer(inputCol="Division", outputCol="Division_Index"),
    StringIndexer(inputCol="Finish Status", outputCol="Finish_Status_Index")
]

# StringIndexer를 적용하여 데이터 변환
for indexer in indexers:
    df = indexer.fit(df).transform(df)

# 변환된 데이터 확인
df.select("Gender", "Gender_Index", "Division", "Division_Index", "Finish Status", "Finish_Status_Index").show()


IllegalArgumentException: requirement failed: Output column Gender_Index already exists.

In [4]:
from pyspark.ml.feature import VectorAssembler

# feature 컬럼 정의
feature_cols = ["Swim Time", "Bike Time", "Run Time", "Gender_Index", "Division_Index"]

# VectorAssembler로 feature 벡터 생성
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

# feature와 label 확인
df.select("features", "Finish_Status_Index").show(truncate=False)


+----------------------------------+-------------------+
|features                          |Finish_Status_Index|
+----------------------------------+-------------------+
|[202800.0,16330.0,11343.0,0.0,1.0]|0.0                |
|[4500.0,15606.0,11479.0,0.0,2.0]  |0.0                |
|[3620.0,16988.0,11843.0,0.0,6.0]  |0.0                |
|[3953.0,17581.0,11888.0,0.0,3.0]  |0.0                |
|[4100.0,16978.0,12360.0,0.0,12.0] |0.0                |
|[3914.0,17221.0,12350.0,0.0,3.0]  |0.0                |
|[202980.0,18517.0,11999.0,1.0,9.0]|0.0                |
|[4035.0,17497.0,12528.0,0.0,3.0]  |0.0                |
|[4685.0,16890.0,12786.0,0.0,1.0]  |0.0                |
|[4216.0,17272.0,12884.0,0.0,1.0]  |0.0                |
|[4568.0,18611.0,11773.0,0.0,3.0]  |0.0                |
|[4226.0,18403.0,12561.0,0.0,1.0]  |0.0                |
|[4540.0,17638.0,13072.0,0.0,3.0]  |0.0                |
|[4879.0,18235.0,12675.0,0.0,3.0]  |0.0                |
|[4250.0,18753.0,12972.0,1.0,15

In [6]:
# null 값 처리: 평균값으로 대체하거나 제거
from pyspark.sql.functions import col, mean

# 각 feature 컬럼의 평균값으로 null 대체
for col_name in ["Swim Time", "Bike Time", "Run Time"]:
    mean_value = df.select(mean(col(col_name))).collect()[0][0]
    df = df.fillna({col_name: mean_value})

# 다른 컬럼들도 null 값 확인 후 대체
df = df.na.fill({"Gender_Index": 0.0, "Division_Index": 0.0})


In [13]:
assembler = VectorAssembler(
    inputCols=["Swim Time", "Bike Time", "Run Time", "Gender_Index", "Division_Index"],
    outputCol="new_features",  # 새로운 이름 지정
    handleInvalid="skip"
)

df = assembler.transform(df)


In [15]:
df.select("features").show(truncate=False)

+----------------------------------+
|features                          |
+----------------------------------+
|[202800.0,16330.0,11343.0,0.0,1.0]|
|[4500.0,15606.0,11479.0,0.0,2.0]  |
|[3620.0,16988.0,11843.0,0.0,6.0]  |
|[3953.0,17581.0,11888.0,0.0,3.0]  |
|[4100.0,16978.0,12360.0,0.0,12.0] |
|[3914.0,17221.0,12350.0,0.0,3.0]  |
|[202980.0,18517.0,11999.0,1.0,9.0]|
|[4035.0,17497.0,12528.0,0.0,3.0]  |
|[4685.0,16890.0,12786.0,0.0,1.0]  |
|[4216.0,17272.0,12884.0,0.0,1.0]  |
|[4568.0,18611.0,11773.0,0.0,3.0]  |
|[4226.0,18403.0,12561.0,0.0,1.0]  |
|[4540.0,17638.0,13072.0,0.0,3.0]  |
|[4879.0,18235.0,12675.0,0.0,3.0]  |
|[4250.0,18753.0,12972.0,1.0,15.0] |
|[4211.0,18860.0,12955.0,0.0,3.0]  |
|[4011.0,17565.0,14673.0,0.0,0.0]  |
|[191700.0,18746.0,14163.0,0.0,3.0]|
|[4861.0,19610.0,12134.0,0.0,2.0]  |
|[4450.0,19067.0,13515.0,0.0,0.0]  |
+----------------------------------+
only showing top 20 rows



In [16]:
# 데이터셋 분리 (80% 학습, 20% 테스트)
train, test = df.randomSplit([0.8, 0.2], seed=42)

# 데이터 크기 확인
print(f"Train dataset size: {train.count()}")
print(f"Test dataset size: {test.count()}")


Train dataset size: 600
Test dataset size: 115


In [21]:
from pyspark.ml.classification import RandomForestClassifier

# 1. RandomForestClassifier 모델 설정
rf_classifier = RandomForestClassifier(
    labelCol="Finish_Status_Index",  # 타겟 값 (완주 여부)
    featuresCol="features",          # 학습에 사용될 피처
    numTrees=100,                    # 트리 개수
    maxDepth=5,                      # 트리의 최대 깊이
    seed=42                          # 재현성을 위한 랜덤 시드
)

# 2. 모델 학습
model = rf_classifier.fit(train)

# 3. 테스트 데이터로 예측
predictions = model.transform(test)

# 4. 예측 결과 확인
predictions.select("features", "Finish_Status_Index", "prediction", "probability").show(truncate=False)


+----------------------------------+-------------------+----------+-----------------------+
|features                          |Finish_Status_Index|prediction|probability            |
+----------------------------------+-------------------+----------+-----------------------+
|[4250.0,18753.0,12972.0,1.0,15.0] |0.0                |0.0       |[1.0,0.0,0.0,0.0,0.0]  |
|[194640.0,20049.0,16236.0,0.0,4.0]|0.0                |0.0       |[0.98,0.02,0.0,0.0,0.0]|
|[5879.0,18850.0,17612.0,0.0,14.0] |0.0                |0.0       |[1.0,0.0,0.0,0.0,0.0]  |
|[4433.0,20703.0,15461.0,0.0,3.0]  |0.0                |0.0       |[1.0,0.0,0.0,0.0,0.0]  |
|[4226.0,18403.0,12561.0,0.0,1.0]  |0.0                |0.0       |[1.0,0.0,0.0,0.0,0.0]  |
|[7276.0,22895.0,18066.0,0.0,0.0]  |1.0                |1.0       |[0.0,1.0,0.0,0.0,0.0]  |
|[4872.0,19360.0,13485.0,0.0,4.0]  |0.0                |0.0       |[1.0,0.0,0.0,0.0,0.0]  |
|[4035.0,17497.0,12528.0,0.0,3.0]  |0.0                |0.0       |[1.0,0.0,0.0,

In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 다중 클래스 분류 평가기 사용
evaluator = MulticlassClassificationEvaluator(labelCol="Finish_Status_Index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")


Accuracy: 1.0


In [44]:
new_data = [(5000.0, 18000.0, 13000.0, 0.0, 1.0)]
columns = ["Swim Time", "Bike Time", "Run Time", "Gender_Index", "Division_Index"]
new_df = spark.createDataFrame(new_data, columns)


In [45]:
new_features = assembler.transform(new_df)


In [46]:
prediction = model.transform(new_features)
prediction.select("features", "prediction", "probability").show(truncate=False)


+--------------------------------+----------+---------------------+
|features                        |prediction|probability          |
+--------------------------------+----------+---------------------+
|[5000.0,18000.0,13000.0,0.0,1.0]|0.0       |[1.0,0.0,0.0,0.0,0.0]|
+--------------------------------+----------+---------------------+



In [48]:
from pyspark.ml.feature import VectorAssembler

# VectorAssembler를 사용해 features 열 추가
assembler = VectorAssembler(
    inputCols=["Swim Time", "Bike Time", "Run Time", "Gender_Index", "Division_Index"],
    outputCol="features"
)

# features 열 생성
test_df_with_features = assembler.transform(test_df)

# 모델 예측
test_predictions = model.transform(test_df_with_features)

# 결과 확인
test_predictions.select("features", "prediction", "probability").show(truncate=False)


+---------------------------------+----------+---------------------+
|features                         |prediction|probability          |
+---------------------------------+----------+---------------------+
|[4000.0,18000.0,13000.0,0.0,5.0] |0.0       |[1.0,0.0,0.0,0.0,0.0]|
|[3500.0,20000.0,15000.0,1.0,8.0] |0.0       |[1.0,0.0,0.0,0.0,0.0]|
|[5000.0,22000.0,14000.0,0.0,3.0] |0.0       |[1.0,0.0,0.0,0.0,0.0]|
|[4800.0,21000.0,16000.0,1.0,10.0]|0.0       |[1.0,0.0,0.0,0.0,0.0]|
|[3900.0,19000.0,12000.0,0.0,7.0] |0.0       |[1.0,0.0,0.0,0.0,0.0]|
|[4100.0,25000.0,17000.0,1.0,6.0] |0.0       |[1.0,0.0,0.0,0.0,0.0]|
|[4300.0,23000.0,15000.0,0.0,4.0] |0.0       |[1.0,0.0,0.0,0.0,0.0]|
|[4700.0,24000.0,16000.0,1.0,2.0] |0.0       |[1.0,0.0,0.0,0.0,0.0]|
|[4200.0,20000.0,14000.0,0.0,9.0] |0.0       |[1.0,0.0,0.0,0.0,0.0]|
|[3900.0,22000.0,15000.0,1.0,1.0] |0.0       |[1.0,0.0,0.0,0.0,0.0]|
+---------------------------------+----------+---------------------+



In [51]:
spark.stop()