In [2]:
from pyspark.sql import SparkSession

# Spark 세션 초기화
spark = SparkSession.builder.appName("Ironman Data Analysis_04").getOrCreate()

# CSV 다시 불러오기
file_path = "file:///home/lab12/src/data/ironman_wc_2022.csv"  # 정확한 경로 입력
df = spark.read.csv(file_path, header=True, inferSchema=True)

# 데이터 확인
df.show(truncate=False)
df.printSchema()

24/12/18 16:56:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

+---+--------------------+--------------+------+----+--------+------------+------------+---------+---------+---------+---------+--------+--------+-------------+
|bib|name                |country       |gender|div |div_rank|overall_time|overall_rank|swim_time|swim_rank|bike_time|bike_rank|run_time|run_rank|finish_status|
+---+--------------------+--------------+------+----+--------+------------+------------+---------+---------+---------+---------+--------+--------+-------------+
|8  |Gustav Iden         |Norway        |Male  |MPRO|1       |7:40:24     |1           |48:23:00 |10       |4:11:06  |6        |2:36:15 |1       |Finisher     |
|15 |Sam Laidlow         |France        |Male  |MPRO|2       |7:42:24     |2           |48:16:00 |2        |4:04:36  |1        |2:44:40 |5       |Finisher     |
|1  |Kristian Blummenfelt|Norway        |Male  |MPRO|3       |7:43:23     |3           |48:20:00 |5        |4:11:16  |8        |2:39:21 |2       |Finisher     |
|23 |Max Neumann         |Australi

In [4]:
from pyspark.sql.functions import when, col, split

# 1. 시간 데이터를 초 단위로 변환하는 함수 정의
def time_to_seconds(time_str_col):
    """
    문자열 형태의 시간 데이터를 초 단위로 변환
    """
    return (split(time_str_col, ":")[0].cast("int") * 3600 +
            split(time_str_col, ":")[1].cast("int") * 60 +
            split(time_str_col, ":")[2].cast("int"))

# 2. 초 단위 컬럼 생성
df = df.withColumn("swim_seconds", time_to_seconds(col("swim_time"))) \
       .withColumn("bike_seconds", time_to_seconds(col("bike_time"))) \
       .withColumn("run_seconds", time_to_seconds(col("run_time"))) \
       .withColumn("overall_seconds", time_to_seconds(col("overall_time")))

# 3. 컷오프 기준에 따른 DNF 컬럼 생성
df = df.withColumn(
    "DNF",
    when((col("swim_seconds") > 2 * 3600 + 20 * 60) |  # 수영 컷오프
         (col("swim_seconds") + col("bike_seconds") > 10 * 3600 + 30 * 60) |  # 수영 + 사이클 컷오프
         (col("overall_seconds") > 17 * 3600), 1).otherwise(0)  # 전체 컷오프
)

# 4. 결과 확인
df.select("swim_seconds", "bike_seconds", "run_seconds", "overall_seconds", "DNF").show()


+------------+------------+-----------+---------------+---+
|swim_seconds|bike_seconds|run_seconds|overall_seconds|DNF|
+------------+------------+-----------+---------------+---+
|      174180|       15066|       9375|          27624|  1|
|      173760|       14676|       9880|          27744|  1|
|      174000|       15076|       9561|          27803|  1|
|      174300|       15090|       9614|          27884|  1|
|      190500|       15071|       9926|          28445|  1|
|      190680|       14951|      10125|          28540|  1|
|      190440|       14945|      10168|          28552|  1|
|      179340|       15218|      10091|          28598|  1|
|      179400|       15314|       9960|          28618|  1|
|      178920|       15712|       9719|          28700|  1|
|      190260|       14944|      10467|          28851|  1|
|      173700|       15478|      10229|          28913|  1|
|      174180|       15210|      10563|          28978|  1|
|      174360|       15779|      10023| 

In [6]:
# 1. DQ 데이터 제거
df = df.filter(col("finish_status") != "DQ")

# 2. 완주 여부 컬럼 생성 (DNF=0, Finisher=1)
df = df.withColumn(
    "finish_status_encoded",
    when(col("finish_status") == "DNF", 0).otherwise(1)
)

# 3. 결과 확인
df.groupBy("finish_status_encoded").count().show()



+---------------------+-----+
|finish_status_encoded|count|
+---------------------+-----+
|                    1| 2376|
|                    0|   70|
+---------------------+-----+



In [10]:
from pyspark.sql.functions import regexp_extract, col

# 1. div 컬럼에서 나이대 추출
df = df.withColumn("age_group", regexp_extract(col("div"), r"(\d+)-", 1).cast("int"))

# 2. null 값 처리 (age_group이 없는 경우 기본값 0으로 대체)
df = df.fillna({"age_group": 0})

# 3. 결과 확인
df.select("div", "age_group").distinct().show(truncate=False)


+------+---------+
|div   |age_group|
+------+---------+
|M30-34|30       |
|M25-29|25       |
|M18-24|18       |
|M35-39|35       |
|M40-44|40       |
|M55-59|55       |
|M45-49|45       |
|MPRO  |0        |
|M50-54|50       |
+------+---------+



In [11]:
# MPRO 처리 없이 나머지 데이터 유지
df.select("div", "age_group").distinct().show()


+------+---------+
|   div|age_group|
+------+---------+
|M30-34|       30|
|M25-29|       25|
|M18-24|       18|
|M35-39|       35|
|M40-44|       40|
|M55-59|       55|
|M45-49|       45|
|  MPRO|        0|
|M50-54|       50|
+------+---------+



In [12]:
from pyspark.ml.feature import VectorAssembler

# Feature 벡터 생성
assembler = VectorAssembler(
    inputCols=["gender_encoded", "age_group", "swim_seconds", "bike_seconds", "run_seconds", "overall_seconds"],
    outputCol="features"
)

df_final = assembler.transform(df).select("features", "finish_status_encoded")

# 결과 확인
df_final.show(truncate=False)



+------------------------------------------+---------------------+
|features                                  |finish_status_encoded|
+------------------------------------------+---------------------+
|[1.0,0.0,174180.0,15066.0,9375.0,27624.0] |1                    |
|[1.0,0.0,173760.0,14676.0,9880.0,27744.0] |1                    |
|[1.0,0.0,174000.0,15076.0,9561.0,27803.0] |1                    |
|[1.0,0.0,174300.0,15090.0,9614.0,27884.0] |1                    |
|[1.0,0.0,190500.0,15071.0,9926.0,28445.0] |1                    |
|[1.0,0.0,190680.0,14951.0,10125.0,28540.0]|1                    |
|[1.0,0.0,190440.0,14945.0,10168.0,28552.0]|1                    |
|[1.0,0.0,179340.0,15218.0,10091.0,28598.0]|1                    |
|[1.0,0.0,179400.0,15314.0,9960.0,28618.0] |1                    |
|[1.0,0.0,178920.0,15712.0,9719.0,28700.0] |1                    |
|[1.0,0.0,190260.0,14944.0,10467.0,28851.0]|1                    |
|[1.0,0.0,173700.0,15478.0,10229.0,28913.0]|1                 

In [14]:
df.select(
    col("gender_encoded").isNull().alias("gender_encoded_null"),
    col("age_group").isNull().alias("age_group_null"),
    col("swim_seconds").isNull().alias("swim_seconds_null"),
    col("bike_seconds").isNull().alias("bike_seconds_null"),
    col("run_seconds").isNull().alias("run_seconds_null"),
    col("overall_seconds").isNull().alias("overall_seconds_null")
).groupBy(
    "gender_encoded_null", "age_group_null", "swim_seconds_null", 
    "bike_seconds_null", "run_seconds_null", "overall_seconds_null"
).count().show()


+-------------------+--------------+-----------------+-----------------+----------------+--------------------+-----+
|gender_encoded_null|age_group_null|swim_seconds_null|bike_seconds_null|run_seconds_null|overall_seconds_null|count|
+-------------------+--------------+-----------------+-----------------+----------------+--------------------+-----+
|              false|         false|            false|            false|           false|               false| 2376|
|              false|         false|             true|             true|            true|                true|    4|
|              false|         false|            false|             true|            true|                true|   38|
|              false|         false|            false|            false|            true|                true|   28|
+-------------------+--------------+-----------------+-----------------+----------------+--------------------+-----+



In [15]:
from pyspark.sql.functions import mean

# 1. 모든 시간이 null인 행 제거
df = df.filter(~(
    col("swim_seconds").isNull() & 
    col("bike_seconds").isNull() & 
    col("run_seconds").isNull() & 
    col("overall_seconds").isNull()
))

# 2. 나머지 null 값을 평균값으로 대체
avg_values = df.select(
    mean("swim_seconds").alias("avg_swim"),
    mean("bike_seconds").alias("avg_bike"),
    mean("run_seconds").alias("avg_run"),
    mean("overall_seconds").alias("avg_overall")
).first()

df = df.fillna({
    "swim_seconds": avg_values["avg_swim"],
    "bike_seconds": avg_values["avg_bike"],
    "run_seconds": avg_values["avg_run"],
    "overall_seconds": avg_values["avg_overall"]
})

# 처리 결과 확인
df.select(
    col("swim_seconds").isNull().alias("swim_seconds_null"),
    col("bike_seconds").isNull().alias("bike_seconds_null"),
    col("run_seconds").isNull().alias("run_seconds_null"),
    col("overall_seconds").isNull().alias("overall_seconds_null")
).groupBy(
    "swim_seconds_null", "bike_seconds_null", "run_seconds_null", "overall_seconds_null"
).count().show()


+-----------------+-----------------+----------------+--------------------+-----+
|swim_seconds_null|bike_seconds_null|run_seconds_null|overall_seconds_null|count|
+-----------------+-----------------+----------------+--------------------+-----+
|            false|            false|           false|               false| 2442|
+-----------------+-----------------+----------------+--------------------+-----+



In [17]:
from pyspark.ml.feature import VectorAssembler

# Feature 벡터 생성
assembler = VectorAssembler(
    inputCols=["gender_encoded", "age_group", "swim_seconds", "bike_seconds", "run_seconds", "overall_seconds"],
    outputCol="features"
)

df_final = assembler.transform(df).select("features", "finish_status_encoded")

# 결과 확인
df_final.show(truncate=False)


+------------------------------------------+---------------------+
|features                                  |finish_status_encoded|
+------------------------------------------+---------------------+
|[1.0,0.0,174180.0,15066.0,9375.0,27624.0] |1                    |
|[1.0,0.0,173760.0,14676.0,9880.0,27744.0] |1                    |
|[1.0,0.0,174000.0,15076.0,9561.0,27803.0] |1                    |
|[1.0,0.0,174300.0,15090.0,9614.0,27884.0] |1                    |
|[1.0,0.0,190500.0,15071.0,9926.0,28445.0] |1                    |
|[1.0,0.0,190680.0,14951.0,10125.0,28540.0]|1                    |
|[1.0,0.0,190440.0,14945.0,10168.0,28552.0]|1                    |
|[1.0,0.0,179340.0,15218.0,10091.0,28598.0]|1                    |
|[1.0,0.0,179400.0,15314.0,9960.0,28618.0] |1                    |
|[1.0,0.0,178920.0,15712.0,9719.0,28700.0] |1                    |
|[1.0,0.0,190260.0,14944.0,10467.0,28851.0]|1                    |
|[1.0,0.0,173700.0,15478.0,10229.0,28913.0]|1                 

In [18]:
# 훈련 및 테스트 데이터 분리
train_data, test_data = df_final.randomSplit([0.8, 0.2], seed=42)

# 데이터 크기 확인
print("훈련 데이터 크기:", train_data.count())
print("테스트 데이터 크기:", test_data.count())


훈련 데이터 크기: 2004
테스트 데이터 크기: 438


In [19]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Logistic Regression 모델 생성 및 학습
lr = LogisticRegression(featuresCol="features", labelCol="finish_status_encoded")
lr_model = lr.fit(train_data)

# 테스트 데이터로 예측
predictions = lr_model.transform(test_data)

# ROC-AUC 평가
evaluator = BinaryClassificationEvaluator(labelCol="finish_status_encoded", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)

# 결과 출력
print(f"ROC-AUC: {roc_auc:.2f}")
predictions.select("features", "finish_status_encoded", "prediction", "probability").show(truncate=False)


24/12/18 17:09:18 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/12/18 17:09:18 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


ROC-AUC: 0.48
+-------------------------------------------+---------------------+----------+-----------------------------------------+
|features                                   |finish_status_encoded|prediction|probability                              |
+-------------------------------------------+---------------------+----------+-----------------------------------------+
|[1.0,0.0,173880.0,15294.0,11157.0,29621.0] |1                    |1.0       |[0.03598355283431645,0.9640164471656836] |
|[1.0,0.0,174000.0,15806.0,10809.0,29796.0] |1                    |1.0       |[0.03902208296993068,0.9609779170300693] |
|[1.0,0.0,174180.0,15066.0,9375.0,27624.0]  |1                    |1.0       |[0.03552577338897682,0.9644742266110232] |
|[1.0,0.0,174300.0,19537.0,15172.0,39636.0] |0                    |1.0       |[0.04032886603580138,0.9596711339641986] |
|[1.0,0.0,178440.0,15955.0,15172.0,39636.0] |0                    |1.0       |[0.00850189787448286,0.9914981021255171] |
|[1.0,0.0,178920.0

In [20]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# 1. 임의 데이터 생성 완주가 불가능할 데이터도 포함해서 넣어보기로함
# 각 데이터는 [gender_encoded, age_group, swim_seconds, bike_seconds, run_seconds, overall_seconds]
test_data_manual = [
    Row(features=Vectors.dense([1.0, 30.0, 8000.0, 15000.0, 10000.0, 33000.0])),  # 완주 가능
    Row(features=Vectors.dense([0.0, 25.0, 8500.0, 15500.0, 11000.0, 35000.0])),  # 완주 가능
    Row(features=Vectors.dense([1.0, 40.0, 9000.0, 16000.0, 11500.0, 36500.0])),  # 완주 가능
    Row(features=Vectors.dense([0.0, 35.0, 9000.0, 18000.0, 14000.0, 41000.0])),  # 완주 불가능 (전체 초과)
    Row(features=Vectors.dense([1.0, 50.0, 12000.0, 20000.0, 15000.0, 47000.0])),  # 완주 불가능 (전체 초과)
    Row(features=Vectors.dense([1.0, 20.0, 9000.0, 16000.0, 11500.0, 36500.0])),  # 완주 가능
    Row(features=Vectors.dense([0.0, 28.0, 7200.0, 14000.0, 9500.0, 30700.0])),   # 완주 가능
    Row(features=Vectors.dense([1.0, 32.0, 8500.0, 18000.0, 14500.0, 41000.0])),  # 완주 불가능 (전체 초과)
    Row(features=Vectors.dense([0.0, 45.0, 9000.0, 20000.0, 15000.0, 47000.0])),  # 완주 불가능 (전체 초과)
    Row(features=Vectors.dense([1.0, 18.0, 7500.0, 13000.0, 9000.0, 29500.0])),   # 완주 가능
]

# 2. 임의 데이터를 DataFrame으로 변환
test_df_manual = spark.createDataFrame(test_data_manual)

# 3. 모델로 예측
predictions_manual = lr_model.transform(test_df_manual)

# 4. 결과 출력
predictions_manual.select("features", "prediction", "probability").show(truncate=False)


+------------------------------------------+----------+------------------------------------------+
|features                                  |prediction|probability                               |
+------------------------------------------+----------+------------------------------------------+
|[1.0,30.0,8000.0,15000.0,10000.0,33000.0] |1.0       |[0.005010171809519066,0.994989828190481]  |
|[0.0,25.0,8500.0,15500.0,11000.0,35000.0] |1.0       |[0.004845942754198488,0.9951540572458015] |
|[1.0,40.0,9000.0,16000.0,11500.0,36500.0] |1.0       |[0.003975598217536807,0.9960244017824632] |
|[0.0,35.0,9000.0,18000.0,14000.0,41000.0] |1.0       |[0.00550199619540738,0.9944980038045926]  |
|[1.0,50.0,12000.0,20000.0,15000.0,47000.0]|1.0       |[0.002806643511658182,0.9971933564883418] |
|[1.0,20.0,9000.0,16000.0,11500.0,36500.0] |1.0       |[0.00472168102266512,0.9952783189773349]  |
|[0.0,28.0,7200.0,14000.0,9500.0,30700.0]  |1.0       |[0.005488540517465433,0.9945114594825346] |
|[1.0,32.0

In [None]:
# 말도안되는 기록으로 넣어도 완주를 예측하는 모델이 나옴... 처음부터 다시 꽦...

In [21]:
spark.stop()