In [1]:
from pyspark.sql import SparkSession

# Spark 세션 초기화
spark = SparkSession.builder.appName("Ironman Data Analysis_03").getOrCreate()

# CSV 다시 불러오기
file_path = "file:///home/lab12/src/data/ironman_wc_2022.csv"  # 정확한 경로 입력
df = spark.read.csv(file_path, header=True, inferSchema=True)

# 데이터 확인
df.show(truncate=False)
df.printSchema()

24/12/18 15:52:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

+---+--------------------+--------------+------+----+--------+------------+------------+---------+---------+---------+---------+--------+--------+-------------+
|bib|name                |country       |gender|div |div_rank|overall_time|overall_rank|swim_time|swim_rank|bike_time|bike_rank|run_time|run_rank|finish_status|
+---+--------------------+--------------+------+----+--------+------------+------------+---------+---------+---------+---------+--------+--------+-------------+
|8  |Gustav Iden         |Norway        |Male  |MPRO|1       |7:40:24     |1           |48:23:00 |10       |4:11:06  |6        |2:36:15 |1       |Finisher     |
|15 |Sam Laidlow         |France        |Male  |MPRO|2       |7:42:24     |2           |48:16:00 |2        |4:04:36  |1        |2:44:40 |5       |Finisher     |
|1  |Kristian Blummenfelt|Norway        |Male  |MPRO|3       |7:43:23     |3           |48:20:00 |5        |4:11:16  |8        |2:39:21 |2       |Finisher     |
|23 |Max Neumann         |Australi

In [2]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# 1. 필요한 컬럼만 선택
selected_columns = ["gender", "div", "swim_time", "bike_time", "run_time", "overall_time", "finish_status"]
df_selected = df.select(selected_columns)

# 2. 시간 데이터를 초 단위로 변환하는 UDF 정의
def time_to_seconds(time_str):
    if time_str is None or time_str.strip() == "":
        return None  # 결측치는 그대로 처리
    try:
        h, m, s = map(int, time_str.split(':'))
        return h * 3600 + m * 60 + s
    except:
        return None

time_to_seconds_udf = udf(time_to_seconds, IntegerType())

# 3. 시간 데이터를 변환
df_transformed = df_selected.withColumn("swim_seconds", time_to_seconds_udf(df_selected["swim_time"])) \
                            .withColumn("bike_seconds", time_to_seconds_udf(df_selected["bike_time"])) \
                            .withColumn("run_seconds", time_to_seconds_udf(df_selected["run_time"])) \
                            .withColumn("overall_seconds", time_to_seconds_udf(df_selected["overall_time"]))

# 4. 결과 확인
df_transformed.show(truncate=False)


[Stage 3:>                                                          (0 + 1) / 1]

+------+----+---------+---------+--------+------------+-------------+------------+------------+-----------+---------------+
|gender|div |swim_time|bike_time|run_time|overall_time|finish_status|swim_seconds|bike_seconds|run_seconds|overall_seconds|
+------+----+---------+---------+--------+------------+-------------+------------+------------+-----------+---------------+
|Male  |MPRO|48:23:00 |4:11:06  |2:36:15 |7:40:24     |Finisher     |174180      |15066       |9375       |27624          |
|Male  |MPRO|48:16:00 |4:04:36  |2:44:40 |7:42:24     |Finisher     |173760      |14676       |9880       |27744          |
|Male  |MPRO|48:20:00 |4:11:16  |2:39:21 |7:43:23     |Finisher     |174000      |15076       |9561       |27803          |
|Male  |MPRO|48:25:00 |4:11:30  |2:40:14 |7:44:44     |Finisher     |174300      |15090       |9614       |27884          |
|Male  |MPRO|52:55:00 |4:11:11  |2:45:26 |7:54:05     |Finisher     |190500      |15071       |9926       |28445          |
|Male  |

                                                                                

In [3]:
from pyspark.sql.functions import when, col, regexp_extract

# 1. 성별 인코딩 (Male -> 1, Female -> 0)
df_prepared = df_transformed.withColumn(
    "gender_encoded", when(col("gender") == "Male", 1).otherwise(0)
)

# 2. 부문에서 나이대 추출
# 정규식으로 나이대 정보 추출 (예: M30-34 -> 30, M25-29 -> 25)
df_prepared = df_prepared.withColumn(
    "age_group",
    regexp_extract(col("div"), r"(\d+)-", 1).cast("int")
)

# 3. 결측치 확인
null_counts = df_prepared.select(
    [
        col(c).isNull().cast("int").alias(c)
        for c in ["swim_seconds", "bike_seconds", "run_seconds", "overall_seconds", "finish_status"]
    ]
).groupBy().sum()

# 결과 출력
df_prepared.show(truncate=False)
null_counts.show()


+------+----+---------+---------+--------+------------+-------------+------------+------------+-----------+---------------+--------------+---------+
|gender|div |swim_time|bike_time|run_time|overall_time|finish_status|swim_seconds|bike_seconds|run_seconds|overall_seconds|gender_encoded|age_group|
+------+----+---------+---------+--------+------------+-------------+------------+------------+-----------+---------------+--------------+---------+
|Male  |MPRO|48:23:00 |4:11:06  |2:36:15 |7:40:24     |Finisher     |174180      |15066       |9375       |27624          |1             |null     |
|Male  |MPRO|48:16:00 |4:04:36  |2:44:40 |7:42:24     |Finisher     |173760      |14676       |9880       |27744          |1             |null     |
|Male  |MPRO|48:20:00 |4:11:16  |2:39:21 |7:43:23     |Finisher     |174000      |15076       |9561       |27803          |1             |null     |
|Male  |MPRO|48:25:00 |4:11:30  |2:40:14 |7:44:44     |Finisher     |174300      |15090       |9614       

In [11]:
# 결측치 제거
df_cleaned = df_prepared.dropna(subset=["swim_seconds", "bike_seconds", "run_seconds", "overall_seconds", "finish_status"])


In [13]:
# 모든 입력 컬럼을 double로 변환
df_cleaned = df_cleaned.withColumn("gender_encoded", col("gender_encoded").cast("double")) \
                       .withColumn("age_group", col("age_group").cast("double")) \
                       .withColumn("swim_seconds", col("swim_seconds").cast("double")) \
                       .withColumn("bike_seconds", col("bike_seconds").cast("double")) \
                       .withColumn("run_seconds", col("run_seconds").cast("double")) \
                       .withColumn("overall_seconds", col("overall_seconds").cast("double"))


In [14]:
# null 값 확인
df_cleaned.select([col(c).isNull().alias(c) for c in features]).show()


+--------------+---------+------------+------------+-----------+---------------+
|gender_encoded|age_group|swim_seconds|bike_seconds|run_seconds|overall_seconds|
+--------------+---------+------------+------------+-----------+---------------+
|         false|     true|       false|       false|      false|          false|
|         false|     true|       false|       false|      false|          false|
|         false|     true|       false|       false|      false|          false|
|         false|     true|       false|       false|      false|          false|
|         false|     true|       false|       false|      false|          false|
|         false|     true|       false|       false|      false|          false|
|         false|     true|       false|       false|      false|          false|
|         false|     true|       false|       false|      false|          false|
|         false|     true|       false|       false|      false|          false|
|         false|     true|  

In [20]:
df_cleaned = df_cleaned.dropna()


In [23]:
from pyspark.sql.functions import when, col
from pyspark.ml.feature import VectorAssembler

# 1. 결측치 제거
df_cleaned = df_prepared.dropna(subset=["swim_seconds", "bike_seconds", "run_seconds", "overall_seconds", "finish_status"])

# 2. 데이터 타입 변환 및 `finish_status_encoded` 생성
df_cleaned = df_cleaned.withColumn("gender_encoded", col("gender_encoded").cast("double")) \
                       .withColumn("age_group", col("age_group").cast("double")) \
                       .withColumn("swim_seconds", col("swim_seconds").cast("double")) \
                       .withColumn("bike_seconds", col("bike_seconds").cast("double")) \
                       .withColumn("run_seconds", col("run_seconds").cast("double")) \
                       .withColumn("overall_seconds", col("overall_seconds").cast("double")) \
                       .withColumn("finish_status_encoded", when(col("finish_status") == "Finisher", 1).otherwise(0))

# 3. 데이터 불균형 확인
df_cleaned.groupBy("finish_status_encoded").count().show()





+---------------------+-----+
|finish_status_encoded|count|
+---------------------+-----+
|                    1| 2376|
+---------------------+-----+



In [17]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# 1. 모델 생성
lr = LogisticRegression(featuresCol="features", labelCol="finish_status_encoded")

# 2. 모델 학습
lr_model = lr.fit(train_data)

# 3. 테스트 데이터로 예측
predictions = lr_model.transform(test_data)

# 4. 평가
evaluator = BinaryClassificationEvaluator(labelCol="finish_status_encoded", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)

# 결과 출력
print(f"ROC-AUC: {roc_auc:.2f}")
predictions.select("features", "finish_status_encoded", "prediction", "probability").show()


24/12/18 16:38:15 WARN Instrumentation: [00857140] All labels are the same value and fitIntercept=true, so the coefficients will be zeros. Training is not needed.


ROC-AUC: 1.00
+--------------------+---------------------+----------+-----------+
|            features|finish_status_encoded|prediction|probability|
+--------------------+---------------------+----------+-----------+
|[1.0,18.0,3717.0,...|                    1|       1.0|  [0.0,1.0]|
|[1.0,18.0,3776.0,...|                    1|       1.0|  [0.0,1.0]|
|[1.0,18.0,3855.0,...|                    1|       1.0|  [0.0,1.0]|
|[1.0,18.0,3943.0,...|                    1|       1.0|  [0.0,1.0]|
|[1.0,18.0,4134.0,...|                    1|       1.0|  [0.0,1.0]|
|[1.0,18.0,4212.0,...|                    1|       1.0|  [0.0,1.0]|
|[1.0,18.0,4340.0,...|                    1|       1.0|  [0.0,1.0]|
|[1.0,18.0,4702.0,...|                    1|       1.0|  [0.0,1.0]|
|[1.0,18.0,193380....|                    1|       1.0|  [0.0,1.0]|
|[1.0,18.0,193620....|                    1|       1.0|  [0.0,1.0]|
|[1.0,18.0,193740....|                    1|       1.0|  [0.0,1.0]|
|[1.0,18.0,201420....|            

In [24]:
spark.stop()