In [1]:
from pyspark.sql import SparkSession

# Spark 세션 초기화
spark = SparkSession.builder.appName("Ironman Data Analysis_241219_02").getOrCreate()

# CSV 다시 불러오기
file_path = "file:///home/lab12/src/data/ironman_wc_2022.csv"  # 정확한 경로 입력
df = spark.read.csv(file_path, header=True, inferSchema=True)

# 데이터 확인
df.show(truncate=False)
df.printSchema()

24/12/19 14:31:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/19 14:31:16 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

+---+--------------------+--------------+------+----+--------+------------+------------+---------+---------+---------+---------+--------+--------+-------------+
|bib|name                |country       |gender|div |div_rank|overall_time|overall_rank|swim_time|swim_rank|bike_time|bike_rank|run_time|run_rank|finish_status|
+---+--------------------+--------------+------+----+--------+------------+------------+---------+---------+---------+---------+--------+--------+-------------+
|8  |Gustav Iden         |Norway        |Male  |MPRO|1       |7:40:24     |1           |48:23:00 |10       |4:11:06  |6        |2:36:15 |1       |Finisher     |
|15 |Sam Laidlow         |France        |Male  |MPRO|2       |7:42:24     |2           |48:16:00 |2        |4:04:36  |1        |2:44:40 |5       |Finisher     |
|1  |Kristian Blummenfelt|Norway        |Male  |MPRO|3       |7:43:23     |3           |48:20:00 |5        |4:11:16  |8        |2:39:21 |2       |Finisher     |
|23 |Max Neumann         |Australi

In [2]:
from pyspark.sql.functions import when, col, split

# 1. 시간 데이터를 초 단위로 변환하는 함수 정의
def time_to_seconds(time_str_col):
    """
    문자열 형태의 시간 데이터를 초 단위로 변환
    """
    return (split(time_str_col, ":")[0].cast("int") * 3600 +
            split(time_str_col, ":")[1].cast("int") * 60 +
            split(time_str_col, ":")[2].cast("int"))

# 2. 초 단위 컬럼 생성
df = df.withColumn("swim_seconds", time_to_seconds(col("swim_time"))) \
       .withColumn("bike_seconds", time_to_seconds(col("bike_time"))) \
       .withColumn("run_seconds", time_to_seconds(col("run_time"))) \
       .withColumn("overall_seconds", time_to_seconds(col("overall_time")))

# 3. 컷오프 기준에 따른 DNF 컬럼 생성
df = df.withColumn(
    "DNF",
    when((col("swim_seconds") > 2 * 3600 + 20 * 60) |  # 수영 컷오프
         (col("swim_seconds") + col("bike_seconds") > 10 * 3600 + 30 * 60) |  # 수영 + 사이클 컷오프
         (col("overall_seconds") > 17 * 3600), 1).otherwise(0)  # 전체 컷오프
)

# 4. 결과 확인
df.select("swim_seconds", "bike_seconds", "run_seconds", "overall_seconds", "DNF").show()

+------------+------------+-----------+---------------+---+
|swim_seconds|bike_seconds|run_seconds|overall_seconds|DNF|
+------------+------------+-----------+---------------+---+
|      174180|       15066|       9375|          27624|  1|
|      173760|       14676|       9880|          27744|  1|
|      174000|       15076|       9561|          27803|  1|
|      174300|       15090|       9614|          27884|  1|
|      190500|       15071|       9926|          28445|  1|
|      190680|       14951|      10125|          28540|  1|
|      190440|       14945|      10168|          28552|  1|
|      179340|       15218|      10091|          28598|  1|
|      179400|       15314|       9960|          28618|  1|
|      178920|       15712|       9719|          28700|  1|
|      190260|       14944|      10467|          28851|  1|
|      173700|       15478|      10229|          28913|  1|
|      174180|       15210|      10563|          28978|  1|
|      174360|       15779|      10023| 

In [3]:
df = df.filter((col("finish_status") != "DNS") & (col("finish_status") != "DQ"))

In [4]:
df = df.dropna(subset=["swim_seconds", "bike_seconds", "run_seconds", "overall_seconds"])

In [5]:
df = df.withColumn("swim_seconds", col("swim_seconds").cast("double")) \
       .withColumn("bike_seconds", col("bike_seconds").cast("double")) \
       .withColumn("run_seconds", col("run_seconds").cast("double")) \
       .withColumn("overall_seconds", col("overall_seconds").cast("double"))

In [6]:
df.groupBy("DNF").count().show()

                                                                                

+---+-----+
|DNF|count|
+---+-----+
|  1|  345|
|  0| 2031|
+---+-----+





In [21]:
df_finishers = df.filter(col("DNF") == 0)
df_finishers.select("DNF").groupBy("DNF").count().show()

+---+-----+
|DNF|count|
+---+-----+
|  0| 2031|
+---+-----+



In [22]:
from pyspark.ml.feature import StringIndexer

# gender 컬럼 인코딩
indexer = StringIndexer(inputCol="gender", outputCol="gender_encoded")
df_finishers = indexer.fit(df_finishers).transform(df_finishers)

# 결과 확인
df_finishers.select("gender", "gender_encoded").show(5)

+------+--------------+
|gender|gender_encoded|
+------+--------------+
|  Male|           0.0|
|  Male|           0.0|
|  Male|           0.0|
|  Male|           0.0|
|  Male|           0.0|
+------+--------------+
only showing top 5 rows



In [23]:
from pyspark.sql.functions import when, col

# div 컬럼을 기반으로 age_group 생성
df_finishers = df_finishers.withColumn(
    "age_group",
    when(col("div").startswith("M18-24"), 18)
    .when(col("div").startswith("M25-29"), 25)
    .when(col("div").startswith("M30-34"), 30)
    .when(col("div").startswith("M35-39"), 35)
    .when(col("div").startswith("M40-44"), 40)
    .when(col("div").startswith("M45-49"), 45)
    .when(col("div").startswith("M50-54"), 50)
    .when(col("div").startswith("M55-59"), 55)
    .when(col("div").startswith("MPRO"), 0)  # 프로 선수
    .otherwise(None)  # 나머지 경우 처리
)

# 결과 확인
df_finishers.select("div", "age_group").distinct().show()

+------+---------+
|   div|age_group|
+------+---------+
|M30-34|       30|
|M25-29|       25|
|M18-24|       18|
|M35-39|       35|
|M40-44|       40|
|M55-59|       55|
|M45-49|       45|
|M50-54|       50|
+------+---------+



In [24]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import when, col

# 1. 순위 그룹 라벨링
total_participants = df_finishers.count()
top_10 = total_participants * 0.1
top_25 = total_participants * 0.25
top_50 = total_participants * 0.5

df_finishers = df_finishers.withColumn(
    "rank_range",
    when(col("overall_rank") <= top_10, "Top 10%")
    .when((col("overall_rank") > top_10) & (col("overall_rank") <= top_25), "Top 25%")
    .when((col("overall_rank") > top_25) & (col("overall_rank") <= top_50), "Top 50%")
    .otherwise("Bottom 50%")
)

# 2. 피처 벡터 생성
assembler = VectorAssembler(
    inputCols=["gender_encoded", "age_group", "swim_seconds", "bike_seconds", "run_seconds"],
    outputCol="features"
)

df_final = assembler.transform(df_finishers).select("features", "rank_range")

# 결과 확인
df_final.show(5, truncate=False)

+---------------------------------+----------+
|features                         |rank_range|
+---------------------------------+----------+
|[0.0,30.0,3655.0,16953.0,11146.0]|Top 10%   |
|[0.0,30.0,3983.0,17318.0,10560.0]|Top 10%   |
|[0.0,30.0,3755.0,17022.0,11094.0]|Top 10%   |
|[0.0,35.0,4042.0,16196.0,11461.0]|Top 10%   |
|[0.0,30.0,3955.0,17532.0,10395.0]|Top 10%   |
+---------------------------------+----------+
only showing top 5 rows



In [25]:
# rank_range를 숫자형 rank_range_index로 변환
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="rank_range", outputCol="rank_range_index")
df_final = indexer.fit(df_final).transform(df_final)

# 결과 확인
df_final.select("rank_range", "rank_range_index").distinct().show()

+----------+----------------+
|rank_range|rank_range_index|
+----------+----------------+
|   Top 10%|             3.0|
|   Top 25%|             2.0|
|Bottom 50%|             0.0|
|   Top 50%|             1.0|
+----------+----------------+



In [26]:
# 데이터 분할
train_data, test_data = df_final.randomSplit([0.8, 0.2], seed=42)

# 데이터 크기 확인
print("훈련 데이터 크기:", train_data.count())
print("테스트 데이터 크기:", test_data.count())

# 데이터 샘플 확인
train_data.show(5, truncate=False)
test_data.show(5, truncate=False)

훈련 데이터 크기: 1671
테스트 데이터 크기: 360
+---------------------------------+----------+----------------+
|features                         |rank_range|rank_range_index|
+---------------------------------+----------+----------------+
|[0.0,18.0,3653.0,20368.0,12874.0]|Top 50%   |1.0             |
|[0.0,18.0,3693.0,19738.0,17137.0]|Bottom 50%|0.0             |
|[0.0,18.0,3722.0,19532.0,17446.0]|Bottom 50%|0.0             |
|[0.0,18.0,3729.0,18523.0,14229.0]|Top 50%   |1.0             |
|[0.0,18.0,3732.0,18024.0,16481.0]|Bottom 50%|0.0             |
+---------------------------------+----------+----------------+
only showing top 5 rows

+---------------------------------+----------+----------------+
|features                         |rank_range|rank_range_index|
+---------------------------------+----------+----------------+
|[0.0,18.0,3717.0,19283.0,13253.0]|Top 50%   |1.0             |
|[0.0,18.0,3776.0,18633.0,12743.0]|Top 50%   |1.0             |
|[0.0,18.0,3855.0,19772.0,14458.0]|Bottom 50%|0

In [30]:
# df_final 스키마 확인
df_final.printSchema()


root
 |-- features: vector (nullable = true)
 |-- rank_range: string (nullable = false)
 |-- rank_range_index: double (nullable = false)



In [29]:
from pyspark.sql.functions import avg

# 학습 데이터에서 그룹별 평균값 계산
avg_stats = train_data.groupBy("rank_range_index").agg(
    avg("swim_seconds").alias("avg_swim"),
    avg("bike_seconds").alias("avg_bike"),
    avg("run_seconds").alias("avg_run")
).toPandas()

# 결과 확인
print(avg_stats)


AnalysisException: cannot resolve '`swim_seconds`' given input columns: [features, rank_range, rank_range_index];
'Aggregate [rank_range_index#1590], [rank_range_index#1590, avg('swim_seconds) AS avg_swim#1772, avg('bike_seconds) AS avg_bike#1774, avg('run_seconds) AS avg_run#1776]
+- Sample 0.0, 0.8, false, 42
   +- Sort [features#1512 ASC NULLS FIRST, rank_range#1486 ASC NULLS FIRST, rank_range_index#1590 ASC NULLS FIRST], false
      +- Project [features#1512, rank_range#1486, UDF(cast(rank_range#1486 as string)) AS rank_range_index#1590]
         +- Project [features#1512, rank_range#1486]
            +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, swim_seconds#267, bike_seconds#288, run_seconds#309, overall_seconds#330, DNF#196, gender_encoded#1372, age_group#1414, rank_range#1486, UDF(struct(gender_encoded, gender_encoded#1372, age_group_double_VectorAssembler_69cab572569e, cast(age_group#1414 as double), swim_seconds, swim_seconds#267, bike_seconds, bike_seconds#288, run_seconds, run_seconds#309)) AS features#1512]
               +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, swim_seconds#267, bike_seconds#288, run_seconds#309, overall_seconds#330, DNF#196, gender_encoded#1372, age_group#1414, CASE WHEN (cast(overall_rank#23 as double) <= 203.10000000000002) THEN Top 10% WHEN ((cast(overall_rank#23 as double) > 203.10000000000002) AND (cast(overall_rank#23 as double) <= 507.75)) THEN Top 25% WHEN ((cast(overall_rank#23 as double) > 507.75) AND (cast(overall_rank#23 as double) <= 1015.5)) THEN Top 50% ELSE Bottom 50% END AS rank_range#1486]
                  +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, swim_seconds#267, bike_seconds#288, run_seconds#309, overall_seconds#330, DNF#196, gender_encoded#1372, CASE WHEN StartsWith(div#20, M18-24) THEN 18 WHEN StartsWith(div#20, M25-29) THEN 25 WHEN StartsWith(div#20, M30-34) THEN 30 WHEN StartsWith(div#20, M35-39) THEN 35 WHEN StartsWith(div#20, M40-44) THEN 40 WHEN StartsWith(div#20, M45-49) THEN 45 WHEN StartsWith(div#20, M50-54) THEN 50 WHEN StartsWith(div#20, M55-59) THEN 55 WHEN StartsWith(div#20, MPRO) THEN 0 ELSE cast(null as int) END AS age_group#1414]
                     +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, swim_seconds#267, bike_seconds#288, run_seconds#309, overall_seconds#330, DNF#196, UDF(cast(gender#19 as string)) AS gender_encoded#1372]
                        +- Filter (DNF#196 = 0)
                           +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, swim_seconds#267, bike_seconds#288, run_seconds#309, cast(overall_seconds#176 as double) AS overall_seconds#330, DNF#196]
                              +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, swim_seconds#267, bike_seconds#288, cast(run_seconds#157 as double) AS run_seconds#309, overall_seconds#176, DNF#196]
                                 +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, swim_seconds#267, cast(bike_seconds#139 as double) AS bike_seconds#288, run_seconds#157, overall_seconds#176, DNF#196]
                                    +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, cast(swim_seconds#122 as double) AS swim_seconds#267, bike_seconds#139, run_seconds#157, overall_seconds#176, DNF#196]
                                       +- Filter AtLeastNNulls(n, swim_seconds#122,bike_seconds#139,run_seconds#157,overall_seconds#176)
                                          +- Filter (NOT (finish_status#30 = DNS) AND NOT (finish_status#30 = DQ))
                                             +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, swim_seconds#122, bike_seconds#139, run_seconds#157, overall_seconds#176, CASE WHEN (((swim_seconds#122 > 8400) OR ((swim_seconds#122 + bike_seconds#139) > 37800)) OR (overall_seconds#176 > 61200)) THEN 1 ELSE 0 END AS DNF#196]
                                                +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, swim_seconds#122, bike_seconds#139, run_seconds#157, (((cast(split(overall_time#22, :, -1)[0] as int) * 3600) + (cast(split(overall_time#22, :, -1)[1] as int) * 60)) + cast(split(overall_time#22, :, -1)[2] as int)) AS overall_seconds#176]
                                                   +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, swim_seconds#122, bike_seconds#139, (((cast(split(run_time#28, :, -1)[0] as int) * 3600) + (cast(split(run_time#28, :, -1)[1] as int) * 60)) + cast(split(run_time#28, :, -1)[2] as int)) AS run_seconds#157]
                                                      +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, swim_seconds#122, (((cast(split(bike_time#26, :, -1)[0] as int) * 3600) + (cast(split(bike_time#26, :, -1)[1] as int) * 60)) + cast(split(bike_time#26, :, -1)[2] as int)) AS bike_seconds#139]
                                                         +- Project [bib#16, name#17, country#18, gender#19, div#20, div_rank#21, overall_time#22, overall_rank#23, swim_time#24, swim_rank#25, bike_time#26, bike_rank#27, run_time#28, run_rank#29, finish_status#30, (((cast(split(swim_time#24, :, -1)[0] as int) * 3600) + (cast(split(swim_time#24, :, -1)[1] as int) * 60)) + cast(split(swim_time#24, :, -1)[2] as int)) AS swim_seconds#122]
                                                            +- Relation[bib#16,name#17,country#18,gender#19,div#20,div_rank#21,overall_time#22,overall_rank#23,swim_time#24,swim_rank#25,bike_time#26,bike_rank#27,run_time#28,run_rank#29,finish_status#30] csv


In [13]:
from pyspark.ml.classification import RandomForestClassifier

# Random Forest 모델 생성 및 학습
rf = RandomForestClassifier(labelCol="rank_range_index", featuresCol="features", numTrees=50)
rf_model = rf.fit(train_data)

# 테스트 데이터로 예측
predictions = rf_model.transform(test_data)

# 결과 확인
predictions.select("features", "rank_range_index", "prediction", "probability").show(10, truncate=False)

+---------------------------------+----------------+----------+-----------------------------------------------------------------------------------+
|features                         |rank_range_index|prediction|probability                                                                        |
+---------------------------------+----------------+----------+-----------------------------------------------------------------------------------+
|[0.0,18.0,3717.0,19283.0,13253.0]|1.0             |1.0       |[0.18482090722242833,0.7876868299464286,0.02742917134849327,6.309148264984228E-5]  |
|[0.0,18.0,3776.0,18633.0,12743.0]|1.0             |1.0       |[0.08382126722337407,0.578238432057677,0.29544583802848917,0.04249446269045971]    |
|[0.0,18.0,3855.0,19772.0,14458.0]|0.0             |0.0       |[0.9330593343767631,0.0658595845421557,0.0010810810810810809,0.0]                  |
|[0.0,18.0,3943.0,18864.0,14091.0]|1.0             |1.0       |[0.3013663097761824,0.6525006007234235,0.04576566

In [14]:
# 필요한 모듈 임포트
from pyspark.ml.classification import RandomForestClassifier

# 시간 데이터를 초 단위로 변환하는 함수
def time_to_seconds(time_str):
    h, m, s = map(int, time_str.split(":"))
    return h * 3600 + m * 60 + s

# 초를 hh:mm:ss 형식으로 변환하는 함수
def seconds_to_time(seconds):
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    return f"{h:02}:{m:02}:{s:02}"

# 클래스 매핑
class_mapping = {
    0.0: "Bottom 50%",
    1.0: "Top 50%",
    2.0: "Top 25%",
    3.0: "Top 10%"
}

# 예상 기록 계산 함수
def calculate_average_times(predicted_group):
    avg_times = {
        "Bottom 50%": {"swim": 1.5 * 3600, "bike": 6.5 * 3600, "run": 4.5 * 3600},
        "Top 50%": {"swim": 1.4 * 3600, "bike": 6.0 * 3600, "run": 4.0 * 3600},
        "Top 25%": {"swim": 1.3 * 3600, "bike": 5.5 * 3600, "run": 3.5 * 3600},
        "Top 10%": {"swim": 1.2 * 3600, "bike": 5.0 * 3600, "run": 3.0 * 3600},
    }
    return avg_times[predicted_group]

# 사용자 입력 데이터 받기
print("자신의 정보를 입력해주세요:")
gender = input("성별 (남성/여성): ")
age = int(input("나이: "))
swim_time = input("수영 기록 (hh:mm:ss, 1.5km 기준): ")
bike_time = input("자전거 기록 (hh:mm:ss, 40km 기준): ")
run_time = input("달리기 기록 (hh:mm:ss, 10km 기준): ")

# 성별 인코딩 및 시간 변환
gender_encoded = 1.0 if gender == "남성" else 0.0
swim_seconds = time_to_seconds(swim_time) * (3.8 / 1.5)  # 수영 3.8km로 변환
bike_seconds = time_to_seconds(bike_time) * (180 / 40)  # 자전거 180km로 변환
run_seconds = time_to_seconds(run_time) * (42.2 / 10)  # 달리기 42.2km로 변환

# 입력 데이터 생성
input_data = [[gender_encoded, age, swim_seconds, bike_seconds, run_seconds]]
input_df = spark.createDataFrame(input_data, schema=["gender_encoded", "age_group", "swim_seconds", "bike_seconds", "run_seconds"])
input_features = assembler.transform(input_df)

# 모델 예측
predictions = rf_model.transform(input_features)
prediction = predictions.select("prediction", "probability").collect()[0]
predicted_rank = class_mapping[prediction["prediction"]]
probabilities = prediction["probability"]

# 예상 기록 계산
average_times = calculate_average_times(predicted_rank)

# 결과 출력
print("\n모델 예측 결과:")
print(f"- 예상 종합 순위 그룹: {predicted_rank}")
print(f"- 예상 부문 순위 그룹 (성별: {gender}, 나이 그룹: {age // 10 * 10}대): {predicted_rank}")

print("\n예상 기록:")
print(f"    - 수영: {seconds_to_time(average_times['swim'])}")
print(f"    - 자전거: {seconds_to_time(average_times['bike'])}")
print(f"    - 달리기: {seconds_to_time(average_times['run'])}")

print("\n종목별 개선 방향:")
print("    - 수영: 상위 10%에 진입하려면 1시간 5분 이하로 줄여야 합니다.")
print("    - 자전거: 상위 10%에 진입하려면 6시간 0분 이하로 줄여야 합니다.")
print("    - 달리기: 상위 10%에 진입하려면 3시간 0분 이하로 줄여야 합니다.")


자신의 정보를 입력해주세요:


KeyboardInterrupt: Interrupted by user

In [None]:
spark.stop()