In [12]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [2]:
# Start the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("LogisticRegression")
    .getOrCreate()
)

In [3]:
# Load the CSV, treating its first row as the header and letting Spark infer column types.
df = spark.read.csv(
    path="./game_based_df.csv",
    inferSchema=True,
    header=True,
)

In [4]:
# Print the inferred column names and types to check how the CSV was parsed.
df.printSchema()

root
 |-- match_id: string (nullable = true)
 |-- SetNo: integer (nullable = true)
 |-- GameNo: integer (nullable = true)
 |-- SetWinP: integer (nullable = true)
 |-- G_Winner: integer (nullable = true)
 |-- G_P1GmWon: integer (nullable = true)
 |-- G_P2GmWon: integer (nullable = true)
 |-- G_Ser: integer (nullable = true)
 |-- G_P1Ace: integer (nullable = true)
 |-- G_P2Ace: integer (nullable = true)
 |-- G_P1Wn: integer (nullable = true)
 |-- G_P2Wn: integer (nullable = true)
 |-- G_P1Df: integer (nullable = true)
 |-- G_P2Df: integer (nullable = true)
 |-- G_P1UE: integer (nullable = true)
 |-- G_P2UE: integer (nullable = true)
 |-- G_P1NP: integer (nullable = true)
 |-- G_P2NP: integer (nullable = true)
 |-- G_P1NPW: integer (nullable = true)
 |-- G_P2NPW: integer (nullable = true)
 |-- G_P1BP: integer (nullable = true)
 |-- G_P2BP: integer (nullable = true)
 |-- G_P1BPWon: integer (nullable = true)
 |-- G_P2BPWon: integer (nullable = true)
 |-- G_P1FW: integer (nullable = true)
 |

In [5]:
# Preview rows in vertical layout (one "column | value" line per field), which is easier to read for a wide table.
df.show(vertical=True)

-RECORD 0---------------------------
 match_id      | 2011-ausopen-1112  
 SetNo         | 4                  
 GameNo        | 1                  
 SetWinP       | 2                  
 G_Winner      | 2                  
 G_P1GmWon     | 0                  
 G_P2GmWon     | 1                  
 G_Ser         | 2                  
 G_P1Ace       | 0                  
 G_P2Ace       | 0                  
 G_P1Wn        | 0                  
 G_P2Wn        | 1                  
 G_P1Df        | 0                  
 G_P2Df        | 0                  
 G_P1UE        | 0                  
 G_P2UE        | 0                  
 G_P1NP        | 0                  
 G_P2NP        | 2                  
 G_P1NPW       | 0                  
 G_P2NPW       | 1                  
 G_P1BP        | 0                  
 G_P2BP        | 0                  
 G_P1BPWon     | 0                  
 G_P2BPWon     | 0                  
 G_P1FW        | 0                  
 G_P1BW        | 0                  
 

In [6]:
# List every column name; the model's input features are picked from these.
df.columns

['match_id',
 'SetNo',
 'GameNo',
 'SetWinP',
 'G_Winner',
 'G_P1GmWon',
 'G_P2GmWon',
 'G_Ser',
 'G_P1Ace',
 'G_P2Ace',
 'G_P1Wn',
 'G_P2Wn',
 'G_P1Df',
 'G_P2Df',
 'G_P1UE',
 'G_P2UE',
 'G_P1NP',
 'G_P2NP',
 'G_P1NPW',
 'G_P2NPW',
 'G_P1BP',
 'G_P2BP',
 'G_P1BPWon',
 'G_P2BPWon',
 'G_P1FW',
 'G_P1BW',
 'G_P2FW',
 'G_P2BW',
 'G_P1SerW',
 'G_P2SerW',
 'G_avg_SerSp',
 'G_totalPoints']

In [7]:
# Encode the SetWinP column as a numeric "label" column for the classifier.
# NOTE(review): StringIndexer assigns indices by value frequency by default, so
# which SetWinP value becomes label 0.0 depends on the data — confirm the mapping.
indexer = StringIndexer(outputCol="label", inputCol="SetWinP")
indexer_model = indexer.fit(df)
df = indexer_model.transform(df)

In [8]:
# Feature selection: every column except match_id, SetNo, GameNo, SetWinP,
# G_Winner and G_totalPoints. The order of this list fixes the order of the
# entries in the assembled vector.
input_features = [
    'G_P1GmWon', 'G_P2GmWon',
    'G_Ser',
    'G_P1Ace', 'G_P2Ace',
    'G_P1Wn', 'G_P2Wn',
    'G_P1Df', 'G_P2Df',
    'G_P1UE', 'G_P2UE',
    'G_P1NP', 'G_P2NP',
    'G_P1NPW', 'G_P2NPW',
    'G_P1BP', 'G_P2BP',
    'G_P1BPWon', 'G_P2BPWon',
    'G_P1FW', 'G_P1BW',
    'G_P2FW', 'G_P2BW',
    'G_P1SerW', 'G_P2SerW',
    'G_avg_SerSp',
]

# Combine the selected columns into a single vector column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=input_features)
df = assembler.transform(df)


In [13]:
# Feature standardization: fit a StandardScaler on "features" and append the
# result as a new "scaled_features" column.
# NOTE(review): the scaler is fitted on the full DataFrame before the train/test
# split, so test rows contribute to the scaling statistics — consider fitting on
# the training split only.
# NOTE(review): the LogisticRegression defined in this notebook reads
# featuresCol="features", not "scaled_features" — confirm which column is intended.
scaler = StandardScaler(
    outputCol="scaled_features",
    inputCol="features",
)
scaler_model = scaler.fit(df)
df = scaler_model.transform(df)

In [14]:
# Split the data into training and test sets with weights 0.8 / 0.2 and a fixed seed.
train_data, test_data = df.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

In [15]:
# Set up the binomial logistic-regression estimator, capped at 10 iterations.
# regParam and elasticNetParam are not set here; the cross-validation grid supplies them.
lr = LogisticRegression(
    family="binomial",
    maxIter=10,
    featuresCol="features",
    labelCol="label",
)


In [16]:
# Hyperparameter grid: 3 regularization strengths x 3 elastic-net mixing values = 9 candidates.
param_grid = (ParamGridBuilder()
              .addGrid(lr.regParam, [0.01, 0.1, 0.5])  # regularization strength
              .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])  # elastic-net mixing: 0.0 = L2 only, 1.0 = L1 only
              .build())

# Candidates are compared by area under the ROC curve.
evaluator_roc_auc = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

# 5-fold cross-validation over the grid. The explicit seed pins the fold
# assignment so repeated runs use the same folds, in line with the seeded
# train/test split.
crossval = CrossValidator(
    estimator=lr,
    estimatorParamMaps=param_grid,
    evaluator=evaluator_roc_auc,
    numFolds=5,
    seed=42,
)

# Train: fit every grid candidate on the training split and keep the cross-validated model.
cv_model = crossval.fit(train_data)

In [18]:
# Generate predictions for the held-out test set with the cross-validated model.
test_results = cv_model.transform(test_data)

In [19]:
# Evaluators for the test-set predictions.
# ROC-AUC is computed from the "rawPrediction" column.
evaluator_roc_auc = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# The remaining metrics compare the "prediction" column with "label".
# Precision and recall use the weighted variants (weightedPrecision / weightedRecall).
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
evaluator_precision = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision"
)
evaluator_recall = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall"
)
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1"
)

In [20]:
# Compute each metric on the test-set predictions; evaluators are listed in the
# same order as the names they are unpacked into.
roc_auc, accuracy, precision, recall, f1_score = [
    metric_evaluator.evaluate(test_results)
    for metric_evaluator in (
        evaluator_roc_auc,
        evaluator_accuracy,
        evaluator_precision,
        evaluator_recall,
        evaluator_f1,
    )
]

In [21]:
# Report the test-set metrics, one per line, rounded to four decimal places.
print(
    f"Test ROC-AUC: {roc_auc:.4f}",
    f"Test Accuracy: {accuracy:.4f}",
    f"Test Precision: {precision:.4f}",
    f"Test Recall: {recall:.4f}",
    f"Test F1-Score: {f1_score:.4f}",
    sep="\n",
)

Test ROC-AUC: 0.8415
Test Accuracy: 0.7402
Test Precision: 0.7402
Test Recall: 0.7402
Test F1-Score: 0.7401
