In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer,StandardScaler
from pyspark.ml.classification import LinearSVC
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.functions import col

In [2]:
spark = SparkSession.builder.appName("TennisSVM").getOrCreate()

In [3]:
df = spark.read.csv("./game_based_df.csv", header=True, inferSchema=True)

In [4]:
input_columns = [
    'G_P1GmWon', 'G_P2GmWon', 'G_Ser', 'G_P1Ace', 'G_P2Ace', 'G_P1Wn', 'G_P2Wn', 
    'G_P1Df', 'G_P2Df', 'G_P1UE', 'G_P2UE', 'G_P1NP', 'G_P2NP', 'G_P1NPW', 'G_P2NPW', 
    'G_P1BP', 'G_P2BP', 'G_P1BPWon', 'G_P2BPWon', 'G_P1FW', 'G_P1BW', 'G_P2FW', 'G_P2BW', 
    'G_P1SerW', 'G_P2SerW', 'G_avg_SerSp'
]

In [5]:
indexer = StringIndexer(inputCol="SetWinP", outputCol="label")
df = indexer.fit(df).transform(df)
# 将输入特征列合并为一个特征向量
assembler = VectorAssembler(inputCols=input_columns, outputCol="features")
df = assembler.transform(df)

In [6]:
# Scale the features 
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(df)
df = scaler_model.transform(df)

In [7]:
train_df, test_df = df.randomSplit([0.8, 0.2], seed=1234)

df = df.repartition(50)  # 适当调整分区数

# 调整超参数：减少最大迭代次数和容忍度
svm = LinearSVC(featuresCol="features", labelCol="label", maxIter=50, tol=1e-4)

param_grid = ParamGridBuilder() \
    .addGrid(svm.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(svm.maxIter, [10, 50, 100]) \
    .addGrid(svm.tol, [1e-4, 1e-5, 1e-6]) \
    .build()

# Initialize the evaluator
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction")

# Set up CrossValidator for hyperparameter tuning
crossval = CrossValidator(estimator=svm,
                          estimatorParamMaps=param_grid,
                          evaluator=evaluator,
                          numFolds=3)  # 3-fold cross-validation
cv_model = crossval.fit(train_df)
predictions = cv_model.transform(test_df)

In [15]:
TP = predictions.filter((col("prediction") == 1) & (col("label") == 1)).count()
FP = predictions.filter((col("prediction") == 1) & (col("label") == 0)).count()
TN = predictions.filter((col("prediction") == 0) & (col("label") == 0)).count()
FN = predictions.filter((col("prediction") == 0) & (col("label") == 1)).count()

# 精确率 = TP / (TP + FP)
precision_manual = TP / (TP + FP) if (TP + FP) != 0 else 0
# 召回率 = TP / (TP + FN)
recall_manual = TP / (TP + FN) if (TP + FN) != 0 else 0
# F1分数 = 2 * (精确率 * 召回率) / (精确率 + 召回率)
f1_score_manual = 2 * (precision_manual * recall_manual) / (precision_manual + recall_manual) if (precision_manual + recall_manual) != 0 else 0

print(f"Precision: {precision_manual}")
print(f"Recall: {recall_manual}")
print(f"F1 Score: {f1_score_manual}")

Precision: 0.7462141868366962
Recall: 0.7390220597399361
F1 Score: 0.7426007096639532


In [30]:
evaluator = BinaryClassificationEvaluator(labelCol="label")
precision = evaluator.evaluate(predictions)
print(f"Precision: {precision}")

Precision: 0.8355882271533153
