In [None]:
# Cell 1: Import necessary libraries and initialize Spark session
import pyspark
from pyspark.sql import SparkSession
from pprint import pprint
from pyspark.sql.functions import col, regexp_replace, when
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import matplotlib.pyplot as plt
import numpy as np

# Initialize Spark session
# getOrCreate() hands back an already-running session when there is one, so
# re-running this cell does not start a second session.
spark = SparkSession.builder.appName("FIFA Clustering").getOrCreate()


In [None]:
# Cell 2: Read the data and display the schema
# Read the data
# header=True takes the column names from the first CSV row; inferSchema=True
# asks Spark to infer each column's type from the file contents.
# The path is relative, so it is resolved against the kernel's working directory.
data = spark.read.csv("dataFIFA.csv", header=True, inferSchema=True)

# Display the schema
# Printed so the inferred column types can be checked before any processing.
data.printSchema()


In [None]:
# Cell 3: Select relevant columns and handle missing values, including converting Wage to numeric
# Select relevant columns and handle missing values
# "Name" and "Wage" are carried along for reporting; the feature vector built in
# Cell 4 uses only the other five columns.
selected_features = ["Name", "Overall", "Potential", "Age", "Stamina", "Dribbling", "Wage"]
# dropna() with no arguments removes any row that has a null in one of the selected columns.
# NOTE: `data` is rebound here, so the raw frame loaded in Cell 2 is no longer reachable
# without re-running Cell 2.
data = data.select(selected_features).dropna()

# Turn a currency string column into a numeric column expression
def convert_currency(col):
    """Build a Column expression that parses a euro amount into a double.

    The "€" sign is stripped in every branch. A value ending in "M" is
    multiplied by 1e6, a value ending in "K" by 1e3, and anything else is
    cast to double as-is.

    Args:
        col: the Column holding the currency text. (The name shadows the
            imported ``col`` function inside this body, which is harmless
            because the function is not called here.)

    Returns:
        A Column expression; nothing is evaluated until it is used in a query.
    """
    # One candidate expression per suffix
    as_millions = regexp_replace(col, "[€M]", "").cast("double") * 1e6
    as_thousands = regexp_replace(col, "[€K]", "").cast("double") * 1e3
    as_plain = regexp_replace(col, "€", "").cast("double")

    # Branch order matters: "M" is tested first, then "K", then the fallback
    parsed = when(col.endswith("M"), as_millions)
    parsed = parsed.when(col.endswith("K"), as_thousands)
    return parsed.otherwise(as_plain)

# Convert Wage to numeric
# withColumn() with an existing column name replaces that column, so "Wage"
# now holds the expression built by convert_currency() instead of the original text.
data = data.withColumn("Wage", convert_currency(col("Wage")))


In [None]:
# Cell 4: Build the model input - one vector column per row, then a scaled copy of it
# Only the five numeric attributes go into the vector; Name and Wage are left out
assembler = VectorAssembler(
    inputCols=["Overall", "Potential", "Age", "Stamina", "Dribbling"],
    outputCol="features",
)
data = assembler.transform(data)

# StandardScaler is used with its default settings: fit on the assembled
# vectors, then append the result as "scaledFeatures"
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)
scalerModel = scaler.fit(data)
data = scalerModel.transform(data)


In [None]:
# Cell 5: Finding the top 10 players with the highest salaries
top_10_salaries = data.orderBy(data['Wage'].desc()).limit(10)

# Collect Name and Wage together in a single action. Two separate collect()
# calls each re-run the sort+limit, and when several players tie on Wage the
# two runs are free to return the tied rows in a different order (or pick
# different rows at the cut-off), which would leave the name list and the wage
# list describing different players at the same index.
top_10_rows = top_10_salaries.select("Name", "Wage").collect()
top_10_salaries_wages = [row['Wage'] for row in top_10_rows]
top_10_salaries_names = [row['Name'] for row in top_10_rows]


In [None]:
# Cell 6: Choose the number of clusters by silhouette score
evaluator = ClusteringEvaluator(
    featuresCol='scaledFeatures',
    predictionCol='prediction',
    metricName='silhouette',
)

# One (k, silhouette) pair is recorded for every candidate k in 2..10
silhouette_scores = []
for k in range(2, 11):
    kmeans = KMeans(k=k, featuresCol='scaledFeatures', seed=1)
    model = kmeans.fit(data)
    predictions = model.transform(data)
    silhouette = evaluator.evaluate(predictions)
    silhouette_scores.append((k, silhouette))

# max() returns the first maximal pair, so on equal scores the smaller k is kept
best_k, best_score = max(silhouette_scores, key=lambda pair: pair[1])
optimal_clusters = best_k
print(f"Optimal number of clusters: {optimal_clusters}")


In [None]:
# Cell 7: Evaluate clustering on original features
# Baseline for the PCA comparison: KMeans on the full scaled feature vector.
# The evaluator is rebuilt with the same settings as in Cell 6; this global
# `evaluator` is also the one apply_pca_and_cluster() (Cell 8) reads.
evaluator = ClusteringEvaluator(predictionCol='prediction', metricName='silhouette', featuresCol='scaledFeatures')
# Same k, seed and input as the winning iteration of the Cell 6 loop.
kmeans = KMeans(k=optimal_clusters, featuresCol='scaledFeatures', seed=1)
model = kmeans.fit(data)
predictions = model.transform(data)
# Kept for the side-by-side printout in Cell 10.
original_silhouette = evaluator.evaluate(predictions)


In [None]:
# Cell 8: Define function for PCA and clustering
def apply_pca_and_cluster(n_components, data, top_10_salaries_wages, n_clusters=None, silhouette_evaluator=None):
    """Project the scaled features with PCA, cluster the projection, and summarise it.

    Prints the size of every cluster as a side effect.

    Args:
        n_components: number of principal components to keep.
        data: DataFrame containing the "scaledFeatures" vector column and a
            "Wage" column.
        top_10_salaries_wages: wage values of the top-paid players. Rows whose
            Wage equals any of these values are counted per cluster.
        n_clusters: k for KMeans. When omitted, the notebook-level
            `optimal_clusters` is used.
        silhouette_evaluator: evaluator used for the returned score. When
            omitted, the notebook-level `evaluator` is used.

    Returns:
        (cluster_counts, silhouette): `cluster_counts` is a list of Rows with
        fields "prediction" and "count", ordered by cluster id and containing
        only clusters that hold at least one matching player; `silhouette` is
        the value returned by the evaluator.
    """
    # Fall back to the notebook globals so existing three-argument calls behave as before
    if n_clusters is None:
        n_clusters = optimal_clusters
    if silhouette_evaluator is None:
        silhouette_evaluator = evaluator

    pca_col = f"pcaFeatures_{n_components}"
    pca_model = PCA(k=n_components, inputCol="scaledFeatures", outputCol=pca_col).fit(data)
    pca_data = pca_model.transform(data)

    model = KMeans(k=n_clusters, featuresCol=pca_col, seed=1).fit(pca_data)

    # `predictions` feeds three separate Spark actions below (show, collect,
    # evaluate); caching avoids recomputing the PCA + KMeans transform for each.
    predictions = model.transform(pca_data).cache()
    try:
        cluster_summary = predictions.groupBy('prediction').count().orderBy('prediction')
        cluster_summary.show()

        # Count the number of top 10 salary players in each cluster
        # NOTE(review): matching is by wage value, so any player whose wage ties
        # with one of the listed values is counted too and the counts can sum to
        # more than the number of listed players.
        top_10_predictions = predictions.filter(predictions["Wage"].isin(top_10_salaries_wages))
        cluster_counts = top_10_predictions.groupBy('prediction').count().orderBy('prediction').collect()

        # NOTE(review): the notebook's default evaluator is built with
        # featuresCol='scaledFeatures', so with that default the score is measured
        # in the scaled feature space, not in the PCA output column - confirm
        # this is the intended comparison.
        silhouette = silhouette_evaluator.evaluate(predictions)
    finally:
        predictions.unpersist()

    return cluster_counts, silhouette


In [None]:
# Cell 9: Apply PCA and clustering with 5, 4, and 3 components
# 5 equals the number of assembled input features, so that run keeps every component.
components = [5, 4, 3]
# Both dictionaries are keyed by the number of components and read back in Cell 10.
cluster_results = {}   # n -> per-cluster Rows of top-salary player counts
pca_silhouettes = {}   # n -> silhouette score returned for that run
for n in components:
    print(f"For {n} components:")
    # apply_pca_and_cluster() also prints the per-cluster sizes for this run.
    cluster_results[n], silhouette = apply_pca_and_cluster(n, data, top_10_salaries_wages)
    pca_silhouettes[n] = silhouette


In [None]:
# Cell 10: Report, for each PCA setting, how many top-salary players every cluster holds
for n in components:
    print(f"PCA with {n} components:")
    print("+----------+-----+")
    print("|clusterID |The number of players from the 10 players with biggest salary |")
    print("+----------+-----+")

    # Start every cluster at zero so clusters without a top-salary player still show up,
    # then add the counts returned for this run
    cluster_counts = dict.fromkeys(range(optimal_clusters), 0)
    for row in cluster_results[n]:
        cluster_counts[row['prediction']] += row['count']

    # One dictionary per cluster, pretty-printed as the table body
    table_rows = [
        {'clusterID': cluster, 'top_10_count': count}
        for cluster, count in cluster_counts.items()
    ]
    pprint(table_rows)

    print("+----------+-----+")

# Silhouette of the un-projected clustering first, then one line per PCA setting
print(f"Silhouette score for original features: {original_silhouette}")
for n in components:
    print(f"Silhouette score for PCA with {n} components: {pca_silhouettes[n]}")


In [None]:
# Cell 11: Apply PCA to reduce the dimensions to 2 components and plot the results
# Same PCA -> KMeans steps as apply_pca_and_cluster(), done inline so that
# `pca_data` and `predictions` remain available as globals for the plotting cell.
pca = PCA(k=2, inputCol="scaledFeatures", outputCol="pcaFeatures_2")
pca_model = pca.fit(data)
pca_data = pca_model.transform(data)

# Clustering using KMeans with the new optimal number of clusters
# The model is fit on the 2-component projection, with k taken from Cell 6.
kmeans = KMeans(k=optimal_clusters, featuresCol='pcaFeatures_2', seed=1)
model = kmeans.fit(pca_data)
# `predictions` keeps every column of `pca_data` and adds the "prediction" column.
predictions = model.transform(pca_data)


In [None]:
# Cell 12: Plotting the results with updated clusters
# Fetch everything the plot needs in ONE Spark action. Collecting the
# coordinates, cluster ids, wages and names with separate collect() calls
# relies on every action returning rows in the same order; a single collect
# keeps each player's values together by construction and runs one job
# instead of four.
plot_rows = predictions.select("pcaFeatures_2", "prediction", "Wage", "Name").collect()

# toArray() turns each Spark vector into a NumPy row; reshape keeps the array
# two-dimensional even when there are no rows, so the column slices below work.
pca_features = np.array([row["pcaFeatures_2"].toArray() for row in plot_rows]).reshape(-1, 2)
cluster_assignments = np.array([row["prediction"] for row in plot_rows])
wages = np.array([row["Wage"] for row in plot_rows])
names = np.array([row["Name"] for row in plot_rows])

# Identify a top-10 player by (name, wage) rather than by name alone, so a
# lower-paid namesake that happens to come first is not highlighted by mistake.
top_10_players = set(zip(top_10_salaries_names, top_10_salaries_wages))

# Row positions of the players to highlight, at most one per (name, wage) pair
highlight_idx = []
labeled_players = set()
for i, row in enumerate(plot_rows):
    player = (row["Name"], row["Wage"])
    if player in top_10_players and player not in labeled_players:
        labeled_players.add(player)
        highlight_idx.append(i)

fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(pca_features[:, 0], pca_features[:, 1], c=cluster_assignments, cmap='viridis', marker='o', label='Clusters')

# Highlighting top 10 players with the biggest salary and labeling with player names
if highlight_idx:
    ax.scatter(
        pca_features[highlight_idx, 0],
        pca_features[highlight_idx, 1],
        color='red',
        edgecolors='w',
        s=200,
        label='Top 10 wage',
    )
    for i in highlight_idx:
        ax.text(pca_features[i, 0], pca_features[i, 1], plot_rows[i]["Name"], fontsize=9, ha='right')

ax.set_title('2D PCA Plot of Clusters and Top 10 Salary Players')
ax.set_xlabel('PCA Feature 1')
ax.set_ylabel('PCA Feature 2')
ax.legend()
plt.show()


In [None]:
# Cell 13: List the top earners in the order they were collected (highest wage first)
print("Players with the highest wages:")
# The two lists are parallel, so walk them together instead of indexing by position
for player_name, wage in zip(top_10_salaries_names, top_10_salaries_wages):
    print(f"Player: {player_name}, Wage: €{wage:.0f}")