In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, corr
import numpy as np
import pickle as pkl

In [2]:
spark_home = "D:\Ali_Other\Spark\spark_unzipped"
os.environ["SPARK_HOME"] = spark_home

# Add Spark bin and executors to PATH
os.environ["PATH"] += os.pathsep + os.path.join(spark_home, "bin")
os.environ["PATH"] += os.pathsep + os.path.join(spark_home, "sbin")

# Add Spark Python libraries to PYTHONPATH
os.environ["PYTHONPATH"] = os.path.join(spark_home, "python") + os.pathsep + os.environ.get("PYTHONPATH", "")
os.environ["PYTHONPATH"] += os.pathsep + os.path.join(spark_home, "python", "lib")

# Add PySpark to the system path
os.environ["PATH"] += os.pathsep + os.path.join(spark_home, "python", "lib", "pyspark.zip")
os.environ["PATH"] += os.pathsep + os.path.join(spark_home, "python", "lib", "py4j-0.10.9-src.zip")

os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
# Create a SparkSession
spark = SparkSession.builder \
    .appName("PySpark-Script") \
    .getOrCreate()

In [4]:
# Load the data
data = spark.read.csv("Data/cleaned_data.csv", header=True, inferSchema=True)
# print data schema
data.printSchema()


root
 |-- player_id: double (nullable = true)
 |-- fifa_version: double (nullable = true)
 |-- short_name: string (nullable = true)
 |-- player_positions: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- potential: double (nullable = true)
 |-- value_eur: double (nullable = true)
 |-- wage_eur: double (nullable = true)
 |-- age: double (nullable = true)
 |-- height_cm: double (nullable = true)
 |-- weight_kg: double (nullable = true)
 |-- league_id: double (nullable = true)
 |-- league_name: string (nullable = true)
 |-- league_level: double (nullable = true)
 |-- club_team_id: double (nullable = true)
 |-- club_name: string (nullable = true)
 |-- club_position: string (nullable = true)
 |-- club_jersey_number: double (nullable = true)
 |-- club_contract_valid_until_year: double (nullable = true)
 |-- nationality_id: double (nullable = true)
 |-- nationality_name: string (nullable = true)
 |-- preferred_foot: double (nullable = true)
 |-- weak_foot: double (nullable

In [5]:
K = 5
X = ['short_name', 'overall', 'value_eur', 'wage_eur', 'international_reputation', 'potential', 'movement_reactions', 'body_type_Unique', 'mentality_composure', 'rcm', 'cm', 'lcm', 'mentality_vision', 'ram', 'cam', 'lam', 'rm', 'lm', 'rs', 'st', 'ls', 'cf', 'lf', 'rf']


In [6]:
# Select the features
data = data.select(X)

# split the data into training and testing
train, test = data.randomSplit([0.8, 0.2])

# # Correlation matrix
# corr_matrix = np.zeros((len(X), len(X)))
# for i in range(1, len(X)):
#     for j in range(1, len(X)):
#         corr_matrix[i, j] = train.corr(X[i], X[j])


In [7]:
# Split the train into features and labels
# the features are X[2:] and the label is X[1]
features = train.select(X[2:])
label = train.select(X[1])




In [8]:
# Take one random sample from test
sample = test.sample(False, 0.1)
# print it
print(sample.take(1))

[Row(short_name='A. Ajagun', overall=0.5555555555555556, value_eur=0.0077268439528038815, wage_eur=0.051348999129677976, international_reputation=0.0, potential=0.6538461538461536, movement_reactions=0.5921052631578947, body_type_Unique=0.0, mentality_composure=0.5891666677255442, rcm=0.5974025974025974, cm=0.5974025974025974, lcm=0.5974025974025974, mentality_vision=0.6989247311827957, ram=0.6835443037974683, cam=0.6835443037974683, lam=0.6835443037974683, rm=0.6794871794871795, lm=0.6794871794871795, rs=0.6582278481012659, st=0.6582278481012659, ls=0.6582278481012659, cf=0.6875, lf=0.6875, rf=0.6875)]


In [9]:
train_rdd = train.rdd

In [23]:
distances = train_rdd.map(lambda x: (x[0], np.dot(x[2:], sample[2:])))
# topK = distances.top(K, lambda x: x[1])
# print(distances)
# print distances

# show first element in train_rdd
print(train_rdd.take(1))
print(distances.take(1))


[Row(short_name='A. Abdennour', overall=0.7222222222222221, value_eur=0.0386548384270022, wage_eur=0.10356832027850305, international_reputation=0.25, potential=0.769230769230769, movement_reactions=0.736842105263158, body_type_Unique=0.0, mentality_composure=0.5891666677255442, rcm=0.6623376623376624, cm=0.6623376623376624, lcm=0.6623376623376624, mentality_vision=0.5268817204301076, ram=0.5949367088607596, cam=0.5949367088607596, lam=0.5949367088607596, rm=0.6282051282051282, lm=0.6282051282051282, rs=0.6075949367088609, st=0.6075949367088609, ls=0.6075949367088609, cf=0.625, lf=0.625, rf=0.625)]


Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\serializers.py", line 459, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 73, in dumps
    cp.dump(obj)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 632, in dump
    return Pickler.dump(self, obj)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\context.py", line 466, in __getnewargs__
    raise PySparkRuntimeError(
pyspark.errors.exceptions.base.PySparkRuntimeError: [CONTEXT_ONLY_VALID_ON_DRIVER] It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more informatio

PicklingError: Could not serialize object: PySparkRuntimeError: [CONTEXT_ONLY_VALID_ON_DRIVER] It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.

In [None]:
def sort_locally(partition):
    # calculate distance between each point and the 


In [None]:
items_rdd = league_rdd.map(itemize)

In [None]:
def sort_and_itemize(partition):
    sorted_partition = sorted(partition, key=lambda x: x[1])  # Adjust the key function as needed
    return map(itemize, sorted_partition)

sorted_items_rdd = league_rdd.mapPartitions(sort_and_itemize)