In [1]:
### import libraries
import os 
import pandas as pd
from tqdm import tqdm
import numpy as np

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext
import pyspark.sql.functions as F 
from pprint import pprint

In [2]:
# init a spark session
appName = "Fifa_EDA"
master = "local"


sc = SparkSession.builder.appName(appName).getOrCreate()
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession.builder.getOrCreate()

23/10/31 15:41:42 WARN Utils: Your hostname, dylanli3090 resolves to a loopback address: 127.0.1.1; using 100.110.155.221 instead (on interface wlp37s0)
23/10/31 15:41:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/31 15:41:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/10/31 15:41:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Read data 
data_path = '/home/dylan/repo/CMU18763_Projects1/full_data.csv'
data = spark.read.csv(data_path, header=True, inferSchema=True)

                                                                                

In [4]:
##### Drop Columns that Missing Value are more than 25%
def missing_value_col(df):
    cols_to_drop = []
    for i in tqdm(df.columns):
        missing = df.filter(F.col(i).isNull()).count() / df.count() * 100
        if missing > 25:
            print('{} - {}%'.format(i, round(missing)))
            cols_to_drop.append(i)
    return cols_to_drop

In [5]:
missing_value_col(data)

  0%|          | 0/113 [00:00<?, ?it/s]23/10/31 15:43:08 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: _c0
Expected: _c0 but found: 
CSV file: file:///home/dylan/repo/CMU18763_Projects1/full_data.csv
 19%|█▉        | 22/113 [00:04<00:13,  6.50it/s]

club_loaned_from - 94%


 24%|██▍       | 27/113 [00:05<00:12,  6.85it/s]

nation_team_id - 94%
nation_position - 94%


 26%|██▌       | 29/113 [00:05<00:11,  7.05it/s]

nation_jersey_number - 94%


 33%|███▎      | 37/113 [00:06<00:10,  7.30it/s]

release_clause_eur - 39%
player_tags - 92%


 35%|███▍      | 39/113 [00:06<00:10,  7.31it/s]

player_traits - 55%


 71%|███████   | 80/113 [00:12<00:04,  7.78it/s]

goalkeeping_speed - 89%


 98%|█████████▊| 111/113 [00:16<00:00,  7.69it/s]

nation_logo_url - 94%


100%|██████████| 113/113 [00:16<00:00,  6.77it/s]


['club_loaned_from',
 'nation_team_id',
 'nation_position',
 'nation_jersey_number',
 'release_clause_eur',
 'player_tags',
 'player_traits',
 'goalkeeping_speed',
 'nation_logo_url']

In [6]:
data.show(5)

+---+---------+--------------------+-----------------+--------------------+----------------+-------+---------+---------+--------+---+----------+---------+---------+------------+-------------------+--------------------+------------+-------------+------------------+----------------+-----------+-------------------------+--------------+----------------+--------------+---------------+--------------------+--------------+---------+-----------+------------------------+-----------+---------+---------+------------------+--------------------+--------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+------

23/10/31 15:46:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/10/31 15:46:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , sofifa_id, player_url, short_name, long_name, player_positions, overall, potential, value_eur, wage_eur, age, dob, height_cm, weight_kg, club_team_id, club_name, league_name, league_level, club_position, club_jersey_number, club_loaned_from, club_joined, club_contract_valid_until, nationality_id, nationality_name, nation_team_id, nation_position, nation_jersey_number, preferred_foot, weak_foot, skill_moves, international_reputation, work_rate, body_type, real_face, release_clause_eur, player_tags, player_traits, pace, shooting, passing, dribbling, defending, physic, attacking_crossing, attacking_finishing, attacking_heading_accuracy, attacking_short_passing, attacking_volleys, skill_dribbling, skill_curve, 

In [7]:
data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- sofifa_id: integer (nullable = true)
 |-- player_url: string (nullable = true)
 |-- short_name: string (nullable = true)
 |-- long_name: string (nullable = true)
 |-- player_positions: string (nullable = true)
 |-- overall: integer (nullable = true)
 |-- potential: integer (nullable = true)
 |-- value_eur: double (nullable = true)
 |-- wage_eur: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- dob: date (nullable = true)
 |-- height_cm: integer (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- club_team_id: double (nullable = true)
 |-- club_name: string (nullable = true)
 |-- league_name: string (nullable = true)
 |-- league_level: double (nullable = true)
 |-- club_position: string (nullable = true)
 |-- club_jersey_number: double (nullable = true)
 |-- club_loaned_from: string (nullable = true)
 |-- club_joined: date (nullable = true)
 |-- club_contract_valid_until: double (nullable = true)
 |-- nationalit

- we can see that there  are some name and url columns that are not useful for our analysis. We will drop them.
- We should drop missing value > 25% columns.
- For dob and age, they provide same information. We keep age only for convenience.
- For `club_team_id` and `club_name`. we keep `club_team_id` only. Because `club_team_id` is unique for each `club_name`, it is already a StringIndex col. We can use it directly for our analysis.
- For `club_joined`, this is the date that the player joined the club. We will keep it then transfer it to years of joined and how many years the player has been in the club.
- For `nationality_id` and `nationality_name`, we keep first one. 

In [None]:
# To check if club_team_id and club_name is one to one. 
from pyspark.sql import DataFrame

def check_one_to_one(df: DataFrame, col1: str, col2: str):
    # Group by col1 and count distinct values of col2
    counts = df.groupBy(col1).agg(F.countDistinct(col2).alias('count'))

    # Check if the maximum count is 1
    max_count = counts.agg(F.max('count')).first()[0]
    if max_count == 1: 
        return print(f'{col1} and {col2} is one by one.')
    else: 
        return print(f'{col1} and {col2} is different. The max_diff_count is : {max_count}')

check_one_to_one(data, 'club_team_id', 'club_name')

In [None]:
# To check if sofifa_id and long_name is one to one.
check_one_to_one(data, 'sofifa_id', 'long_name')
check_one_to_one(data, 'short_name', 'long_name')

In [None]:
# To check if national_id and national_name is one by one. 
check_one_to_one(data, 'nationality_id', 'nationality_name')
check_one_to_one(data, 'nation_team_id', 'nation_position')

In [None]:
from pyspark.sql import DataFrame

def find_url_columns(df: DataFrame):
    # Get a list of column names
    column_names = df.columns 
    
    url_columns = [col for col in column_names if 'url' in col]
    
    return url_columns 

print(f'The columns contain url are {find_url_columns(data)}')

In [8]:
## identify other unuseful columns
useless_cols = ['player_url', 'player_face_url', 'club_logo_url', 
                'club_flag_url', 'nation_logo_url', 'nation_flag_url', 'sofifa_id', 
                'short_name', 'dob', 'club_name','club_jersey_number', 'club_loaned_from', 
                'nationality_name', 'nation_jersey_number', 'body_type','real_face', 'goalkeeping_speed', 
                'club_contract_valid_until', 'club_loaned_from', 'nation_team_id', 'nation_position',
                'release_clause_eur', 'player_tags', 
                'player_traits']

Now we drop these columns. 

In [9]:
data = data.drop(*useless_cols)

In [10]:
data.show(5)

+---+--------------------+----------------+-------+---------+---------+--------+---+---------+---------+------------+--------------------+------------+-------------+-----------+--------------+--------------+---------+-----------+------------------------+-----------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+--------------------+----+----+-

23/10/31 15:47:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , long_name, player_positions, overall, potential, value_eur, wage_eur, age, height_cm, weight_kg, club_team_id, league_name, league_level, club_position, club_joined, nationality_id, preferred_foot, weak_foot, skill_moves, international_reputation, work_rate, pace, shooting, passing, dribbling, defending, physic, attacking_crossing, attacking_finishing, attacking_heading_accuracy, attacking_short_passing, attacking_volleys, skill_dribbling, skill_curve, skill_fk_accuracy, skill_long_passing, skill_ball_control, movement_acceleration, movement_sprint_speed, movement_agility, movement_reactions, movement_balance, power_shot_power, power_jumping, power_stamina, power_strength, power_long_shots, mentality_aggression, mentality_interceptions, mentality_positioning, mentality_vision, mentality_penalties, mentality_composure, defending_marking_awareness, defending_standing_tackle, defending_sliding_t

Now, we have a clean datase, we can start our analysis. #TODO 

Let's explore the mode of `club_joined` 

In [12]:
from pyspark.sql.functions import when, count, col

missing_values = data.select([count(when(col('club_joined').isNull(), 1)).alias('club_joined')]).show()

+-----------+
|club_joined|
+-----------+
|       9935|
+-----------+



In [19]:
data = data.withColumn('club_joined_unix', F.unix_timestamp(F.col('club_joined'))).drop('club_joined')

In [20]:
data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- long_name: string (nullable = true)
 |-- player_positions: string (nullable = true)
 |-- overall: integer (nullable = true)
 |-- potential: integer (nullable = true)
 |-- value_eur: double (nullable = true)
 |-- wage_eur: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- height_cm: integer (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- club_team_id: double (nullable = true)
 |-- league_name: string (nullable = true)
 |-- league_level: double (nullable = true)
 |-- club_position: string (nullable = true)
 |-- nationality_id: integer (nullable = true)
 |-- preferred_foot: string (nullable = true)
 |-- weak_foot: integer (nullable = true)
 |-- skill_moves: integer (nullable = true)
 |-- international_reputation: integer (nullable = true)
 |-- work_rate: string (nullable = true)
 |-- pace: double (nullable = true)
 |-- shooting: double (nullable = true)
 |-- passing: double (nullable = true)
 |-- dribbling: dou

In [21]:
# test
df_missing = data.filter(F.col("club_joined_unix").isNull())
df_non_missing = data.filter(F.col("club_joined_unix").isNotNull())

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

features = ["feature1", "feature2"]  # Replace with your actual features
assembler = VectorAssembler(inputCols=features, outputCol="features")
kmeans = KMeans(k=3)  # Choose an appropriate number of clusters

pipeline = Pipeline(stages=[assembler, kmeans])
model = pipeline.fit(df_non_missing)