In [None]:
### import libraries
import os 
import pandas as pd
from tqdm import tqdm
import numpy as np

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext
import pyspark.sql.functions as F 
from pprint import pprint

In [None]:
# Initialise Spark. One SparkSession is the entry point for everything below;
# `sc` and `sqlContext` are kept so that cells using those names keep working.
appName = "Fifa_EDA"
# NOTE(review): `master` is never passed to the builder, so it has no effect on
# the session. Add `.master(master)` to the builder chain if it is meant to apply.
master = "local"

spark = SparkSession.builder.appName(appName).getOrCreate()

# Legacy alias: despite its name, `sc` has always been bound to the
# SparkSession in this notebook, not to a SparkContext.
sc = spark

# SQLContext takes a SparkContext as its first argument. Pass the real context
# and the session explicitly instead of handing it the SparkSession.
sqlContext = SQLContext(spark.sparkContext, spark)

In [None]:
# Read data
# The CSV location can be overridden with the FIFA_DATA_PATH environment
# variable, so the notebook is not tied to one machine; the original absolute
# path stays as the default.
data_path = os.environ.get(
    'FIFA_DATA_PATH',
    '/Users/dylan/DylanLi/Code_Repo/CMU18763_Projects1/full_data.csv',
)
# header=True takes column names from the first row; inferSchema=True asks
# Spark to infer each column's type from the data.
data = spark.read.csv(data_path, header=True, inferSchema=True)

In [None]:
# Preview the first 5 rows of the loaded data.
data.show(5)

In [None]:
# Print the column names and types of the loaded DataFrame.
data.printSchema()

- We can see that there are some name and URL columns that are not useful for our analysis. We will drop them.
- We will keep `long_name` and drop `short_name` and `sofifa_id`, because they lack representative information.
- `dob` and `age` provide the same information. We keep only `age` for convenience.
- For `club_team_id` and `club_name`, we keep only `club_team_id`. Because each `club_name` corresponds to a unique `club_team_id`, it already acts as a string-indexed column, so we can use it directly in our analysis.
- `club_joined` is the date on which the player joined the club. We will keep it and later transform it into the year joined and the number of years the player has been at the club.
- For `nationality_id` and `nationality_name`, we keep the first one (`nationality_id`).
- For `tags` and `player_traits`, consider splitting them into separate columns.

In [None]:
# To check if club_team_id and club_name is one to one. 
from pyspark.sql import DataFrame

def check_one_to_one(df: DataFrame, col1: str, col2: str) -> None:
    """Print whether `col1` and `col2` of `df` are in one-to-one correspondence.

    The mapping is checked in both directions: every `col1` value must pair
    with at most one distinct `col2` value, and every `col2` value with at most
    one distinct `col1` value. Checking a single direction would report a
    many-to-one mapping as one-to-one.

    Args:
        df: DataFrame containing both columns.
        col1: Name of the first column.
        col2: Name of the second column.

    Returns:
        None. The verdict is printed; when the columns are not one-to-one the
        message includes the largest number of distinct partner values found
        for a single key, taken over both directions.
    """
    def _max_distinct_per_key(key: str, value: str) -> int:
        # Largest number of distinct `value`s observed under a single `key`.
        per_key = df.groupBy(key).agg(F.countDistinct(value).alias('count'))
        worst = per_key.agg(F.max('count')).first()[0]
        # The aggregate is None when `df` has no rows; treat that as 0.
        return worst or 0

    max_count = max(
        _max_distinct_per_key(col1, col2),
        _max_distinct_per_key(col2, col1),
    )

    # `<= 1` rather than `== 1`: countDistinct skips nulls, so a key whose
    # partner values are all null yields 0, which does not break one-to-one.
    if max_count <= 1:
        print(f'{col1} and {col2} is one by one.')
    else:
        print(f'{col1} and {col2} is different. The max_diff_count is : {max_count}')

check_one_to_one(data, 'club_team_id', 'club_name')

In [None]:
# Compare long_name against each of the other player identifiers
# (sofifa_id, then short_name) to see whether they correspond one to one.
check_one_to_one(df=data, col1='sofifa_id', col2='long_name')
check_one_to_one(df=data, col1='short_name', col2='long_name')

In [None]:
# Check whether nationality_id / nationality_name, and then
# nation_team_id / nation_position, correspond one to one.
check_one_to_one(df=data, col1='nationality_id', col2='nationality_name')
check_one_to_one(df=data, col1='nation_team_id', col2='nation_position')

In [None]:
from pyspark.sql import DataFrame

def find_url_columns(df: DataFrame):
    """Return the names of the columns of `df` whose name contains 'url'.

    The match is a case-sensitive substring test on each entry of
    `df.columns`; the result is a list in the same order as `df.columns`.
    """
    return list(filter(lambda name: 'url' in name, df.columns))

# Report which column names contain 'url'.
print(f'The columns contain url are {find_url_columns(data)}')

In [None]:
## Columns to drop before analysis
useless_cols = [
    # columns whose name contains 'url'
    'player_url', 'player_face_url', 'club_logo_url',
    'club_flag_url', 'nation_logo_url', 'nation_flag_url',
    # player identifiers not kept
    'sofifa_id', 'short_name',
    # dob duplicates the information in age
    'dob',
    # club fields (club_team_id is kept in place of club_name)
    'club_name', 'club_jersey_number', 'club_loaned_from',
    # nationality fields (nationality_id is kept in place of nationality_name)
    'nationality_name', 'nation_jersey_number',
    # remaining columns judged not useful for this analysis
    'body_type', 'real_face', 'goalkeeping_speed',
    'club_contract_valid_until',
]

Now we drop these columns. 

In [None]:
# Rebind `data` to the frame without the columns listed in `useless_cols`.
data = data.drop(*useless_cols)

In [None]:
# Preview the first 5 rows after the drop.
data.show(5)

Now we have a clean dataset and can start our analysis. #TODO