In [1]:
### import libraries
import os 
import pandas as pd
from tqdm import tqdm
import numpy as np

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext
import pyspark.sql.functions as F 
from pprint import pprint

In [2]:
# init a spark session
appName = "Fifa_DE"
master = "local"


sc = SparkSession.builder.appName(appName).getOrCreate()
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession.builder.getOrCreate()

23/10/31 18:18:53 WARN Utils: Your hostname, dylanli3090 resolves to a loopback address: 127.0.1.1; using 100.110.155.221 instead (on interface wlp37s0)
23/10/31 18:18:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/31 18:18:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
useless_cols = ['player_url', 'player_face_url', 'club_logo_url', 
                'club_flag_url', 'nation_logo_url', 'nation_flag_url', 'sofifa_id', 
                'short_name', 'dob', 'club_name','club_jersey_number', 'club_loaned_from', 
                'nationality_name', 'nation_jersey_number', 'body_type','real_face', 'goalkeeping_speed', 
                'club_contract_valid_until']

From Data EDA, We know that

- We will drop some unnecessary columns
- We should clean the data
- We should deal with `club_joined`, `player_traits`, and `tags` columns.

In [4]:
# Read data 
data_path = os.getcwd() + '/full_data.csv'
data = spark.read.csv(data_path, header=True, inferSchema=True)

                                                                                

In [5]:
data = data.drop(*useless_cols)

### Clean Data

In [6]:
##### Drop Columns that Missing Value are more than 50%
def missing_value_col(df):
    cols_to_drop = []
    for i in tqdm(df.columns):
        missing = df.filter(F.col(i).isNull()).count() / df.count() * 100
        if missing > 50:
            print('{} - {}%'.format(i, round(missing)))
            cols_to_drop.append(i)
    return cols_to_drop

In [7]:
data = data.drop(*missing_value_col(data))

  0%|          | 0/95 [00:00<?, ?it/s]

23/10/31 18:19:03 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: _c0
Expected: _c0 but found: 
CSV file: file:///home/dylan/repo/CMU18763_Projects1/full_data.csv
 19%|█▉        | 18/95 [00:04<00:11,  6.51it/s]

nation_team_id - 94%
nation_position - 94%


 27%|██▋       | 26/95 [00:05<00:10,  6.85it/s]

player_tags - 92%
player_traits - 55%


100%|██████████| 95/95 [00:14<00:00,  6.57it/s]


In [8]:
na_counts = data.select([F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c) for c in data.columns])
na_counts.show()

23/10/31 18:19:18 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/10/31 18:19:18 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , long_name, player_positions, overall, potential, value_eur, wage_eur, age, height_cm, weight_kg, club_team_id, league_name, league_level, club_position, club_joined, nationality_id, preferred_foot, weak_foot, skill_moves, international_reputation, work_rate, release_clause_eur, pace, shooting, passing, dribbling, defending, physic, attacking_crossing, attacking_finishing, attacking_heading_accuracy, attacking_short_passing, attacking_volleys, skill_dribbling, skill_curve, skill_fk_accuracy, skill_long_passing, skill_ball_control, movement_acceleration, movement_sprint_speed, movement_agility, movement_reactions, movement_balance, power_shot_power, power_jumping, power_stamina, power_strength, power_long_sho

+---+---------+----------------+-------+---------+---------+--------+---+---------+---------+------------+-----------+------------+-------------+-----------+--------------+--------------+---------+-----------+------------------------+---------+------------------+-----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+--------------------+---+---+---+-

In [9]:
data.show(5)

+---+--------------------+----------------+-------+---------+---------+--------+---+---------+---------+------------+--------------------+------------+-------------+-----------+--------------+--------------+---------+-----------+------------------------+-----------+------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+-------------

23/10/31 18:19:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , long_name, player_positions, overall, potential, value_eur, wage_eur, age, height_cm, weight_kg, club_team_id, league_name, league_level, club_position, club_joined, nationality_id, preferred_foot, weak_foot, skill_moves, international_reputation, work_rate, release_clause_eur, pace, shooting, passing, dribbling, defending, physic, attacking_crossing, attacking_finishing, attacking_heading_accuracy, attacking_short_passing, attacking_volleys, skill_dribbling, skill_curve, skill_fk_accuracy, skill_long_passing, skill_ball_control, movement_acceleration, movement_sprint_speed, movement_agility, movement_reactions, movement_balance, power_shot_power, power_jumping, power_stamina, power_strength, power_long_shots, mentality_aggression, mentality_interceptions, mentality_positioning, mentality_vision, mentality_penalties, mentality_composure, defending_marking_awareness, defending_standing_tackle,

- now we only need deal with `club_joined`
- Then we should deal with some columns that have less missing values. We will use `fillna` method to fill the missing values.

For string cols, we should fill with `NA` or `0` for numeric cols.

In [10]:
data = data.fillna(0)
na_value = "NA"
string_cols = [c for c, t in data.dtypes if t == 'string']
for col in tqdm(string_cols):
    data = data.fillna(na_value, subset=[col])

  0%|          | 0/33 [00:00<?, ?it/s]

100%|██████████| 33/33 [00:00<00:00, 94.50it/s] 


In [11]:
data.show(5)

23/10/31 18:19:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , long_name, player_positions, overall, potential, value_eur, wage_eur, age, height_cm, weight_kg, club_team_id, league_name, league_level, club_position, club_joined, nationality_id, preferred_foot, weak_foot, skill_moves, international_reputation, work_rate, release_clause_eur, pace, shooting, passing, dribbling, defending, physic, attacking_crossing, attacking_finishing, attacking_heading_accuracy, attacking_short_passing, attacking_volleys, skill_dribbling, skill_curve, skill_fk_accuracy, skill_long_passing, skill_ball_control, movement_acceleration, movement_sprint_speed, movement_agility, movement_reactions, movement_balance, power_shot_power, power_jumping, power_stamina, power_strength, power_long_shots, mentality_aggression, mentality_interceptions, mentality_positioning, mentality_vision, mentality_penalties, mentality_composure, defending_marking_awareness, defending_standing_tackle,

+---+--------------------+----------------+-------+---------+---------+--------+---+---------+---------+------------+--------------------+------------+-------------+-----------+--------------+--------------+---------+-----------+------------------------+-----------+------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+-------------

For ls, st, rs ..... , we should drop value after + |

In [12]:
from pyspark.sql.types import IntegerType

In [13]:
columns1 = ['ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram',
            'lm','lcm','cm','rcm','rm','lwb','ldm', 'cdm','rdm','rwb',
            'lb','lcb','cb','rcb','rb','gk']

for col in columns1:
    data = data.withColumn(col, F.split(data[col], '\+').getItem(0).cast(IntegerType()))

In [14]:
data.show(5)

+---+--------------------+----------------+-------+---------+---------+--------+---+---------+---------+------------+--------------------+------------+-------------+-----------+--------------+--------------+---------+-----------+------------------------+-----------+------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+-------------

23/10/31 18:19:20 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , long_name, player_positions, overall, potential, value_eur, wage_eur, age, height_cm, weight_kg, club_team_id, league_name, league_level, club_position, club_joined, nationality_id, preferred_foot, weak_foot, skill_moves, international_reputation, work_rate, release_clause_eur, pace, shooting, passing, dribbling, defending, physic, attacking_crossing, attacking_finishing, attacking_heading_accuracy, attacking_short_passing, attacking_volleys, skill_dribbling, skill_curve, skill_fk_accuracy, skill_long_passing, skill_ball_control, movement_acceleration, movement_sprint_speed, movement_agility, movement_reactions, movement_balance, power_shot_power, power_jumping, power_stamina, power_strength, power_long_shots, mentality_aggression, mentality_interceptions, mentality_positioning, mentality_vision, mentality_penalties, mentality_composure, defending_marking_awareness, defending_standing_tackle,

For `player_postion`, we create as a new column `player_position_class` to classify the player position.

In [15]:
from pyspark.sql.functions import split, when, col, array_contains
import itertools

# Split positions into array
split_positions = split(data['player_positions'], ', ')  

# Get distinct positions as a list
distinct_positions = list(set(list(itertools.chain(*data.select(split_positions.alias('positions')).distinct().rdd.flatMap(lambda x: x).collect()))))

# Create a column for each distinct position
for position in distinct_positions:
  data = data.withColumn(
    'Position_' + position,
     when(array_contains(split_positions, position), 1).otherwise(0)
  )


In [16]:
data = data.drop('player_positions')

In [17]:
data = data.drop('_c0')

In [21]:
# to csv file
data.coalesce(1).write.option('header', 'true').csv('fifa_data.csv')

23/10/31 18:25:32 ERROR RetryingBlockTransferor: Exception while beginning fetch of 1 outstanding blocks (after 2 retries)
java.io.IOException: Connecting to /100.110.155.221:40605 failed in the last 4750 ms, fail this connection directly
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:210)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:131)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.transferAllOutstanding(RetryingBlockTransferor.java:173)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.lambda$initiateRetry$0(RetryingBlockTransferor.java:206)
	at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
	at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExe

KeyboardInterrupt: 