In [None]:
### import libraries
import os 
import pandas as pd
from tqdm import tqdm
import numpy as np

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext
import pyspark.sql.functions as F 
from pprint import pprint

In [None]:
# Initialise the Spark session used by the rest of the notebook.
appName = "Fifa_DE"
# NOTE(review): `master` is defined but not passed to the builder, so Spark uses
# whatever master it is configured with. It is deliberately left unwired here to
# avoid changing the execution mode; confirm whether `.master(master)` is wanted.
master = "local"

# Build (or reuse) a single SparkSession; this is the one entry point needed.
spark = SparkSession.builder.appName(appName).getOrCreate()

# `sc` has always been bound to the SparkSession in this notebook (not to a
# SparkContext); the alias is kept in case other cells reference it.
sc = spark

# SQLContext takes a SparkContext as its first argument, so pass the session's
# context explicitly together with the session itself instead of the session alone.
sqlContext = SQLContext(spark.sparkContext, spark)

In [None]:
# Names of the columns that the notebook removes from the dataset before cleaning.
useless_cols = [
    'player_url',
    'player_face_url',
    'club_logo_url',
    'club_flag_url',
    'nation_logo_url',
    'nation_flag_url',
    'sofifa_id',
    'short_name',
    'dob',
    'club_name',
    'club_jersey_number',
    'club_loaned_from',
    'nationality_name',
    'nation_jersey_number',
    'body_type',
    'real_face',
    'goalkeeping_speed',
    'club_contract_valid_until',
]

From the exploratory data analysis (EDA), we know that:

- We will drop some unnecessary columns.
- We should clean the data.
- We should deal with the `club_joined`, `player_traits`, and `tags` columns.

In [None]:
# Read data
# The CSV location can be overridden with the FIFA_DATA_PATH environment
# variable so the notebook runs on other machines; when the variable is unset
# the original local path is used.
data_path = os.environ.get(
    'FIFA_DATA_PATH',
    '/Users/dylan/DylanLi/Code_Repo/CMU18763_Projects1/full_data.csv',
)
# header=True takes column names from the first row; inferSchema=True lets
# Spark infer column types from the file contents.
data = spark.read.csv(data_path, header=True, inferSchema=True)

In [None]:
# Remove every column named in `useless_cols` and rebind `data` to the result.
data = data.drop(*useless_cols)

### Clean Data

In [None]:
##### Find columns whose share of missing values exceeds a threshold (default 50%)
def missing_value_col(df, threshold=50):
    """Return the names of columns whose percentage of null values exceeds ``threshold``.

    Args:
        df: Spark DataFrame to inspect.
        threshold: Null percentage (on a 0-100 scale) above which a column is
            reported. Defaults to 50.

    Returns:
        A list of column names, in ``df.columns`` order, whose null percentage
        is strictly greater than ``threshold``. The list is empty when ``df``
        has no columns or no rows.

    Side effects:
        Prints ``"<column> - <rounded percentage>%"`` for each reported column.
    """
    columns = df.columns
    if not columns:
        return []

    # Count the rows once instead of once per column.
    total_rows = df.count()
    if total_rows == 0:
        # No rows means no measurable missing ratio; also avoids dividing by zero.
        return []

    # A single aggregation job yields the null count of every column at once.
    null_counts = df.select(
        [
            F.sum(F.when(F.col(name).isNull(), 1).otherwise(0)).alias(name)
            for name in columns
        ]
    ).first()

    cols_to_drop = []
    # The aggregated Row holds one value per column, in the same order as `columns`.
    for name, nulls in zip(columns, null_counts):
        missing = nulls / total_rows * 100
        if missing > threshold:
            print('{} - {}%'.format(name, round(missing)))
            cols_to_drop.append(name)
    return cols_to_drop

In [None]:
# Drop every column that missing_value_col reports as mostly null and rebind `data`.
data = data.drop(*missing_value_col(data))

In [None]:
# Build a one-row DataFrame holding, for each column of `data`, the number of
# null entries in that column, then display it.
na_counts = data.select(
    *(
        F.sum(F.when(F.col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
        for column_name in data.columns
    )
)
na_counts.show()

In [None]:
# Preview the first 5 rows of `data` after the column drops.
data.show(5)

- Now we only need to deal with `club_joined`.
- Then we should deal with the columns that have fewer missing values. We will use the `fillna` method to fill the missing values.

String columns should be filled with `NA`, and numeric columns with `0`.

In [None]:
# Fill missing values: an integer replacement value only affects numeric
# columns, so string columns are handled separately with the "NA" placeholder.
data = data.fillna(0)
na_value = "NA"
string_cols = [c for c, t in data.dtypes if t == 'string']
# One fillna call over all string columns replaces the former per-column loop,
# which added a separate projection to the query plan for every column.
data = data.fillna(na_value, subset=string_cols)

In [None]:
# Preview the first 5 rows of `data` after the missing values have been filled.
data.show(5)