In [1]:
### import libraries
import os 
import pandas as pd
from tqdm import tqdm
import numpy as np

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext
from pyspark.sql.functions import lit, monotonically_increasing_id

import argparse

####  Help function

In [2]:
# For pandas
# Reducing dataframe memory usage :-
def ReduceMemory(df: pd.DataFrame):
    """
    This function reduces the associated dataframe's memory usage.
    It reassigns the data-types of columns according to their min-max values.
    It also displays the dataframe information after memory reduction.
    """;
    
    # Reducing float column memory usage:-
    for col in tqdm(df.iloc[0:2, 1:].select_dtypes('float').columns):
        col_min = np.amin(df[col].dropna());
        col_max = np.amax(df[col].dropna());
        
        if col_min >= np.finfo(np.float16).min and col_max <= np.finfo(np.float16).max: 
            df[col] = df[col].astype(np.float16)
        elif col_min >= np.finfo(np.float32).min and col_max <= np.finfo(np.float32).max : 
            df[col] = df[col].astype(np.float32)
        else: pass;

    # Reducing integer column memory usage:-
    for col in tqdm(df.iloc[0:2, 1:].select_dtypes('int').columns):
        col_min = df[col].min(); 
        col_max = df[col].max();
        
        if col_min >= np.iinfo(np.int8).min and col_max <= np.iinfo(np.int8).max:
            df[col] = df[col].astype(np.int8);
        elif col_min >= np.iinfo(np.int16).min and col_max <= np.iinfo(np.int16).max:
            df[col] = df[col].astype(np.int16);
        elif col_min >= np.iinfo(np.int32).min & col_max <= np.iinfo(np.int32).max:
            df[col] = df[col].astype(np.int32);
        else: pass;
        
    display(df.info());
    
    return df;

## Read Data

In [3]:
# init a spark session
appName = "Project1"
master = "local"


sc = SparkSession.builder.appName(appName).getOrCreate()
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession.builder.getOrCreate()

23/11/08 11:11:48 WARN Utils: Your hostname, Dylans-Macbook-Pro-16.local resolves to a loopback address: 127.0.0.1; using 192.168.4.36 instead (on interface en0)
23/11/08 11:11:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/08 11:11:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [49]:
# fifa data folder should contain all the csv files from Fifa(Kaggle), 2015-2022
# assume that you are working in the same directory as the data folder
full_data_path = os.getcwd() + '/full_data.csv'

if not os.path.exists(full_data_path):
    data_path = os.getcwd() + '/fifadata'
    if os.path.exists(data_path):
        print("Data folder exists")
    else:
        print("Data folder does not exist")
        os.makedirs(data_path)
        print("Sussessfully created data folder")

    csv_files = [os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith('.csv')]
    print(csv_files)
    combined_df = None
    for file in csv_files:
        year = file.split("players_")[1].split(".csv")[0]
        df = spark.read.csv(file, header=True, inferSchema=True)
        df = df.withColumn("year", lit(year)) # this is the unique column 'year'
        if combined_df is None:
            combined_df = df
        else:
            combined_df = combined_df.union(df)
    combined_df = combined_df.withColumn("id", monotonically_increasing_id())

    # Write the concatenated DataFrame to a new CSV file
    output_file = "/Users/dylan/DylanLi/Code_Repo/CMU18763_Projects1/full_data.csv"
    ReduceMemory(combined_df.toPandas()).to_csv(output_file)
else: 
    df = spark.read.csv(full_data_path, header=True, inferSchema=True)


In [5]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- sofifa_id: integer (nullable = true)
 |-- player_url: string (nullable = true)
 |-- short_name: string (nullable = true)
 |-- long_name: string (nullable = true)
 |-- player_positions: string (nullable = true)
 |-- overall: integer (nullable = true)
 |-- potential: integer (nullable = true)
 |-- value_eur: double (nullable = true)
 |-- wage_eur: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- dob: date (nullable = true)
 |-- height_cm: integer (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- club_team_id: double (nullable = true)
 |-- club_name: string (nullable = true)
 |-- league_name: string (nullable = true)
 |-- league_level: double (nullable = true)
 |-- club_position: string (nullable = true)
 |-- club_jersey_number: double (nullable = true)
 |-- club_loaned_from: string (nullable = true)
 |-- club_joined: date (nullable = true)
 |-- club_contract_valid_until: double (nullable = true)
 |-- nationalit

## Using pyspark to read table and write to PostgreSQL

In [None]:
db_properties={}
db_properties['username']="postgres"
db_properties['password']="010323"
db_properties['url']= "jdbc:postgresql://localhost:5432/postgres"
db_properties['table']="fifa"
db_properties['driver']="org.postgresql.Driver"

In [None]:
df.write.format("jdbc")\
.mode("overwrite")\
.option("url", db_properties['url'])\
.option("dbtable", db_properties['table'])\
.option("user", db_properties['username'])\
.option("password", db_properties['password'])\
.option("Driver", db_properties['driver'])\
.save()

In [None]:
df_read = sqlContext.read.format("jdbc")\
    .option("url", db_properties['url'])\
    .option("dbtable", db_properties['table'])\
    .option("user", db_properties['username'])\
    .option("password", db_properties['password'])\
    .option("Driver", db_properties['driver'])\
    .load()
    
df_read.show()

In [None]:
df_read.printSchema()

## Task II 

### Question 1
Find X clubs that have the highest number of players with contracts ending in 2023

We find that En Avant de Guingamp have the most players whose contract end in 2023 in FIFA 2022 dataset.

#### In spark SQL

In [None]:
df_read.createOrReplaceTempView("df_view")

In [None]:
sqlWay1 = spark.sql("""
SELECT dv.club_name, COUNT(*) AS player_count
FROM   df_view dv
WHERE  dv.year = 22 AND dv.club_contract_valid_until = 2023
GROUP BY dv.club_name
ORDER BY player_count DESC
LIMIT 10;
                    """)

In [None]:
sqlWay1.show()

#### In original pyspark

In [35]:
from pyspark.sql import functions as F

def find_top_clubs_expiring_contracts(df, num_clubs):

  top_clubs = (df.filter((F.col('year') == '22') & 
                        (F.col('club_contract_valid_until') == 2023))
               .groupBy('club_team_id', 'club_name')
               .agg(F.count('sofifa_id').alias('num_players'))
               .orderBy(F.desc('num_players'))
               .limit(num_clubs))
               
  top_names = [row.club_name for row in top_clubs.collect()]
  
  return top_clubs, top_names

In [36]:
clubs, names = find_top_clubs_expiring_contracts(df_read, 10)

In [37]:
print(f"{names} Club with highest number of players with contracts ending in 2023 in FIFA 2022")

['En Avant de Guingamp', 'Club Atlético Lanús', 'Lechia Gdańsk', 'Kasimpaşa SK', 'Barnsley', 'Bengaluru FC', 'FC Barcelona', 'Zagłębie Lubin', 'SV Wehen Wiesbaden', 'KAA Gent'] Club with highest number of players with contracts ending in 2023 in FIFA 2022


In [38]:
clubs.show()

+------------+--------------------+-----------+
|club_team_id|           club_name|num_players|
+------------+--------------------+-----------+
|        62.0|En Avant de Guingamp|         19|
|    110395.0| Club Atlético Lanús|         17|
|    111091.0|       Lechia Gdańsk|         17|
|    111339.0|        Kasimpaşa SK|         16|
|      1932.0|            Barnsley|         16|
|    113302.0|        Bengaluru FC|         16|
|       241.0|        FC Barcelona|         15|
|    110749.0|      Zagłębie Lubin|         15|
|       492.0|  SV Wehen Wiesbaden|         15|
|       674.0|            KAA Gent|         15|
+------------+--------------------+-----------+



### Question 2 


#### In spark SQL

In [None]:
sqlWay2 = spark.sql("""
WITH club_counts AS (
  SELECT 
    dv.club_name, 
    dv.year, 
    COUNT(*) AS player_count
  FROM df_view dv
  WHERE dv.age > 27
  GROUP BY dv.club_name, dv.year
),

club_averages AS (
  SELECT 
    cc.club_name,
    AVG(cc.player_count) AS avg_player_count
  FROM club_counts cc
  GROUP BY cc.club_name  
)

SELECT
  ca.club_name,
  ca.avg_player_count AS average_count
FROM club_averages ca
WHERE (
  SELECT COUNT(*) 
  FROM club_averages ca2 
  WHERE ca2.avg_player_count > ca.avg_player_count
) < 10
ORDER BY ca.avg_player_count DESC;
""")

In [None]:
sqlWay2.show()

#### In original pyspark

In [52]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def find_top_clubs_by_avg_older_players(df, num_clubs):

  window = Window.orderBy(F.desc('avg_count'))
  df = df.repartition(100)
  top_clubs = (df.filter(F.col('age') > 27)
               .groupBy('year', 'club_team_id', 'club_name')
               .agg(F.count('sofifa_id').alias('count'))
               .groupBy('club_team_id', 'club_name')
               .agg(F.avg('count').alias('avg_count'))
               .withColumn('rank', F.rank().over(window))
               .filter(F.col('rank') <= num_clubs)
               .select('club_team_id', 'club_name'))
               
  top_names = [row.club_name for row in top_clubs.collect()]



  return top_clubs, top_names

In [53]:
clubs, names = find_top_clubs_by_avg_older_players(df_read, 10)

23/10/12 20:51:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 2

In [54]:
clubs.show()

23/10/12 20:51:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.

+------------+--------------------+
|club_team_id|           club_name|
+------------+--------------------+
|        null|                null|
|    111032.0|  Dorados de Sinaloa|
|    113158.0| Matsumoto Yamaga FC|
|    110955.0| Shanghai Shenhua FC|
|    112961.0|          Qingdao FC|
|    110974.0|Club Deportivo Jo...|
|    101006.0|            Altay SK|
|    114688.0|         Guaireña FC|
|    101014.0|İstanbul Başakşeh...|
|    114511.0|      Sport Huancayo|
|       749.0|      BB Erzurumspor|
|    101108.0|        Club Olimpia|
+------------+--------------------+



23/10/12 20:51:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/12 20:51:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
          

In [55]:
print(f"{names} clubs with highest avg players over 27:")

[None, 'Dorados de Sinaloa', 'Matsumoto Yamaga FC', 'Shanghai Shenhua FC', 'Qingdao FC', 'Club Deportivo Jorge Wilstermann', 'Altay SK', 'Guaireña FC', 'İstanbul Başakşehir FK', 'Sport Huancayo', 'BB Erzurumspor', 'Club Olimpia'] clubs with highest avg players over 27:


### Question 3

#### In spark SQL

In [None]:
sqlWay3 = spark.sql("""
WITH yearly_counts AS (
    SELECT dv.year, dv.nation_position, COUNT(*) AS position_count
    FROM df_view dv
    WHERE dv.nation_position IS NOT NULL
    GROUP BY dv.year, dv.nation_position
),
max_counts AS (
    SELECT yc.year, MAX(yc.position_count) AS max_count
    FROM yearly_counts yc
    GROUP BY yc.year
)
SELECT mc.year, yc.nation_position, mc.max_count
FROM max_counts mc
JOIN yearly_counts yc ON mc.year = yc.year AND mc.max_count = yc.position_count
ORDER BY mc.year;

""")

In [None]:
sqlWay3.show()

#### In original pyspark

In [58]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def most_frequent_nation_position(df):

  window = Window.partitionBy('year').orderBy(F.desc('count'))

  top_positions = (df.filter(F.col('nation_position').isNotNull())
                  .groupBy('year', 'nation_position')
                  .agg(F.count('sofifa_id').alias('count'))
                  .withColumn('rank', F.rank().over(window))
                  .filter(F.col('rank') == 1)
                  .select('year', 'nation_position'))

  return top_positions

In [59]:
most_frequent_nation_position(df_read).show()

+----+---------------+
|year|nation_position|
+----+---------------+
|  15|            SUB|
|  16|            SUB|
|  17|            SUB|
|  18|            SUB|
|  19|            SUB|
|  20|            SUB|
|  21|            SUB|
|  22|            SUB|
+----+---------------+



#### Data Preprocessing

##### Drop Useless column

In [131]:
# Drop useless column
useless_cols = ['player_url', 'player_face_url', 'club_logo_url', 
                'club_flag_url', 'nation_logo_url', 'nation_flag_url', 'sofifa_id', 
                'short_name', 'dob', 'club_name','club_jersey_number', 'club_loaned_from', 
                'nationality_name', 'nation_jersey_number', 'body_type','real_face', 'goalkeeping_speed', 
                'club_contract_valid_until', 'nation_team_id', 'nation_position', 'player_tags', 'player_traits', 'release_clause_eur', 'long_name']

In [132]:

# new_df = df_read.drop(*useless_columns)
new_df = df.drop(*useless_cols)

In [133]:

from pyspark.sql import DataFrame
from pyspark.sql.functions import col, sum as _sum, when

na_counts = new_df.select([_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in new_df.columns])

In [134]:
na_counts.show()

23/11/08 15:24:37 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , long_name, player_positions, overall, potential, value_eur, wage_eur, age, height_cm, weight_kg, club_team_id, league_name, league_level, club_position, club_joined, nationality_id, preferred_foot, weak_foot, skill_moves, international_reputation, work_rate, pace, shooting, passing, dribbling, defending, physic, attacking_crossing, attacking_finishing, attacking_heading_accuracy, attacking_short_passing, attacking_volleys, skill_dribbling, skill_curve, skill_fk_accuracy, skill_long_passing, skill_ball_control, movement_acceleration, movement_sprint_speed, movement_agility, movement_reactions, movement_balance, power_shot_power, power_jumping, power_stamina, power_strength, power_long_shots, mentality_aggression, mentality_interceptions, mentality_positioning, mentality_vision, mentality_penalties, mentality_composure, defending_marking_awareness, defending_standing_tackle, defending_sliding_t

+---+---------+----------------+-------+---------+---------+--------+---+---------+---------+------------+-----------+------------+-------------+-----------+--------------+--------------+---------+-----------+------------------------+---------+-----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+--------------------+---+---+---+---+---+---+---+---+

##### drop value after + or -


In [135]:
from pyspark.sql.functions import split
from pyspark.sql.types import IntegerType

columns1 = ['ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram',
            'lm','lcm','cm','rcm','rm','lwb','ldm', 'cdm','rdm','rwb',
            'lb','lcb','cb','rcb','rb', 'gk']

for col in columns1:
    new_df = new_df.withColumn(col, split(new_df[col], r'\+|-').getItem(0).cast(IntegerType()))

In [136]:
new_df.show(5)

+---+--------------------+----------------+-------+---------+---------+--------+---+---------+---------+------------+--------------------+------------+-------------+-----------+--------------+--------------+---------+-----------+------------------------+-----------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+--------------------+---+---+---

23/11/08 15:24:37 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , long_name, player_positions, overall, potential, value_eur, wage_eur, age, height_cm, weight_kg, club_team_id, league_name, league_level, club_position, club_joined, nationality_id, preferred_foot, weak_foot, skill_moves, international_reputation, work_rate, pace, shooting, passing, dribbling, defending, physic, attacking_crossing, attacking_finishing, attacking_heading_accuracy, attacking_short_passing, attacking_volleys, skill_dribbling, skill_curve, skill_fk_accuracy, skill_long_passing, skill_ball_control, movement_acceleration, movement_sprint_speed, movement_agility, movement_reactions, movement_balance, power_shot_power, power_jumping, power_stamina, power_strength, power_long_shots, mentality_aggression, mentality_interceptions, mentality_positioning, mentality_vision, mentality_penalties, mentality_composure, defending_marking_awareness, defending_standing_tackle, defending_sliding_t

In [137]:
(new_df.count(), len(new_df.columns))

(142079, 90)

### Handling Missing Value

From insights from EDA, we know: 

- Some cols' missing pattern is the same.
  - 'passing', 'pace', 'physic', 'defending','dribbling', 'shooting'
  - 'league_level', 'value_eur', 'club_team_id', 'wage_eur', 'league_name', 'club_position'

Some cols has it's unique pattern
  - mentality_composure
  - gk
  - club_joined

So, for group missing value, we define that it is MAR. For unique missing value, we define that it is MCAR.

- For MAR, we use a linear regreesion to impute numerical value because we find that the missing pattern is the same and there is a high correlation between the missing value and other cols. We use KNN to impute categorical value. 
- For MCAR, we use mean to impute numerical value and use mode to impute categorical value.

#### MCAR

In [138]:
from pyspark.ml.feature import Imputer
# For numerical columns
imputer = Imputer(inputCols=['mentality_composure', 'gk'], 
                  outputCols=["{}_imputed".format(c) for c in ['mentality_composure', 'gk']])
model = imputer.fit(new_df)
new_df = model.transform(new_df)
new_df = new_df.drop('mentality_composure', 'gk')


In [139]:

# For categorical columns, you can calculate the mode and fill in the missing values
from pyspark.sql.functions import when, lit

mode = new_df.groupBy('club_joined').count().orderBy('count', ascending=False).first()[0]
new_df = new_df.withColumn('club_joined', when(df['club_joined'].isNull(), lit(mode)).otherwise(new_df['club_joined']))

In [140]:
new_df = new_df.drop('_c0')

#### MAR

In [141]:
new_df.select(['league_level', 'value_eur', 'club_team_id', 'wage_eur', 'league_name', 'club_position']).show()

+------------+---------+------------+--------+--------------------+-------------+
|league_level|value_eur|club_team_id|wage_eur|         league_name|club_position|
+------------+---------+------------+--------+--------------------+-------------+
|         1.0|    7.8E7|        73.0|320000.0|      French Ligue 1|           RW|
|         1.0|  1.195E8|        21.0|270000.0|German 1. Bundesliga|           ST|
|         1.0|    4.5E7|        11.0|270000.0|English Premier L...|           ST|
|         1.0|   1.29E8|        73.0|270000.0|      French Ligue 1|           LW|
|         1.0|  1.255E8|        10.0|350000.0|English Premier L...|          RCM|
|         1.0|   1.12E8|       240.0|130000.0|Spain Primera Div...|           GK|
|         1.0|   1.94E8|        73.0|230000.0|      French Ligue 1|           ST|
|         1.0|   1.35E7|        21.0| 86000.0|German 1. Bundesliga|           GK|
|         1.0|    9.9E7|       241.0|250000.0|Spain Primera Div...|           GK|
|         1.0|  

In [142]:
from pyspark.ml.feature import StringIndexer
for col in ['club_position', 'league_name']: 
    indexer = StringIndexer(inputCol=col, outputCol=col+"_index")
    new_df = indexer.fit(new_df).transform(new_df)
    new_df = new_df.drop(col)

In [154]:
feature_cols = [ 'potential', 'value_eur', 'wage_eur', 'age', 'height_cm', 'weight_kg', 'club_team_id', 'league_level', 'nationality_id', 'weak_foot', 'skill_moves', 'international_reputation', 'pace', 'shooting', 'passing', 'dribbling', 
                'defending', 'physic', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 
                'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 
                'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'defending_marking_awareness', 'defending_standing_tackle', 'defending_sliding_tackle', 'goalkeeping_diving', 'goalkeeping_handling', 
                'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 
                'year', 'mentality_composure_imputed', 'gk_imputed', 'club_position_index', 'league_name_index', 'Position_GK', 'Position_RW', 'Position_LW', 'Position_ST', 'Position_RB', 'Position_RWB', 'Position_CM', 'Position_LWB', 
                'Position_LM', 'Position_CDM', 'Position_LB', 'Position_CB', 'Position_CF', 'Position_RM', 'Position_CAM', 'work_rate_index', 'preferred_foot_index']

In [161]:
target_col = ['overall']

To impute

In [157]:
from pyspark.sql.functions import col, count, when
new_df = new_df.select(feature_cols)
null_counts = new_df.select([count(when(col(c).isNull(), 1)).alias(c) for c in new_df.columns]).collect()[0].asDict()
missing_cols = [k for k, v in null_counts.items() if v > 0]
non_missing_cols = [k for k, v in null_counts.items() if v == 0]

                                                                                

In [158]:
missing_cols

['value_eur',
 'wage_eur',
 'club_team_id',
 'league_level',
 'pace',
 'shooting',
 'passing',
 'dribbling',
 'defending',
 'physic']

In [159]:
non_missing_cols

['potential',
 'age',
 'height_cm',
 'weight_kg',
 'nationality_id',
 'weak_foot',
 'skill_moves',
 'international_reputation',
 'attacking_crossing',
 'attacking_finishing',
 'attacking_heading_accuracy',
 'attacking_short_passing',
 'attacking_volleys',
 'skill_dribbling',
 'skill_curve',
 'skill_fk_accuracy',
 'skill_long_passing',
 'skill_ball_control',
 'movement_acceleration',
 'movement_sprint_speed',
 'movement_agility',
 'movement_reactions',
 'movement_balance',
 'power_shot_power',
 'power_jumping',
 'power_stamina',
 'power_strength',
 'power_long_shots',
 'mentality_aggression',
 'mentality_interceptions',
 'mentality_positioning',
 'mentality_vision',
 'mentality_penalties',
 'defending_marking_awareness',
 'defending_standing_tackle',
 'defending_sliding_tackle',
 'goalkeeping_diving',
 'goalkeeping_handling',
 'goalkeeping_kicking',
 'goalkeeping_positioning',
 'goalkeeping_reflexes',
 'ls',
 'st',
 'rs',
 'lw',
 'lf',
 'cf',
 'rf',
 'rw',
 'lam',
 'cam',
 'ram',
 'lm',

In [162]:
new_df.show()

+---------+---------+--------+---+---------+---------+------------+------------+--------------+---------+-----------+------------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+--------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---------------------------+----------+-------

Transform and feature engineering non-missing cols

In [163]:
from pyspark.sql.functions import split, when, col, array_contains
import itertools

# Split positions into array
split_positions = split(new_df['player_positions'], ', ')  

# Get distinct positions as a list
distinct_positions = list(set(list(itertools.chain(*new_df.select(split_positions.alias('positions')).distinct().rdd.flatMap(lambda x: x).collect()))))

# Create a column for each distinct position
for position in distinct_positions:
  new_df = new_df.withColumn(
    'Position_' + position,
     when(array_contains(split_positions, position), 1).otherwise(0)
  )




AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `player_positions` cannot be resolved. Did you mean one of the following? [`potential`, `value_eur`, `wage_eur`, `age`, `height_cm`, `weight_kg`, `club_team_id`, `league_level`, `nationality_id`, `weak_foot`, `skill_moves`, `international_reputation`, `pace`, `shooting`, `passing`, `dribbling`, `defending`, `physic`, `attacking_crossing`, `attacking_finishing`, `attacking_heading_accuracy`, `attacking_short_passing`, `attacking_volleys`, `skill_dribbling`, `skill_curve`, `skill_fk_accuracy`, `skill_long_passing`, `skill_ball_control`, `movement_acceleration`, `movement_sprint_speed`, `movement_agility`, `movement_reactions`, `movement_balance`, `power_shot_power`, `power_jumping`, `power_stamina`, `power_strength`, `power_long_shots`, `mentality_aggression`, `mentality_interceptions`, `mentality_positioning`, `mentality_vision`, `mentality_penalties`, `defending_marking_awareness`, `defending_standing_tackle`, `defending_sliding_tackle`, `goalkeeping_diving`, `goalkeeping_handling`, `goalkeeping_kicking`, `goalkeeping_positioning`, `goalkeeping_reflexes`, `ls`, `st`, `rs`, `lw`, `lf`, `cf`, `rf`, `rw`, `lam`, `cam`, `ram`, `lm`, `lcm`, `cm`, `rcm`, `rm`, `lwb`, `ldm`, `cdm`, `rdm`, `rwb`, `lb`, `lcb`, `cb`, `rcb`, `rb`, `year`, `mentality_composure_imputed`, `gk_imputed`, `club_position_index`, `league_name_index`, `Position_GK`, `Position_RW`, `Position_LW`, `Position_ST`, `Position_RB`, `Position_RWB`, `Position_CM`, `Position_LWB`, `Position_LM`, `Position_CDM`, `Position_LB`, `Position_CB`, `Position_CF`, `Position_RM`, `Position_CAM`, `work_rate_index`, `preferred_foot_index`].

In [164]:
new_df = new_df.drop('player_positions')

In [165]:
new_df.show()

+---------+---------+--------+---+---------+---------+------------+------------+--------------+---------+-----------+------------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+--------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---------------------------+----------+-------

Transforme string cols 


In [150]:
_ = ['work_rate', 'preferred_foot']
for column in _: 
    index = StringIndexer(inputCol=column, outputCol=column+"_index")
    new_df = index.fit(new_df).transform(new_df)
    new_df = new_df.drop(column)
    

In [151]:
new_df.show()

+--------------------+-------+---------+---------+--------+---+---------+---------+------------+------------+-----------+--------------+---------+-----------+------------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+--------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+-

In [167]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
for missing_col in missing_cols: 
    train_df = new_df.select(non_missing_cols + [missing_col])
    df_missing = train_df.where(col(missing_col).isNull())
    df_no_missing = train_df.where(col(missing_col).isNotNull())
    assembler = VectorAssembler(inputCols=non_missing_cols, outputCol="features")
    df_no_missing = assembler.transform(df_no_missing)
    lr = LinearRegression(featuresCol = "features", labelCol=missing_col)
    lr_model = lr.fit(df_no_missing)
    df_missing = assembler.transform(df_missing)
    predictions = lr_model.transform(df_missing)
    break

23/11/08 15:56:14 WARN Instrumentation: [f3697cc7] regParam is zero, which might cause numerical instability and overfitting.
23/11/08 15:56:14 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/11/08 15:56:14 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
23/11/08 15:56:14 ERROR Executor: Exception in task 3.0 in stage 1297.0 (TID 4695)
org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (StringIndexerModel$$Lambda$4328/0x0000000801c47000: (string) => double).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:217)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.Buffe

Py4JJavaError: An error occurred while calling o17655.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 1297.0 failed 1 times, most recent failure: Lost task 5.0 in stage 1297.0 (TID 4697) (192.168.4.36 executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (StringIndexerModel$$Lambda$4328/0x0000000801c47000: (string) => double).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:217)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$4(RDD.scala:1234)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$6(RDD.scala:1235)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:853)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:853)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:101)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.apache.spark.SparkException: StringIndexer encountered NULL value. To handle or skip NULLS, try setting StringIndexer.handleInvalid.
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1(StringIndexer.scala:396)
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1$adapted(StringIndexer.scala:391)
	... 36 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2358)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1172)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1166)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$2(RDD.scala:1259)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1226)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1212)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1212)
	at org.apache.spark.ml.optim.WeightedLeastSquares.fit(WeightedLeastSquares.scala:107)
	at org.apache.spark.ml.regression.LinearRegression.trainWithNormal(LinearRegression.scala:456)
	at org.apache.spark.ml.regression.LinearRegression.$anonfun$train$1(LinearRegression.scala:354)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:329)
	at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:186)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:78)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (StringIndexerModel$$Lambda$4328/0x0000000801c47000: (string) => double).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:217)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$4(RDD.scala:1234)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$6(RDD.scala:1235)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:853)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:853)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:101)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
Caused by: org.apache.spark.SparkException: StringIndexer encountered NULL value. To handle or skip NULLS, try setting StringIndexer.handleInvalid.
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1(StringIndexer.scala:396)
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1$adapted(StringIndexer.scala:391)
	... 36 more


In [168]:
df_no_missing.show()

+---------+---+---------+---------+--------------+---------+-----------+------------------------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+--------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---------------------------+----------+-------------------+-----------------+-----------+-----------+-----------+-----------+-----------+---

In [22]:
#Check missing values again
na_counts = new_df.select([_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in new_df.columns])
na_counts.show()

23/11/08 11:12:58 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , overall, potential, age, height_cm, weight_kg, club_joined, nationality_id, weak_foot, skill_moves, international_reputation, attacking_crossing, attacking_finishing, attacking_heading_accuracy, attacking_short_passing, attacking_volleys, skill_dribbling, skill_curve, skill_fk_accuracy, skill_long_passing, skill_ball_control, movement_acceleration, movement_sprint_speed, movement_agility, movement_reactions, movement_balance, power_shot_power, power_jumping, power_stamina, power_strength, power_long_shots, mentality_aggression, mentality_interceptions, mentality_positioning, mentality_vision, mentality_penalties, mentality_composure, defending_marking_awareness, defending_standing_tackle, defending_sliding_tackle, goalkeeping_diving, goalkeeping_handling, goalkeeping_kicking, goalkeeping_positioning, goalkeeping_reflexes, ls, st, rs, lw, lf, cf, rf, rw, lam, cam, ram, lm, lcm, cm, rcm, rm, lw

+---+---------+-------+---------+---------+--------+---+---------+---------+------------+-----------+------------+-------------+-----------+--------------+--------------+---------+-----------+------------------------+---------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+--------------------+---+---+---+---+---+---+---+---+---+---+---+---+--

                                                                                

In [23]:
# We are gonna preprocess the preffered_foot using one-hot encoder
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
indexer = StringIndexer(inputCol='preferred_foot', outputCol='indexed_preferred_foot')
encoder = OneHotEncoder(inputCols=['indexed_preferred_foot'], outputCols=['preferred_foot_encoded'])

pipeline = Pipeline(stages=[indexer,encoder])

model = pipeline.fit(new_df)

col_to_drop = ['indexed_preferred_foot','preferred_foot']
data_encoded = model.transform(new_df).drop(*col_to_drop)

In [None]:
# use label encoder for work_rate and player_positions label_encoder
from pyspark.ml.feature import StringIndexer

# Loop over each string column in the DataFrame
for col_name, data_type in data_encoded.dtypes:
    if data_type == 'string':
        # Create a StringIndexer object and fit it to the column
        indexer = StringIndexer(inputCol=col_name, outputCol=col_name + '_indexed')
        model = indexer.fit(data_encoded)
        
        # Transform the column using the fitted indexer
        data_encoded = model.transform(data_encoded).drop(col_name)

In [None]:
data_encoded.show(5)

In [None]:
data_encoded.printSchema()