<a href="https://colab.research.google.com/github/Codilis/Pyspark-Projects/blob/master/FIFA2017PlayersData/FIFA_2017_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data Source : https://www.kaggle.com/datasets/artimous/complete-fifa-2017-player-dataset-global/data
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=ac120cdd7782be260eaf71da962f8a2141f0dad6e88266b2ea0b01b20eb37462
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from collections import defaultdict

In [3]:
# Obtain the SparkSession used throughout the notebook as `spark`
# (getOrCreate reuses an already-running session if there is one).
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
# Location of the players CSV in the project's GitHub repository.
github_url = "https://raw.githubusercontent.com/Codilis/Pyspark-Projects/master/FIFA2017PlayersData/players_data.csv"

# Load the CSV with pandas and preview the first five rows.
pd_df = pd.read_csv(github_url)
pd_df.head(5)

Unnamed: 0,Name,Nationality,National_Position,National_Kit,Club,Club_Position,Club_Kit,Club_Joining,Contract_Expiry,Rating,...,Long_Shots,Curve,Freekick_Accuracy,Penalties,Volleys,GK_Positioning,GK_Diving,GK_Kicking,GK_Handling,GK_Reflexes
0,Cristiano Ronaldo,Portugal,LS,7.0,Real Madrid,LW,7.0,07/01/2009,2021.0,94,...,90,81,76,85,88,14,7,15,11,11
1,Lionel Messi,Argentina,RW,10.0,FC Barcelona,RW,10.0,07/01/2004,2018.0,93,...,88,89,90,74,85,14,6,15,11,8
2,Neymar,Brazil,LW,10.0,FC Barcelona,LW,11.0,07/01/2013,2021.0,92,...,77,79,84,81,83,15,9,15,9,11
3,Luis Suárez,Uruguay,LS,9.0,FC Barcelona,ST,9.0,07/11/2014,2021.0,92,...,86,86,84,85,88,33,27,31,25,37
4,Manuel Neuer,Germany,GK,1.0,FC Bayern,GK,1.0,07/01/2011,2021.0,92,...,16,14,11,47,11,91,89,95,90,89


In [35]:
# Turn the pandas frame into a Spark DataFrame; this raw copy is kept
# untouched so the preprocessing cell can always start from it.
spark_df_unprocessed = spark.createDataFrame(data=pd_df)

# Preview five rows of the raw Spark DataFrame.
(
    spark_df_unprocessed
    .limit(5)
    .show()
)

+-----------------+-----------+-----------------+------------+------------+-------------+--------+------------+---------------+------+------+------+--------------+----------+---+------------------+---------------+---------+-----------+------------+---------+-------+--------------+---------------+----------+---------+------------------+-------------+------+---------+--------+----------+---------+------------+-----+-------+--------+-------+-------+-------+-------+----------+---------+----------+-----+-----------------+---------+-------+--------------+---------+----------+-----------+-----------+
|             Name|Nationality|National_Position|National_Kit|        Club|Club_Position|Club_Kit|Club_Joining|Contract_Expiry|Rating|Height|Weight|Preffered_Foot|Birth_Date|Age|Preffered_Position|      Work_Rate|Weak_foot|Skill_Moves|Ball_Control|Dribbling|Marking|Sliding_Tackle|Standing_Tackle|Aggression|Reactions|Attacking_Position|Interceptions|Vision|Composure|Crossing|Short_Pass|Long_Pass|A

## Preprocessing

In [37]:
# Target type for the numeric Height / Weight columns.
schema = IntegerType()

# Text columns that get trimmed and lower-cased below.
col_list = ['Name', 'Nationality', 'National_Position', 'Club', 'Club_Position', 'Preffered_Foot', 'Preffered_Position', 'Work_Rate']

# Always start from the raw frame so re-running this cell gives the same result.
spark_df = (
    spark_df_unprocessed
    # Strip the unit characters from Height / Weight, then cast to integer.
    .withColumn('Height', F.translate('Height', ' cm', '').cast(schema))
    .withColumn('Weight', F.translate('Weight', ' kg', '').cast(schema))
    # Parse the date strings using the MM/dd/yyyy pattern.
    .withColumn('Birth_Date', F.to_date('Birth_Date', 'MM/dd/yyyy'))
    .withColumn('Club_Joining', F.to_date('Club_Joining', 'MM/dd/yyyy'))
)

# Normalise every text column: trim surrounding whitespace, then lower-case.
for text_col in col_list:
    spark_df = spark_df.withColumn(text_col, F.lower(F.trim(text_col)))

spark_df.limit(5).show()

+-----------------+-----------+-----------------+------------+------------+-------------+--------+------------+---------------+------+------+------+--------------+----------+---+------------------+---------------+---------+-----------+------------+---------+-------+--------------+---------------+----------+---------+------------------+-------------+------+---------+--------+----------+---------+------------+-----+-------+--------+-------+-------+-------+-------+----------+---------+----------+-----+-----------------+---------+-------+--------------+---------+----------+-----------+-----------+
|             Name|Nationality|National_Position|National_Kit|        Club|Club_Position|Club_Kit|Club_Joining|Contract_Expiry|Rating|Height|Weight|Preffered_Foot|Birth_Date|Age|Preffered_Position|      Work_Rate|Weak_foot|Skill_Moves|Ball_Control|Dribbling|Marking|Sliding_Tackle|Standing_Tackle|Aggression|Reactions|Attacking_Position|Interceptions|Vision|Composure|Crossing|Short_Pass|Long_Pass|A

In [38]:
# Distinct-value counts for a handful of columns, returned as one tuple
# in the order: Nationality, National_Position, Work_Rate, Club, Rating.
tuple(
    spark_df.select(F.col(column_name)).distinct().count()
    for column_name in ('Nationality', 'National_Position', 'Work_Rate', 'Club', 'Rating')
)

(160, 28, 9, 634, 49)

## Data Analysis

In [40]:
# Number of players per nationality, largest groups first.
player_count_by_national = (
    spark_df
    .groupBy(F.col('Nationality'))
    .count()
    .orderBy(F.col('count').desc())
)

# Number of players per club, largest groups first.
player_count_by_club = (
    spark_df
    .groupBy(F.col('Club'))
    .count()
    .orderBy(F.col('count').desc())
)

# Only the per-club table is displayed in this cell.
player_count_by_club.show()

+---------------+-----+
|           Club|count|
+---------------+-----+
|    free agents|  232|
|   hamburger sv|   33|
| leicester city|   33|
|manchester city|   33|
|    southampton|   33|
|olym. marseille|   33|
|      liverpool|   33|
|        watford|   33|
| crystal palace|   33|
|    real madrid|   33|
|        fc metz|   33|
|  werder bremen|   33|
|       west ham|   33|
|          spurs|   33|
| sporting gijón|   33|
|    fc augsburg|   33|
|    bournemouth|   33|
|  ud las palmas|   33|
|       ogc nice|   33|
|     sunderland|   33|
+---------------+-----+
only showing top 20 rows



In [None]:
# Number of distinct nationalities in the preprocessed data.
# (The statement was previously duplicated, and a trailing comma on each line
# wrapped the result in a 1-tuple, so the Spark job ran twice and the cell
# evaluated to a tuple instead of a plain integer.)
spark_df.select(F.col('Nationality')).distinct().count()

160