In [1]:
!pip install tqdm

[0m

In [2]:
!hadoop fs -rm -r /fifadata

Deleted /fifadata


In [3]:
!hadoop fs -mkdir /fifadata
!hadoop fs -put *.csv /fifadata

In [4]:
import os 
#import pandas as pd
from tqdm import tqdm
import numpy as np

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext
from pyspark.sql.functions import lit, monotonically_increasing_id

In [5]:
!hadoop fs -put postgresql-42.6.0.jar /

put: `/postgresql-42.6.0.jar': File exists


In [6]:
# init a spark session
appName = "Task1&2"
master = "local"

jdbc_driver_path = 'postgresql-42.6.0.jar'

sc = SparkSession.builder.appName(appName).config("spark.jars", jdbc_driver_path).getOrCreate()
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/16 19:31:16 INFO SparkEnv: Registering MapOutputTracker
23/11/16 19:31:16 INFO SparkEnv: Registering BlockManagerMaster
23/11/16 19:31:16 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
23/11/16 19:31:16 INFO SparkEnv: Registering OutputCommitCoordinator


In [7]:
# merge csvs and read data
# fifa data folder should contain all the csv files from Fifa(Kaggle), 2015-2022
# assume that you are working in the same directory as the data folder


# HDFS path to 'fifadata' folder
hdfs_data_path = "hdfs:///fifadata"  # HDFS path

# Read CSV files from HDFS
csv_files = spark.sparkContext.wholeTextFiles(hdfs_data_path + "/*.csv").keys().collect()
print(csv_files)

combined_df = None

for file in tqdm(csv_files):
    year = file.split("players_")[1].split(".csv")[0]
    df = spark.read.csv(file, header=True, inferSchema=True)
    df = df.withColumn("year", lit(int(year)))  # Add 'year' column
    if combined_df is None:
        combined_df = df
    else:
        combined_df = combined_df.union(df)

# Add a unique ID column
combined_df = combined_df.withColumn("id", monotonically_increasing_id())



                                                                                

['hdfs://cluster-14ef-m/fifadata/players_15.csv', 'hdfs://cluster-14ef-m/fifadata/players_16.csv', 'hdfs://cluster-14ef-m/fifadata/players_17.csv', 'hdfs://cluster-14ef-m/fifadata/players_18.csv', 'hdfs://cluster-14ef-m/fifadata/players_19.csv', 'hdfs://cluster-14ef-m/fifadata/players_20.csv', 'hdfs://cluster-14ef-m/fifadata/players_21.csv', 'hdfs://cluster-14ef-m/fifadata/players_22.csv']


100%|██████████| 8/8 [00:19<00:00,  2.38s/it]                                   


In [8]:
combined_df.show(5)

23/11/16 19:31:53 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+---------+--------------------+-----------------+--------------------+----------------+-------+---------+---------+--------+---+-------------------+---------+---------+------------+-------------------+--------------------+------------+-------------+------------------+----------------+-------------------+-------------------------+--------------+----------------+--------------+---------------+--------------------+--------------+---------+-----------+------------------------+-------------+----------------+---------+------------------+--------------------+--------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+-----

## Using pyspark to read table and write to PostgreSQL

In [15]:
db_properties={}
db_properties['username']="postgres"
db_properties['password']="061128"
db_properties['url']= "jdbc:postgresql://34.28.109.113:5432/postgres"
db_properties['table']="fifa"
db_properties['driver']="org.postgresql.Driver"

In [16]:
df.write.format("jdbc")\
.mode("overwrite")\
.option("url", db_properties['url'])\
.option("dbtable", db_properties['table'])\
.option("user", db_properties['username'])\
.option("password", db_properties['password'])\
.option("driver", db_properties['driver'])\
.save()

                                                                                

In [17]:
df_read = sqlContext.read.format("jdbc")\
    .option("url", db_properties['url'])\
    .option("dbtable", db_properties['table'])\
    .option("user", db_properties['username'])\
    .option("password", db_properties['password'])\
    .option("Driver", db_properties['driver'])\
    .load()
    
df_read.show()

                                                                                

+---------+--------------------+--------------+--------------------+----------------+-------+---------+---------+--------+---+-------------------+---------+---------+------------+--------------------+--------------------+------------+-------------+------------------+----------------+-------------------+-------------------------+--------------+-------------------+--------------+---------------+--------------------+--------------+---------+-----------+------------------------+-------------+----------------+---------+------------------+-----------+--------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+-------------

## Task II 
### Question 1
Find X clubs that have the highest number of players with contracts ending in 2023

We find that En Avant de Guingamp have the most players whose contract end in 2023 in FIFA 2022 dataset.

In [18]:
df_read.createOrReplaceTempView("df_view")

In [19]:
sqlWay1 = spark.sql("""
SELECT dv.club_name, COUNT(*) AS player_count
FROM   df_view dv
WHERE  dv.year = 22 AND dv.club_contract_valid_until = 2023
GROUP BY dv.club_name
ORDER BY player_count DESC
LIMIT 10;
                    """)

In [20]:
sqlWay1.show()

+--------------------+------------+
|           club_name|player_count|
+--------------------+------------+
|En Avant de Guingamp|          19|
| Club Atlético Lanús|          17|
|       Lechia Gdańsk|          17|
|            Barnsley|          16|
|        Kasimpaşa SK|          16|
|        Bengaluru FC|          16|
|        FC Barcelona|          15|
|  SV Wehen Wiesbaden|          15|
|          CA Osasuna|          15|
|      Zagłębie Lubin|          15|
+--------------------+------------+



#### In original pyspark

In [21]:
from pyspark.sql import functions as F

def find_top_clubs_expiring_contracts(df, num_clubs):

  top_clubs = (df.filter((F.col('year') == '22') & 
                        (F.col('club_contract_valid_until') == 2023))
               .groupBy('club_team_id', 'club_name')
               .agg(F.count('sofifa_id').alias('num_players'))
               .orderBy(F.desc('num_players'))
               .limit(num_clubs))
               
  top_names = [row.club_name for row in top_clubs.collect()]
  
  return top_clubs, top_names

In [22]:
clubs, names = find_top_clubs_expiring_contracts(combined_df, 10)

                                                                                

In [23]:
print(f"{names} Club with highest number of players with contracts ending in 2023 in FIFA 2022")

['En Avant de Guingamp', 'Lechia Gdańsk', 'Club Atlético Lanús', 'Kasimpaşa SK', 'Bengaluru FC', 'Barnsley', 'FC Barcelona', 'Zagłębie Lubin', 'SV Wehen Wiesbaden', 'KAA Gent'] Club with highest number of players with contracts ending in 2023 in FIFA 2022


In [24]:
clubs.show()

+------------+--------------------+-----------+
|club_team_id|           club_name|num_players|
+------------+--------------------+-----------+
|        62.0|En Avant de Guingamp|         19|
|    110395.0| Club Atlético Lanús|         17|
|    111091.0|       Lechia Gdańsk|         17|
|    111339.0|        Kasimpaşa SK|         16|
|      1932.0|            Barnsley|         16|
|    113302.0|        Bengaluru FC|         16|
|       241.0|        FC Barcelona|         15|
|    110749.0|      Zagłębie Lubin|         15|
|       492.0|  SV Wehen Wiesbaden|         15|
|       674.0|            KAA Gent|         15|
+------------+--------------------+-----------+



### Question 2


#### In original pyspark

In [25]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def find_top_clubs_by_avg_older_players(df, num_clubs):

  window = Window.orderBy(F.desc('avg_count'))
  df = df.repartition(100)
  top_clubs = (df.filter(F.col('age') > 27)
               .groupBy('year', 'club_team_id', 'club_name')
               .agg(F.count('sofifa_id').alias('count'))
               .groupBy('club_team_id', 'club_name')
               .agg(F.avg('count').alias('avg_count'))
               .withColumn('rank', F.rank().over(window))
               .filter(F.col('rank') <= num_clubs)
               .select('club_team_id', 'club_name'))
               
  top_names = [row.club_name for row in top_clubs.collect()]



  return top_clubs, top_names

In [26]:
clubs, names = find_top_clubs_by_avg_older_players(combined_df, 10)

23/11/16 20:20:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/16 20:20:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/16 20:20:02 WARN WindowLocalExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/16 20:20:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/16 20:20:05 WARN WindowLocalExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/16 20:20:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [27]:
clubs.show()

23/11/16 20:20:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/16 20:20:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/16 20:20:13 WARN WindowLocalExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/16 20:20:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/16 20:20:18 WARN WindowLocalExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/16 20:20:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------------+--------------------+
|club_team_id|           club_name|
+------------+--------------------+
|        null|                null|
|    113158.0| Matsumoto Yamaga FC|
|    111032.0|  Dorados de Sinaloa|
|    110955.0| Shanghai Shenhua FC|
|    112961.0|          Qingdao FC|
|    110974.0|Club Deportivo Jo...|
|    114688.0|         Guaireña FC|
|    101006.0|            Altay SK|
|    101014.0|İstanbul Başakşeh...|
|       749.0|      BB Erzurumspor|
|    101108.0|        Club Olimpia|
|    114511.0|      Sport Huancayo|
+------------+--------------------+



23/11/16 20:20:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/16 20:20:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [28]:
print(f"{names} clubs with highest avg players over 27:")

[None, 'Dorados de Sinaloa', 'Matsumoto Yamaga FC', 'Shanghai Shenhua FC', 'Qingdao FC', 'Club Deportivo Jorge Wilstermann', 'Altay SK', 'Guaireña FC', 'İstanbul Başakşehir FK', 'Sport Huancayo', 'BB Erzurumspor', 'Club Olimpia'] clubs with highest avg players over 27:


### Question 3

In [29]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def most_frequent_nation_position(df):

  window = Window.partitionBy('year').orderBy(F.desc('count'))

  top_positions = (df.filter(F.col('nation_position').isNotNull())
                  .groupBy('year', 'nation_position')
                  .agg(F.count('sofifa_id').alias('count'))
                  .withColumn('rank', F.rank().over(window))
                  .filter(F.col('rank') == 1)
                  .select('year', 'nation_position'))

  return top_positions

In [30]:
most_frequent_nation_position(combined_df).show()



+----+---------------+
|year|nation_position|
+----+---------------+
|  15|            SUB|
|  16|            SUB|
|  17|            SUB|
|  18|            SUB|
|  19|            SUB|
|  20|            SUB|
|  21|            SUB|
|  22|            SUB|
+----+---------------+



                                                                                