In [39]:
import os
import spark
import requests
from os.path import exists
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, max as spmax, mean
from pyspark.sql.types import FloatType, IntegerType

In [2]:
# Location of the per-team Euro 2012 statistics CSV (pandas_exercises repository).
url = (
    'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/'
    '02_Filtering_%26_Sorting/Euro12/Euro_2012_stats_TEAM.csv'
)

# Create the Spark session for this notebook, or reuse one that is already running.
# Note: this rebinds the name `spark`, which the import cell also binds.
spark = (
    SparkSession.builder
    .appName('euro_2012')
    .getOrCreate()
)

In [3]:
# Fetch the CSV over HTTP. requests applies no timeout unless one is given, so an
# unresponsive server would otherwise block this cell forever; 30 seconds bounds
# both the connect and the read wait.
response = requests.get(url, timeout=30)

In [6]:
# Save the downloaded CSV in the working directory, named after the last segment
# of the URL. `filename` is bound unconditionally so the cells that read the file
# still work when the download is skipped or fails but a copy is already on disk.
filename = url.rsplit('/', 1)[-1]

if not exists(filename):
    if response.status_code == 200:
        # The context manager guarantees the handle is flushed and closed.
        # write() returns the number of bytes written, so zero bytes written
        # is reported as a failed download.
        with open(filename, 'wb') as csv_file:
            bytes_written = csv_file.write(response.content)
        print('Download succed' if bytes_written else 'Download failed')
    else:
        # Previously a non-200 response was silently ignored.
        print('Download failed')

Download succed


In [13]:
# Load the CSV, taking column names from its first line. No schema is supplied
# or inferred here.
euro12 = spark.read.option('header', True).csv(filename)

In [15]:
# Preview the first four rows, restricted to the first six columns.
euro12.select(*euro12.columns[:6]).show(n=4)

+--------------+-----+---------------+----------------+-----------------+----------------+
|          Team|Goals|Shots on target|Shots off target|Shooting Accuracy|% Goals-to-shots|
+--------------+-----+---------------+----------------+-----------------+----------------+
|       Croatia|    4|             13|              12|            51.9%|           16.0%|
|Czech Republic|    4|             13|              18|            41.9%|           12.9%|
|       Denmark|    4|             10|              10|            50.0%|           20.0%|
|       England|    5|             11|              18|            50.0%|           17.2%|
+--------------+-----+---------------+----------------+-----------------+----------------+
only showing top 4 rows



In [17]:
# Display the list of all column names in the loaded DataFrame.
euro12.columns

['Team',
 'Goals',
 'Shots on target',
 'Shots off target',
 'Shooting Accuracy',
 '% Goals-to-shots',
 'Total shots (inc. Blocked)',
 'Hit Woodwork',
 'Penalty goals',
 'Penalties not scored',
 'Headed goals',
 'Passes',
 'Passes completed',
 'Passing Accuracy',
 'Touches',
 'Crosses',
 'Dribbles',
 'Corners Taken',
 'Tackles',
 'Clearances',
 'Interceptions',
 'Clearances off line',
 'Clean Sheets',
 'Blocks',
 'Goals conceded',
 'Saves made',
 'Saves-to-shots ratio',
 'Fouls Won',
 'Fouls Conceded',
 'Offsides',
 'Yellow Cards',
 'Red Cards',
 'Subs on',
 'Subs off',
 'Players Used']

In [19]:
# Show only the Goals column (show() prints up to 20 rows by default).
euro12.select(euro12['Goals']).show()

+-----+
|Goals|
+-----+
|    4|
|    4|
|    4|
|    5|
|    3|
|   10|
|    5|
|    6|
|    2|
|    2|
|    6|
|    1|
|    5|
|   12|
|    5|
|    2|
+-----+



In [21]:
# Count how many different team names appear in the data.
euro12.select('Team').dropDuplicates().count()

16

In [22]:
# Keep only the team name and its card counts in a separate DataFrame.
discipline = euro12.select(["Team", "Yellow Cards", "Red Cards"])

In [34]:
# Replace the Yellow Cards column with an integer-typed version of itself.
discipline = discipline.withColumn("Yellow Cards", discipline["Yellow Cards"].cast("int"))

In [36]:
# Replace the Red Cards column with an integer-typed version of itself.
discipline = discipline.withColumn("Red Cards", discipline["Red Cards"].cast("int"))

In [35]:
# List the teams in ascending order of yellow cards received.
discipline.orderBy("Yellow Cards").show()

+-------------------+------------+---------+
|               Team|Yellow Cards|Red Cards|
+-------------------+------------+---------+
|            Denmark|           4|        0|
|            Germany|           4|        0|
|        Netherlands|           5|        0|
|            Ukraine|           5|        0|
|            England|           5|        0|
|             Russia|           6|        0|
|             France|           6|        0|
|Republic of Ireland|           6|        1|
|             Poland|           7|        1|
|             Sweden|           7|        0|
|     Czech Republic|           7|        0|
|            Croatia|           9|        0|
|             Greece|           9|        1|
|              Spain|          11|        0|
|           Portugal|          12|        0|
|              Italy|          16|        0|
+-------------------+------------+---------+



In [37]:
# List the teams in ascending order of red cards received.
discipline.orderBy("Red Cards").show()

+-------------------+------------+---------+
|               Team|Yellow Cards|Red Cards|
+-------------------+------------+---------+
|     Czech Republic|           7|        0|
|             France|           6|        0|
|            Germany|           4|        0|
|            Croatia|           9|        0|
|            England|           5|        0|
|           Portugal|          12|        0|
|             Russia|           6|        0|
|              Spain|          11|        0|
|             Sweden|           7|        0|
|            Ukraine|           5|        0|
|              Italy|          16|        0|
|            Denmark|           4|        0|
|        Netherlands|           5|        0|
|             Greece|           9|        1|
|Republic of Ireland|           6|        1|
|             Poland|           7|        1|
+-------------------+------------+---------+



In [24]:
# Print the column names and types of the discipline DataFrame.
# NOTE(review): the recorded output reports both card columns as string, and this
# cell's execution count (24) is lower than the cast cells' (34, 36) - it looks
# like the output predates the integer casts; re-run to confirm.
discipline.printSchema()

root
 |-- Team: string (nullable = true)
 |-- Yellow Cards: string (nullable = true)
 |-- Red Cards: string (nullable = true)



In [40]:
# Average number of yellow cards per team, as a one-row aggregate.
discipline.agg(mean('Yellow Cards')).show()

+-----------------+
|avg(Yellow Cards)|
+-----------------+
|           7.4375|
+-----------------+



In [41]:
# Average number of red cards per team, as a one-row aggregate.
discipline.agg(mean('Red Cards')).show()

+--------------+
|avg(Red Cards)|
+--------------+
|        0.1875|
+--------------+



In [45]:
# Teams that scored more than six goals.
# The CSV is read without a schema, so Goals is a string column; cast it
# explicitly instead of relying on Spark's implicit string-to-number coercion
# when comparing against an integer literal.
euro12.filter(euro12['Goals'].cast('int') > 6).select('Team', 'Goals').show()

+-------+-----+
|   Team|Goals|
+-------+-----+
|Germany|   10|
|  Spain|   12|
+-------+-----+



In [48]:
# Team names that begin with the letter G.
euro12.select('Team').where(col('Team').startswith('G')).show()

+-------+
|   Team|
+-------+
|Germany|
| Greece|
+-------+



In [58]:
# Display the column names again, as a reference for the positional slice
# taken in the next cell.
euro12.columns

['Team',
 'Goals',
 'Shots on target',
 'Shots off target',
 'Shooting Accuracy',
 '% Goals-to-shots',
 'Total shots (inc. Blocked)',
 'Hit Woodwork',
 'Penalty goals',
 'Penalties not scored',
 'Headed goals',
 'Passes',
 'Passes completed',
 'Passing Accuracy',
 'Touches',
 'Crosses',
 'Dribbles',
 'Corners Taken',
 'Tackles',
 'Clearances',
 'Interceptions',
 'Clearances off line',
 'Clean Sheets',
 'Blocks',
 'Goals conceded',
 'Saves made',
 'Saves-to-shots ratio',
 'Fouls Won',
 'Fouls Conceded',
 'Offsides',
 'Yellow Cards',
 'Red Cards',
 'Subs on',
 'Subs off',
 'Players Used']

In [66]:
# n is the position where the last three columns begin; it is used as the
# exclusive upper bound of the slice.
n = len(euro12.columns) - 3
# Project the columns from position 7 up to, but excluding, the last three.
euro12.select(*euro12.columns[7:n])

DataFrame[Hit Woodwork: string, Penalty goals: string, Penalties not scored: string, Headed goals: string, Passes: string, Passes completed: string, Passing Accuracy: string, Touches: string, Crosses: string, Dribbles: string, Corners Taken: string, Tackles: string, Clearances: string, Interceptions: string, Clearances off line: string, Clean Sheets: string, Blocks: string, Goals conceded: string, Saves made: string, Saves-to-shots ratio: string, Fouls Won: string, Fouls Conceded: string, Offsides: string, Yellow Cards: string, Red Cards: string]

In [67]:
# Shooting accuracy for England, Italy and Russia only.
(
    euro12
    .where(col("Team").isin("England", "Italy", "Russia"))
    .select("Team", "Shooting Accuracy")
    .show()
)

+-------+-----------------+
|   Team|Shooting Accuracy|
+-------+-----------------+
|England|            50.0%|
|  Italy|            43.0%|
| Russia|            22.5%|
+-------+-----------------+

