# Instalar librerías

In [1]:
# Instalar librerias
!pip install pyspark
!pip install -q kaggle



# Importar librerías

In [7]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import zipfile
import os
import pandas as pd

## Creación de una sesión con Spark

In [8]:
#Iniciar conexión de spark
spark = SparkSession.builder\
        .master("local")\
        .appName("Colabpractica")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

**Preparacion del ambiete y los datos**

In [4]:
# Creamos la carpeta oculta en ambiente de linux sobre colab
!mkdir ~/.kaggle

In [8]:
# Copiar el archivo JSON  a la carpeta oculta que creamos
!cp kaggle.json ~/.kaggle/

cp: cannot stat 'kaggle.json': No such file or directory


In [9]:
# Cambiamos los permisos para permitir lectura de las credenciales
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Descargamos archivos indicando el usuario del propietario de los datos en kaggle y el nombre de dataset
! kaggle datasets download arevel/chess-games --force

Dataset URL: https://www.kaggle.com/datasets/arevel/chess-games
License(s): CC0-1.0
Downloading chess-games.zip to /content
 99% 1.43G/1.45G [00:14<00:00, 203MB/s]
100% 1.45G/1.45G [00:14<00:00, 105MB/s]


In [9]:
# Descomprimir el archivo
for file in os.listdir():
    if file.endswith('.zip'):
      zip_ref = zipfile.ZipFile(file, 'r')
      zip_ref.extractall()
      zip_ref.close()

# **Lectura de datos**

# Lectura de datos con Pandas

In [12]:
df_pandas =pd.read_csv('/content/chess_games.csv')

# Lectura datos con PySpark

In [None]:
df = spark.read.csv('/content/chess_games.csv', header=True)

### **Procesamiento de datos**

### Número de filas y columnas

En pandas usabamos este método:

df.shape

In [9]:
print((df.count(), len (df.columns)))

(6256184, 15)


Si queremos ver una muestra de los datos con pandas usamos la función:

df.head()

Para ver la misma muestra en Spark usemos la función:

In [15]:
df.show(10)

+------------------+---------------+---------------+------+----------+--------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+--------------------+
|             Event|          White|          Black|Result|   UTCDate| UTCTime|WhiteElo|BlackElo|WhiteRatingDiff|BlackRatingDiff|ECO|             Opening|TimeControl| Termination|                  AN|
+------------------+---------------+---------------+------+----------+--------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+--------------------+
|        Classical |        eisaaaa|       HAMID449|   1-0|2016.06.30|22:00:01|    1901|    1896|           11.0|          -11.0|D10|        Slav Defense|      300+5|Time forfeit|1. d4 d5 2. c4 c6...|
|            Blitz |         go4jas|     Sergei1973|   0-1|2016.06.30|22:00:01|    1641|    1627|          -11.0|           12.0|C20|King's Pawn Openi...|      300+0|      Normal|1. e4 e5 2. b3 Nf

##### El Esquema de los datos lo vemos en pandas asi:

df.info()

Ahora si lo vemos en spark hacemos lo siguiente:

In [11]:
df.printSchema()

root
 |-- Event: string (nullable = true)
 |-- White: string (nullable = true)
 |-- Black: string (nullable = true)
 |-- Result: string (nullable = true)
 |-- UTCDate: string (nullable = true)
 |-- UTCTime: string (nullable = true)
 |-- WhiteElo: string (nullable = true)
 |-- BlackElo: string (nullable = true)
 |-- WhiteRatingDiff: string (nullable = true)
 |-- BlackRatingDiff: string (nullable = true)
 |-- ECO: string (nullable = true)
 |-- Opening: string (nullable = true)
 |-- TimeControl: string (nullable = true)
 |-- Termination: string (nullable = true)
 |-- AN: string (nullable = true)



Estadística descriptiva
pandas:

df.describe()

In [12]:
df.describe().show()

+-------+--------------------+--------------------+--------------------+-------+----------+--------+------------------+-----------------+------------------+--------------------+-------+--------------------+-----------+------------+--------------------+
|summary|               Event|               White|               Black| Result|   UTCDate| UTCTime|          WhiteElo|         BlackElo|   WhiteRatingDiff|     BlackRatingDiff|    ECO|             Opening|TimeControl| Termination|                  AN|
+-------+--------------------+--------------------+--------------------+-------+----------+--------+------------------+-----------------+------------------+--------------------+-------+--------------------+-----------+------------+--------------------+
|  count|             6256184|             6256184|             6256184|6256184|   6256184| 6256184|           6256184|          6256184|           6251516|             6251516|6256184|             6256184|    6256184|     6256184|          

**Filtrar el dataset**

In [21]:
df.filter(df.Event == "Classical ").show()

+----------+---------------+----------------+------+----------+--------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+--------------------+
|     Event|          White|           Black|Result|   UTCDate| UTCTime|WhiteElo|BlackElo|WhiteRatingDiff|BlackRatingDiff|ECO|             Opening|TimeControl| Termination|                  AN|
+----------+---------------+----------------+------+----------+--------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+--------------------+
|Classical |          Rokki|     solibalsara|   0-1|2016.07.02|23:16:20|    1868|    2123|           -4.0|            5.0|D08|Queen's Gambit Re...|     780+12|Time forfeit|1. d4 d5 2. c4 e5...|
|Classical |  DieAffeninsel|          AtitDJ|   0-1|2016.07.05|11:40:16|    1415|    1610|           -5.0|            6.0|A07|King's Indian Attack|     1200+0|      Normal|1. Nf3 d5 2. g3 N...|
|Classical |         Firhad|  

##### **`Ordenar el dataset`**

In [13]:
df.sort(F.col("UTCDate").desc()).show(10)

+--------------------+-------------+-------------------+------+----------+--------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+--------------------+
|               Event|        White|              Black|Result|   UTCDate| UTCTime|WhiteElo|BlackElo|WhiteRatingDiff|BlackRatingDiff|ECO|             Opening|TimeControl| Termination|                  AN|
+--------------------+-------------+-------------------+------+----------+--------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+--------------------+
| Classical tourna...|   aeapmestre|Carrascodamadrugada|   1-0|2016.07.31|00:00:03|    2108|    2067|           11.0|          -26.0|A01|Nimzo-Larsen Atta...|      600+0|      Normal|1. b3 d5 2. Bb2 N...|
|              Blitz |        jfa41|              NoJob|   0-1|2016.07.31|12:02:13|    1738|    1894|           -7.0|            7.0|D94|Gruenfeld Defense...|      300+3|      Norm

**Eliminar una columna:**

In [16]:
df = df.drop("AN")
df.show(10)

+------------------+---------------+---------------+------+----------+--------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+
|             Event|          White|          Black|Result|   UTCDate| UTCTime|WhiteElo|BlackElo|WhiteRatingDiff|BlackRatingDiff|ECO|             Opening|TimeControl| Termination|
+------------------+---------------+---------------+------+----------+--------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+
|        Classical |        eisaaaa|       HAMID449|   1-0|2016.06.30|22:00:01|    1901|    1896|           11.0|          -11.0|D10|        Slav Defense|      300+5|Time forfeit|
|            Blitz |         go4jas|     Sergei1973|   0-1|2016.06.30|22:00:01|    1641|    1627|          -11.0|           12.0|C20|King's Pawn Openi...|      300+0|      Normal|
| Blitz tournament |Evangelistaizac|         kafune|   1-0|2016.06.30|22:00:02|    1647|    1688|   

**Eliminemos los datos nulos:**

In [17]:
df = df.na.drop()
df.show()

+------------------+---------------+---------------+------+----------+--------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+
|             Event|          White|          Black|Result|   UTCDate| UTCTime|WhiteElo|BlackElo|WhiteRatingDiff|BlackRatingDiff|ECO|             Opening|TimeControl| Termination|
+------------------+---------------+---------------+------+----------+--------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+
|        Classical |        eisaaaa|       HAMID449|   1-0|2016.06.30|22:00:01|    1901|    1896|           11.0|          -11.0|D10|        Slav Defense|      300+5|Time forfeit|
|            Blitz |         go4jas|     Sergei1973|   0-1|2016.06.30|22:00:01|    1641|    1627|          -11.0|           12.0|C20|King's Pawn Openi...|      300+0|      Normal|
| Blitz tournament |Evangelistaizac|         kafune|   1-0|2016.06.30|22:00:02|    1647|    1688|   

**Agrupar los datos:**

In [18]:
df_group = df.groupBy('Event').count()
df_group.show()

+--------------------+-------+
|               Event|  count|
+--------------------+-------+
|          Classical |1509068|
|  Bullet tournament | 546617|
|   Blitz tournament | 472071|
| Classical tourna...| 165506|
|             Bullet |1197654|
|             Bullet |      2|
|     Correspondence |  22211|
|              Blitz |2338364|
|          Classical |      7|
|              Blitz |     12|
|   Blitz tournament |      1|
|  Bullet tournament |      2|
|Classical tournam...|      1|
+--------------------+-------+



**Renombrar una columna:**

In [19]:
df = df.withColumnRenamed("Opening", "Opening Type")

In [20]:
df.show(3)

+------------------+---------------+----------+------+----------+--------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+
|             Event|          White|     Black|Result|   UTCDate| UTCTime|WhiteElo|BlackElo|WhiteRatingDiff|BlackRatingDiff|ECO|        Opening Type|TimeControl| Termination|
+------------------+---------------+----------+------+----------+--------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+
|        Classical |        eisaaaa|  HAMID449|   1-0|2016.06.30|22:00:01|    1901|    1896|           11.0|          -11.0|D10|        Slav Defense|      300+5|Time forfeit|
|            Blitz |         go4jas|Sergei1973|   0-1|2016.06.30|22:00:01|    1641|    1627|          -11.0|           12.0|C20|King's Pawn Openi...|      300+0|      Normal|
| Blitz tournament |Evangelistaizac|    kafune|   1-0|2016.06.30|22:00:02|    1647|    1688|           13.0|          -13.0|B

In [21]:
from pyspark.sql import functions as F  # Assuming you've imported it earlier

df.groupBy("Event").agg(
    F.count("WhiteElo").alias("count"),
    F.sum("WhiteElo").alias("sum"),
    F.max("WhiteElo").alias("max"),
    F.min("WhiteElo").alias("min"),
    F.avg("WhiteElo").alias("avg")
).show(10)

+--------------------+-------+-------------+----+----+------------------+
|               Event|  count|          sum| max| min|               avg|
+--------------------+-------+-------------+----+----+------------------+
|              Blitz |2338364|4.035988081E9| 999|1000|1725.9879475564967|
|   Blitz tournament | 472071| 8.50283189E8| 999|1000|1801.1764946374592|
|             Bullet |1197654|2.112519592E9| 999|1000| 1763.881381434037|
|  Bullet tournament | 546617|1.013198446E9| 999|1000|1853.5801960056126|
|          Classical |1509068|2.545148201E9| 999|1000|1686.5695919600707|
| Classical tourna...| 165506|   2.959576E8| 995|1000| 1788.198615155946|
|     Correspondence |  22211|  3.6530627E7|2510|1011| 1644.708792940435|
|              Blitz |     12|      19904.0|2171|1391|1658.6666666666667|
|   Blitz tournament |      1|       1939.0|1939|1939|            1939.0|
|             Bullet |      2|       4277.0|2589|1688|            2138.5|
+--------------------+-------+--------