## Limpeza de dados Cartola ano 2020

In [1]:
from pyspark.sql import HiveContext
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.sql.functions import lower, col, lit, regexp_replace, trim, substring, when, expr, udf, count, sum, monotonically_increasing_id
import pandas as pd
import json
import requests

# Confirguração para não sobrescrever DF
spark.conf.set('spark.sql.sources.partitionOverwriteMode', 'dynamic')

## Funções Customizadas

In [2]:
def remove_after_hiphen(team_name):
    if team_name.startswith('atletico') or team_name.startswith('atl')  or team_name.startswith('Atl') or team_name.startswith('Ath'):
        return team_name
    else:
        return team_name.split('-', 1)[0]

In [3]:
#Remove dos valores das Strings o que estiver após os hiphen
remove_hiphen_udf = udf(remove_after_hiphen, StringType())
#partidas_2014_ct = partidas_2014_df.withColumn('away_team', remove_hiphen_udf(partidas_2014_df['away_team']))

## Jogadores

In [4]:
#Carrega arquivo CSV Jogadores
jogadores_2020_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2020/2020-medias-jogadores.csv", header=True)
sorted_jogadores_2020_df = jogadores_2020_df.sort(jogadores_2020_df.player_id.asc())


In [5]:
#Carrega arquivo Posicoes_ID
posicoes_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/posicoes_ids.csv", header=True)
posicoes_df.toPandas()


Unnamed: 0,_c0,Cod,Position,abbr
0,1,1,Goleiro,gol
1,2,2,Lateral,lat
2,3,3,Zagueiro,zag
3,4,4,Meia,mei
4,5,5,Atacante,ata
5,6,6,Técnico,tec


In [6]:
#Adiciona coluna ANO = 2020
jogadores_2020_ano = sorted_jogadores_2020_df.withColumn('year', lit(2020))
jogadores_2020_ano.toPandas()

Unnamed: 0,player_slug,player_id,player_nickname,player_team,player_position,price_cartoletas,score_mean,score_no_cleansheets_mean,diff_home_away_s,n_games,...,A_mean,I_mean,FS_mean,FF_mean,G_mean,DD_mean,status,price_diff,last_points,year
0,lincoln,100065,Lincoln,262,ata,4.25,1.4,1.4,0,8,...,0,0,0,0,0.125,0,Nulo,-0.61,6,2020
1,igor-gomes,100084,Igor Gomes,276,mei,3.04,0.792857142857143,0.792857142857143,0,14,...,0.0714285714285714,0,1.07142857142857,0.285714285714286,0,0,Nulo,-0.67,-1.7,2020
2,pedrinho,100103,Pedrinho,293,ata,2.25,0.725,0.725,0,8,...,0,0,0.5,0.375,0,0,Nulo,-0.15,0.4,2020
3,thuler,100125,Thuler,262,zag,2.12,0.0666666666666666,0.0666666666666666,0,3,...,0,0,0.666666666666667,0,0,0,Nulo,-0.25,-0.6,2020
4,lucas-venuto,100290,Lucas Venuto,292,ata,2.16,1.28333333333333,1.28333333333333,0,6,...,0,0,2.33333333333333,0,0,0,Nulo,-0.04,0.4,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,michael,99392,Michael,262,ata,5.43,1.825,1.825,0,8,...,0.125,0,0.25,1.25,0,0,Nulo,-0.48,-1.1,2020
595,fernando,99550,Fernando,263,lat,3.2,1,1,0,2,...,0,0,0,0,0,0,Dúvida,0.65,5.3,2020
596,brenner,99789,Brenner,276,ata,5.05,5.62,5.62,0,10,...,0.1,0.4,1,0.9,0.5,0,Nulo,-0.61,-0.8,2020
597,ricardo,99881,Ricardo,267,zag,7.13,3.7,1.2,0,8,...,0,0,0.5,0.375,0,0,Dúvida,-0.04,-0.1,2020


In [7]:
jogadores_2020_ano = jogadores_2020_ano['player_id', 'player_nickname', 'player_team', 'player_position']
jogadores_2020_ano.toPandas()

Unnamed: 0,player_id,player_nickname,player_team,player_position
0,100065,Lincoln,262,ata
1,100084,Igor Gomes,276,mei
2,100103,Pedrinho,293,ata
3,100125,Thuler,262,zag
4,100290,Lucas Venuto,292,ata
...,...,...,...,...
594,99392,Michael,262,ata
595,99550,Fernando,263,lat
596,99789,Brenner,276,ata
597,99881,Ricardo,267,zag


In [8]:
jogadores_posicoes = jogadores_2020_ano.join(posicoes_df, jogadores_2020_ano.player_position == posicoes_df.abbr)
jogadores_posicoes.toPandas()

Unnamed: 0,player_id,player_nickname,player_team,player_position,_c0,Cod,Position,abbr
0,100065,Lincoln,262,ata,5,5,Atacante,ata
1,100084,Igor Gomes,276,mei,4,4,Meia,mei
2,100103,Pedrinho,293,ata,5,5,Atacante,ata
3,100125,Thuler,262,zag,3,3,Zagueiro,zag
4,100290,Lucas Venuto,292,ata,5,5,Atacante,ata
...,...,...,...,...,...,...,...,...
594,99392,Michael,262,ata,5,5,Atacante,ata
595,99550,Fernando,263,lat,2,2,Lateral,lat
596,99789,Brenner,276,ata,5,5,Atacante,ata
597,99881,Ricardo,267,zag,3,3,Zagueiro,zag


In [9]:
jogadores_posicoes = jogadores_posicoes.withColumnRenamed('player_id', 'ID')
jogadores_posicoes = jogadores_posicoes.withColumnRenamed('player_nickname', 'Apelido')
jogadores_posicoes = jogadores_posicoes.withColumnRenamed('player_team', 'ClubeID')
jogadores_posicoes = jogadores_posicoes.withColumnRenamed('Cod', 'PosicaoID')

In [10]:
# Removendo colunas
jogadores = jogadores_posicoes.drop('_c0', 'player_position', 'Position', 'abbr')

In [11]:
jogadores.toPandas()

Unnamed: 0,ID,Apelido,ClubeID,PosicaoID
0,100065,Lincoln,262,5
1,100084,Igor Gomes,276,4
2,100103,Pedrinho,293,5
3,100125,Thuler,262,3
4,100290,Lucas Venuto,292,5
...,...,...,...,...
594,99392,Michael,262,5
595,99550,Fernando,263,2
596,99789,Brenner,276,5
597,99881,Ricardo,267,3


In [12]:
jogadores_2020_ano = jogadores.withColumn('year', lit(2020))
jogadores_2020_ano.toPandas()

Unnamed: 0,ID,Apelido,ClubeID,PosicaoID,year
0,100065,Lincoln,262,5,2020
1,100084,Igor Gomes,276,4,2020
2,100103,Pedrinho,293,5,2020
3,100125,Thuler,262,3,2020
4,100290,Lucas Venuto,292,5,2020
...,...,...,...,...,...
594,99392,Michael,262,5,2020
595,99550,Fernando,263,2,2020
596,99789,Brenner,276,5,2020
597,99881,Ricardo,267,3,2020


In [13]:
jogadores_2020_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/jogadores/')

In [14]:
#transforma arquivo em parquet
jogadores_2020_parquet = spark.read.option('basePath', '/cartola/clean/jogadores/').parquet('/cartola/clean/jogadores/*')

In [15]:
jogadores_2020_parquet.toPandas()

Unnamed: 0,ID,Apelido,ClubeID,PosicaoID,year
0,51683,Bruno Rangel,315,5,2014
1,51705,Bruno Rodrigo,283,3,2014
2,51772,Éverton Ribeiro,283,4,2014
3,51779,Pedro Botelho,282,2,2014
4,51781,Ávine,265,2,2014
...,...,...,...,...,...
5523,85930,Aylon,327,5,2018
5524,51792,Kanu,287,3,2018
5525,52190,Ralf,264,4,2018
5526,91607,Rony,293,5,2018


In [16]:
print(jogadores_2020_parquet.count())

5528


## Partidas

In [17]:
#Carrega arquivo CSV
partidas_2020_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2020/2020_partidas.csv", header=True)

In [18]:
partidas_2020_df.show(5)

+----------+---------+---------+----------+----------+-----+
|      date|home_team|away_team|home_score|away_score|round|
+----------+---------+---------+----------+----------+-----+
|2020-08-08|      356|      293|         0|         2|    1|
|2020-08-08|      294|      285|         0|         1|    1|
|2020-08-08|      292|      354|         3|         2|    1|
|2020-08-09|      262|      282|         0|         1|    1|
|2020-08-09|      277|      280|         1|         1|    1|
+----------+---------+---------+----------+----------+-----+
only showing top 5 rows



In [19]:
# Criar nome do time com a string antes do Hífen
partidas_2020_ct = partidas_2020_df.withColumn('away_team', remove_hiphen_udf(partidas_2020_df['away_team']))

partidas_2020_ct = partidas_2020_ct.withColumn('home_score', partidas_2020_ct['home_score'].cast(IntegerType()))

partidas_2020_ct = partidas_2020_ct.withColumn('away_score', partidas_2020_ct['away_score'].cast(IntegerType()))

partidas_2020_ct = partidas_2020_ct.withColumn('total_gols', partidas_2020_ct['away_score'] + partidas_2020_ct['home_score'] )

partidas_2020_ct = partidas_2020_ct.withColumn('year', lit(2020))

time_ganhador = expr(
    """IF(home_score > away_score, home_team, IF(home_score = away_score, 'empate', away_team))"""
)

partidas_2020_ct = partidas_2020_ct.withColumn('result', time_ganhador)

partidas_2020_ct.toPandas()

Unnamed: 0,date,home_team,away_team,home_score,away_score,round,total_gols,year,result
0,2020-08-08,356,293,0,2,1,2,2020,293
1,2020-08-08,294,285,0,1,1,1,2020,285
2,2020-08-08,292,354,3,2,1,5,2020,292
3,2020-08-09,262,282,0,1,1,1,2020,282
4,2020-08-09,277,280,1,1,1,2,2020,empate
...,...,...,...,...,...,...,...,...,...
385,2021-02-25,293,292,2,0,38,2,2020,293
386,2021-02-25,354,263,2,1,38,3,2020,354
387,2021-02-25,354,263,2,1,38,3,2020,354
388,2021-02-25,373,294,3,1,38,4,2020,373


In [21]:
partidas_2020_ct.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/partidas/')

In [22]:
#transforma arquivo em parquet
partidas_2020_parquet = spark.read.option('basePath', '/cartola/clean/partidas/').parquet('/cartola/clean/partidas/*')

In [23]:
partidas_2020_parquet.toPandas()


Unnamed: 0,game,round,date,home_team,score,away_team,arena,home_score,away_score,total_gols,result,year
0,1,1,20/04/2014 - 18:30,flamengo,0 x 0,goiás,Mané Garrincha - Brasilia - DF,0.0,0.0,0.0,empate,2014
1,2,1,19/04/2014 - 18:30,fluminense,3 x 0,figueirense,Maracanã - Rio de Janeiro - RJ,3.0,0.0,3.0,fluminense,2014
2,3,1,20/04/2014 - 16:00,são paulo,3 x 0,botafogo,Morumbi - Sao Paulo - SP,3.0,0.0,3.0,são paulo,2014
3,4,1,20/04/2014 - 18:30,santos,1 x 1,sport,Vila Belmiro - Santos - SP,1.0,1.0,2.0,empate,2014
4,5,1,20/04/2014 - 16:00,atletico - pr,1 x 0,grêmio,Orlando Scarpelli - Florianopolis - SC,1.0,0.0,1.0,atletico - pr,2014
...,...,...,...,...,...,...,...,...,...,...,...,...
2665,,38,2019-12-08,290,,284,,3.0,2.0,5.0,290,2019
2666,,38,2019-12-08,285,,282,,2.0,1.0,3.0,285,2019
2667,,38,2019-12-08,283,,275,,0.0,2.0,2.0,275,2019
2668,,38,2019-12-08,277,,262,,4.0,0.0,4.0,277,2019


## Times

In [24]:
times_2020_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2020/rodada-1.csv", header=True)

times_2020_df.schema

StructType(List(StructField(_c0,StringType,true),StructField(atletas.nome,StringType,true),StructField(atletas.slug,StringType,true),StructField(atletas.apelido,StringType,true),StructField(atletas.foto,StringType,true),StructField(atletas.atleta_id,StringType,true),StructField(atletas.rodada_id,StringType,true),StructField(atletas.clube_id,StringType,true),StructField(atletas.posicao_id,StringType,true),StructField(atletas.status_id,StringType,true),StructField(atletas.pontos_num,StringType,true),StructField(atletas.preco_num,StringType,true),StructField(atletas.variacao_num,StringType,true),StructField(atletas.media_num,StringType,true),StructField(atletas.jogos_num,StringType,true),StructField(atletas.clube.id.full.name,StringType,true),StructField(FF,StringType,true),StructField(FS,StringType,true),StructField(G,StringType,true),StructField(PI,StringType,true),StructField(CA,StringType,true),StructField(FC,StringType,true),StructField(DS,StringType,true),StructField(FT,StringType,t

In [25]:
times_2020_df = times_2020_df.withColumnRenamed('atletas.clube_id', 'Abreviacao')
times_2020_df = times_2020_df.withColumnRenamed('atletas.clube.id.full.name', 'Nome')

In [26]:
times_2020_df = times_2020_df['Abreviacao', 'Nome']
times_2020_df = times_2020_df.dropDuplicates(['Abreviacao', 'Nome'])
times_2020_df.toPandas()

Unnamed: 0,Abreviacao,Nome
0,354,Ceará
1,373,Atlético-GO
2,284,Grêmio
3,277,Santos
4,276,São Paulo
5,282,Atlético-MG
6,267,Vasco
7,262,Flamengo
8,285,Internacional
9,263,Botafogo


In [27]:
#Ler tabela de referencia Times_ids
times_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/times_ids.csv", header=True)
times_df = times_df.withColumnRenamed('abreviacao', 'abbr')
times_df.limit(5).toPandas()

Unnamed: 0,nome.cbf,nome.cartola,nome.completo,cod.older,cod.2017,cod.2018,id,abbr,escudos.60x60,escudos.45x45,escudos.30x30
0,América - MG,América-MG,America MG,327,327,327,327,AME,https://s.glbimg.com/es/sde/f/organizacoes/201...,https://s.glbimg.com/es/sde/f/organizacoes/201...,https://s.glbimg.com/es/sde/f/organizacoes/201...
1,America - RN,Atlético-RN,America RN,200,200,1,200,OUT,,,
2,Atlético - GO,Atlético-GO,Atletico GO,201,373,373,373,ATL,,,
3,Atlético - MG,Atlético-MG,Atletico Mineiro,282,282,282,282,ATL,https://s.glbimg.com/es/sde/f/equipes/2017/11/...,https://s.glbimg.com/es/sde/f/equipes/2017/11/...,https://s.glbimg.com/es/sde/f/equipes/2017/11/...
4,Atlético - PR,Atlético-PR,Atletico Paranaense,293,293,293,293,ATL,https://s.glbimg.com/es/sde/f/equipes/2015/06/...,https://s.glbimg.com/es/sde/f/equipes/2015/06/...,https://s.glbimg.com/es/sde/f/equipes/2015/06/...


In [28]:
times_df = times_df.withColumnRenamed('nome.cartola', 'ncartola')

In [29]:
times = times_2020_df.join(times_df, times_2020_df.Nome == times_df.ncartola)

times.toPandas()

Unnamed: 0,Abreviacao,Nome,nome.cbf,ncartola,nome.completo,cod.older,cod.2017,cod.2018,id,abbr,escudos.60x60,escudos.45x45,escudos.30x30
0,354,Ceará,Ceará - CE,Ceará,Ceara SC,204,204,354,354,CEA,https://s.glbimg.com/es/sde/f/equipes/2018/05/...,https://s.glbimg.com/es/sde/f/equipes/2018/05/...,https://s.glbimg.com/es/sde/f/equipes/2018/05/...
1,373,Atlético-GO,Atlético - GO,Atlético-GO,Atletico GO,201,373,373,373,ATL,,,
2,284,Grêmio,Grêmio - RS,Grêmio,Gremio Porto Alegre,284,284,284,284,GRE,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...
3,277,Santos,Santos - SP,Santos,Santos FC,277,277,277,277,SAN,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...
4,276,São Paulo,São Paulo - SP,São Paulo,Sao Paulo FC,276,276,276,276,SAO,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...
5,282,Atlético-MG,Atlético - MG,Atlético-MG,Atletico Mineiro,282,282,282,282,ATL,https://s.glbimg.com/es/sde/f/equipes/2017/11/...,https://s.glbimg.com/es/sde/f/equipes/2017/11/...,https://s.glbimg.com/es/sde/f/equipes/2017/11/...
6,267,Vasco,Vasco da Gama - RJ,Vasco,Vasco da Gama,221,267,267,267,VAS,https://s.glbimg.com/es/sde/f/equipes/2016/07/...,https://s.glbimg.com/es/sde/f/equipes/2016/07/...,https://s.glbimg.com/es/sde/f/equipes/2016/07/...
7,262,Flamengo,Flamengo - RJ,Flamengo,Flamengo RJ,262,262,262,262,FLA,https://s.glbimg.com/es/sde/f/equipes/2018/04/...,https://s.glbimg.com/es/sde/f/equipes/2018/04/...,https://s.glbimg.com/es/sde/f/equipes/2018/04/...
8,285,Internacional,Internacional - RS,Internacional,Internacional,285,285,285,285,INT,https://s.glbimg.com/es/sde/f/equipes/2016/05/...,https://s.glbimg.com/es/sde/f/equipes/2016/05/...,https://s.glbimg.com/es/sde/f/equipes/2016/05/...
9,263,Botafogo,Botafogo - RJ,Botafogo,Botafogo RJ,263,263,263,263,BOT,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...


In [30]:
times = times['Abreviacao', 'Nome', 'id']
times.toPandas()

Unnamed: 0,Abreviacao,Nome,id
0,354,Ceará,354
1,373,Atlético-GO,373
2,284,Grêmio,284
3,277,Santos,277
4,276,São Paulo,276
5,282,Atlético-MG,282
6,267,Vasco,267
7,262,Flamengo,262
8,285,Internacional,285
9,263,Botafogo,263


In [31]:
times_2020_df = times.dropDuplicates(['Abreviacao', 'Nome', 'id'])

In [32]:
times_2020_df = times_2020_df.withColumnRenamed('id', 'ID')
times_2020_df.toPandas()

Unnamed: 0,Abreviacao,Nome,ID
0,354,Ceará,354
1,373,Atlético-GO,373
2,284,Grêmio,284
3,277,Santos,277
4,276,São Paulo,276
5,282,Atlético-MG,282
6,267,Vasco,267
7,262,Flamengo,262
8,285,Internacional,285
9,263,Botafogo,263


In [33]:
#Adiciona coluna ANO = 2020
times_2020_df = times_2020_df.withColumn('year', lit(2020))
times_2020_df.toPandas()

Unnamed: 0,Abreviacao,Nome,ID,year
0,354,Ceará,354,2020
1,373,Atlético-GO,373,2020
2,284,Grêmio,284,2020
3,277,Santos,277,2020
4,276,São Paulo,276,2020
5,282,Atlético-MG,282,2020
6,267,Vasco,267,2020
7,262,Flamengo,262,2020
8,285,Internacional,285,2020
9,263,Botafogo,263,2020


In [34]:
times_2020_df.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/times/')
#transforma arquivo em parquet
times_2020_parquet = spark.read.option('basePath', '/cartola/clean/times/').parquet('/cartola/clean/times/*')

In [35]:
times_2020_parquet.toPandas()

Unnamed: 0,ID,Nome,Abreviacao,Slug,year
0,373,Atlético-GO,ATL,Atlético - GO,2017
1,282,Atlético-MG,ATL,Atlético - MG,2017
2,293,Atlético-PR,ATL,Atlético - PR,2017
3,314,Avaí,AVA,Avaí - SC,2017
4,265,Bahia,BAH,Bahia - BA,2017
...,...,...,...,...,...
133,,Botafogo,263,,2019
134,,Avaí,314,,2019
135,,Vasco,267,,2019
136,,Bahia,265,,2019


## scouts_raw

In [36]:
# Analisando o arquivo 2014_lances.csv vimos que não é necessário processar esse arquivo, pois as informações relevantes estão em scouts_raw.

In [37]:
scouts_raw_2020_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2020/rodada-*.csv", header=True)

In [38]:
scouts_raw_2020_ano = scouts_raw_2020_df.withColumn('year', lit(2020))

In [39]:
scouts_raw_2020_ano.schema

StructType(List(StructField(_c0,StringType,true),StructField(atletas.nome,StringType,true),StructField(atletas.slug,StringType,true),StructField(atletas.apelido,StringType,true),StructField(atletas.foto,StringType,true),StructField(atletas.atleta_id,StringType,true),StructField(atletas.rodada_id,StringType,true),StructField(atletas.clube_id,StringType,true),StructField(atletas.posicao_id,StringType,true),StructField(atletas.status_id,StringType,true),StructField(atletas.pontos_num,StringType,true),StructField(atletas.preco_num,StringType,true),StructField(atletas.variacao_num,StringType,true),StructField(atletas.media_num,StringType,true),StructField(atletas.jogos_num,StringType,true),StructField(atletas.clube.id.full.name,StringType,true),StructField(CA,StringType,true),StructField(DS,StringType,true),StructField(FC,StringType,true),StructField(FD,StringType,true),StructField(FF,StringType,true),StructField(FS,StringType,true),StructField(G,StringType,true),StructField(I,StringType,tr

In [40]:
scouts_raw_2020_ano.toPandas()

Unnamed: 0,_c0,atletas.nome,atletas.slug,atletas.apelido,atletas.foto,atletas.atleta_id,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.status_id,...,A,SG,DD,DP,GS,FT,PP,CV,GC,year
0,1,Augusto Sérgio Ferreira,guto-ferreira,Guto Ferreira,https://s.glbimg.com/es/sde/f/2020/07/19/37fe3...,37245,25,354,tec,Provável,...,,,,,,,,,,2020
1,2,Rafael Martiniano de Miranda Moura,rafael-moura,Rafael Moura,https://s.glbimg.com/es/sde/f/2020/08/26/6c384...,37655,25,290,ata,Provável,...,,,,,,,,,,2020
2,3,Cícero Santos,cicero,Cícero,https://s.glbimg.com/es/sde/f/2019/03/22/127a9...,37688,25,263,mei,Nulo,...,,,,,,,,,,2020
3,4,Marcelo Augusto Oliveira Chamusca,marcelo-chamusca,Marcelo Chamusca,https://s.glbimg.com/es/sde/f/2019/04/18/bb15f...,37319,25,356,tec,Provável,...,,,,,,,,,,2020
4,5,Jonathan Cícero Moreira,jonathan,Jonathan,https://s.glbimg.com/es/sde/f/2019/03/30/8b565...,37662,25,293,lat,Nulo,...,1,4,,,,,,,,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29073,805,Abel Carlos da Silva Braga,abel-braga,Abel Braga,https://s.glbimg.com/es/sde/f/2020/11/10/0f92e...,40006,37,285,tec,Provável,...,,,,,,,,,,2020
29074,806,Matheus Sousa Pereira,pereira,Pereira,https://s.glbimg.com/es/sde/f/2020/11/13/9dbc9...,97711,37,373,zag,Nulo,...,,,,,,,,,,2020
29075,807,Julimar Silva Oliveira Júnior,julimar,Julimar,https://s.glbimg.com/es/sde/f/2021/02/20/de210...,102796,37,293,ata,Nulo,...,,,,,,,,,,2020
29076,808,Marino Hinestroza Angulo,marino,Marino,https://s.glbimg.com/es/sde/f/2020/11/11/7f4b5...,110598,37,275,ata,Nulo,...,,,,,,,,,,2020


In [41]:
scouts_raw_2020_ano = scouts_raw_2020_ano.withColumnRenamed('atletas.atleta_id', 'AtletaID')
scouts_raw_2020_ano = scouts_raw_2020_ano.withColumnRenamed('atletas.clube_id', 'ClubeID')
scouts_raw_2020_ano = scouts_raw_2020_ano.withColumnRenamed('atletas.pontos_num', 'Pontos')
scouts_raw_2020_ano = scouts_raw_2020_ano.withColumnRenamed('atletas.preco_num', 'Preco')
scouts_raw_2020_ano = scouts_raw_2020_ano.withColumnRenamed('atletas.variacao_num', 'PrecoVariacao')
scouts_raw_2020_ano = scouts_raw_2020_ano.withColumnRenamed('atletas.preco_num', 'Preco')



In [42]:
scouts_raw_2020_ano = scouts_raw_2020_ano.drop('scout', 'atletas.apelido', 'atletas.clube.id.full.name', 'atletas.foto', 'atletas.jogos_num', 'atletas.nome', 'atletas.posicao_id', 'atletas.status_id',  )

In [43]:
scouts_raw_2020_ano = scouts_raw_2020_ano.withColumn("Pontos", scouts_raw_2020_ano["Pontos"].cast(FloatType()))

In [44]:
scouts_raw_2020_ano.toPandas()

Unnamed: 0,_c0,atletas.slug,AtletaID,atletas.rodada_id,ClubeID,Pontos,Preco,PrecoVariacao,atletas.media_num,CA,...,A,SG,DD,DP,GS,FT,PP,CV,GC,year
0,1,guto-ferreira,37245,25,354,3.83,9.38,-0.52,3.79,,...,,,,,,,,,,2020
1,2,rafael-moura,37655,25,290,-0.90,5.39,-1.31,1.77,5,...,,,,,,,,,,2020
2,3,cicero,37688,25,263,0.00,4.72,0,0.67,,...,,,,,,,,,,2020
3,4,marcelo-chamusca,37319,25,356,2.23,7.13,-0.46,2.79,,...,,,,,,,,,,2020
4,5,jonathan,37662,25,293,0.00,9.64,0,3.07,2,...,1,4,,,,,,,,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29073,805,abel-braga,40006,37,285,0.00,11.22,-0.24,3.73,,...,,,,,,,,,,2020
29074,806,pereira,97711,37,373,0.00,4.65,0,3.44,,...,,,,,,,,,,2020
29075,807,julimar,102796,37,293,0.00,1,0,0,,...,,,,,,,,,,2020
29076,808,marino,110598,37,275,0.00,1,0,0,,...,,,,,,,,,,2020


In [45]:
scouts_raw_2020_ano.schema

StructType(List(StructField(_c0,StringType,true),StructField(atletas.slug,StringType,true),StructField(AtletaID,StringType,true),StructField(atletas.rodada_id,StringType,true),StructField(ClubeID,StringType,true),StructField(Pontos,FloatType,true),StructField(Preco,StringType,true),StructField(PrecoVariacao,StringType,true),StructField(atletas.media_num,StringType,true),StructField(CA,StringType,true),StructField(DS,StringType,true),StructField(FC,StringType,true),StructField(FD,StringType,true),StructField(FF,StringType,true),StructField(FS,StringType,true),StructField(G,StringType,true),StructField(I,StringType,true),StructField(PI,StringType,true),StructField(A,StringType,true),StructField(SG,StringType,true),StructField(DD,StringType,true),StructField(DP,StringType,true),StructField(GS,StringType,true),StructField(FT,StringType,true),StructField(PP,StringType,true),StructField(CV,StringType,true),StructField(GC,StringType,true),StructField(year,IntegerType,false)))

In [46]:
scouts_raw_2020_ano = scouts_raw_2020_ano.withColumnRenamed('atletas.rodada_id', 'Rodada')
scouts_raw_2020_ano = scouts_raw_2020_ano.withColumnRenamed('atletas.media_num', 'PontosMedia')



In [47]:
scouts_raw_2020_ano = scouts_raw_2020_ano.drop('_c0', 'atletas.slug', 'atletas.posicao_id', 'atletas.status_id')
scouts_raw_2020_ano.toPandas()

Unnamed: 0,AtletaID,Rodada,ClubeID,Pontos,Preco,PrecoVariacao,PontosMedia,CA,DS,FC,...,A,SG,DD,DP,GS,FT,PP,CV,GC,year
0,37245,25,354,3.83,9.38,-0.52,3.79,,,,...,,,,,,,,,,2020
1,37655,25,290,-0.90,5.39,-1.31,1.77,5,5,30,...,,,,,,,,,,2020
2,37688,25,263,0.00,4.72,0,0.67,,1,,...,,,,,,,,,,2020
3,37319,25,356,2.23,7.13,-0.46,2.79,,,,...,,,,,,,,,,2020
4,37662,25,293,0.00,9.64,0,3.07,2,8,4,...,1,4,,,,,,,,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29073,40006,37,285,0.00,11.22,-0.24,3.73,,,,...,,,,,,,,,,2020
29074,97711,37,373,0.00,4.65,0,3.44,,,,...,,,,,,,,,,2020
29075,102796,37,293,0.00,1,0,0,,,,...,,,,,,,,,,2020
29076,110598,37,275,0.00,1,0,0,,,,...,,,,,,,,,,2020


In [48]:
scouts_raw_2020_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/scouts/')
#transforma arquivo em parquet
scouts_raw_2020_ano = spark.read.option('basePath', '/cartola/clean/scouts/').parquet('/cartola/clean/scouts/*')

In [49]:
scouts_raw_2020_ano.toPandas()

Unnamed: 0,AtletaID,Rodada,ClubeID,Participou,Posicao,Jogos,Pontos,PontosMedia,Preco,PrecoVariacao,...,RB,FC,GC,CA,CV,SG,DD,DP,GS,year
0,36540,0,FLA,,,,0.0,,5,0,...,,,,,,,,,,2017
1,36612,0,PAL,,,,0.0,,8,0,...,,,,,,,,,,2017
2,36943,0,ATL,,,,0.0,,10,0,...,,,,,,,,,,2017
3,37245,0,BAH,,,,0.0,,4,0,...,,,,,,,,,,2017
4,37246,0,SPO,,,,0.0,,4,0,...,,,,,,,,,,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216262,40006,37,285,,,,0.0,3.73,11.22,-0.24,...,,,,,,,,,,2020
216263,97711,37,373,,,,0.0,3.44,4.65,0,...,,,,,,,,,,2020
216264,102796,37,293,,,,0.0,0,1,0,...,,,,,,,,,,2020
216265,110598,37,275,,,,0.0,0,1,0,...,,,,,,,,,,2020


In [50]:
# -        Quantas partidas resultaram em empate?
pontos_por_atleta = scouts_raw_2020_ano[scouts_raw_2020_ano['year'] == 2020]

jogadores_2020 = jogadores_2020_parquet[jogadores_2020_parquet['year'] == 2020]

pontos_por_atleta = pontos_por_atleta.groupBy("AtletaID").agg(sum("Pontos").alias("SomaPontos"))

#count_result = count_result.withColumn('total_wins', count_result['count(1)'])

pontos_por_atleta = pontos_por_atleta.sort(pontos_por_atleta.SomaPontos.desc())

#count_result = count_result.drop('count(1)')

#count_result.show(5)


pontos_por_atleta.toPandas()

Unnamed: 0,AtletaID,SomaPontos
0,68952,254.700000
1,92630,230.000001
2,71844,195.800000
3,71162,193.500001
4,86485,187.200000
...,...,...
1008,106843,-5.800000
1009,71640,-6.600000
1010,72595,-7.800000
1011,52950,-9.200000


In [52]:
scouts_atletas = pontos_por_atleta.join(jogadores_2020, pontos_por_atleta.AtletaID == jogadores_2020.ID)
scouts_atletas = scouts_atletas.sort(scouts_atletas.SomaPontos.desc())
scouts_atletas.toPandas()

Unnamed: 0,AtletaID,SomaPontos,ID,Apelido,ClubeID,PosicaoID,year
0,68952,254.700000,68952,Marinho,277,5,2020
1,92630,230.000001,92630,Claudinho,280,5,2020
2,71844,195.800000,71844,Thiago Galhardo,285,4,2020
3,71162,193.500001,71162,Vinícius,354,4,2020
4,86485,187.200000,86485,Keno,282,5,2020
...,...,...,...,...,...,...,...
594,106843,-5.800000,106843,Borrero,282,4,2020
595,71640,-6.600000,71640,Ronaldo Silva,292,5,2020
596,72595,-7.800000,72595,Maurício Kozlinski,373,1,2020
597,52950,-9.200000,52950,Victor,282,1,2020


## DEMONSTRAÇÃO

In [None]:
partidas_2014_df = spark.read.csv("/cartola/data/2014/2014_jogadores.csv", header=True)
partidas_2014_ct = partidas_2014_df.withColumn('time', regexp_replace('home_team', ' - RJ', ''))
final_partidas = partidas_2014_ct.withColumn('time_low', lower(col('time'))).show(truncate=False)

In [None]:
with_ano_partidas = partidas_2014_ct.withColumn('ano', lit(2014)).show(truncate=False)
with_ano_partidas.show()

In [None]:
Comando para sobrescrever arquivo caso já existente.
Agrupar scouts por ID e contar (caso tenha duplicidade)

In [85]:
inner_join = partidas_ids_2014_ano.join(times_2014_ano, partidas_ids_2014_ano.Casa == times_2014_ano.ID)
inner_join.toPandas()

Unnamed: 0,ID,Rodada,Casa,Visitante,PlacarCasa,PlacarVisitante,Resultado,ano,ID.1,Nome,Abreviacao,Slug,ano.1
0,179872,1,262,290,0,0,Empate,2014,262,flamengo,FLA,flamengo,2014
1,179873,1,266,316,3,0,Casa,2014,266,fluminense,FLU,fluminense,2014
2,179874,1,276,263,3,0,Casa,2014,276,são paulo,SAO,sao-paulo,2014
3,179875,1,277,292,1,1,Empate,2014,277,santos,SAN,santos,2014
4,179876,1,293,284,1,0,Casa,2014,293,atlético-pr,CAP,atletico-pr,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,180250,29,282,315,1,0,Casa,2014,282,atlético-mg,CAM,atletico-mg,2014
376,180251,29,287,283,0,1,Visitante,2014,287,vitória,VIT,vitoria,2014
377,180252,29,285,264,1,2,Visitante,2014,285,internacional,INT,internacional,2014
378,180253,29,316,294,4,0,Casa,2014,316,figueirense,FIG,figueirense,2014


In [None]:
scouts_atletas = pontos_por_atleta.join(jogadores_2014_parquet, pontos_por_atleta.Atleta == jogadores_2014_parquet.ID)
scouts_atletas = scouts_atletas.sort(scouts_atletas.SomaPontos.desc())
scouts_atletas.toPandas()

In [None]:
# -        Quantas partidas resultaram em empate?
pontos_por_atleta = scouts_raw_2014_ano.groupBy("Atleta").agg(sum("Pontos").alias("SomaPontos"))

#count_result = count_result.withColumn('total_wins', count_result['count(1)'])

pontos_por_atleta = pontos_por_atleta.sort(pontos_por_atleta.SomaPontos.desc())

#count_result = count_result.drop('count(1)')

#count_result.show(5)


pontos_por_atleta.toPandas()