## Limpeza de dados Cartola ano 2019

In [1]:
from pyspark.sql import HiveContext
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.sql.functions import lower, col, lit, regexp_replace, trim, substring, when, expr, udf, count, sum, monotonically_increasing_id
import pandas as pd
import json
import requests

# Confirguração para não sobrescrever DF
spark.conf.set('spark.sql.sources.partitionOverwriteMode', 'dynamic')

## Funções Custonmizadas

In [2]:
def remove_after_hiphen(team_name):
    if team_name.startswith('atletico') or team_name.startswith('atl')  or team_name.startswith('Atl') or team_name.startswith('Ath'):
        return team_name
    else:
        return team_name.split('-', 1)[0]

In [3]:
#Remove dos valores das Strings o que estiver após os hiphen
remove_hiphen_udf = udf(remove_after_hiphen, StringType())
#partidas_2014_ct = partidas_2014_df.withColumn('away_team', remove_hiphen_udf(partidas_2014_df['away_team']))

## Jogadores

In [4]:
#Carrega arquivo CSV Jogadores
jogadores_2019_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2019/2019-medias-jogadores.csv", header=True)
sorted_jogadores_2019_df = jogadores_2019_df.sort(jogadores_2019_df.player_id.asc())


In [5]:
#Carrega arquivo Posicoes_ID
posicoes_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/posicoes_ids.csv", header=True)
posicoes_df.toPandas()


Unnamed: 0,_c0,Cod,Position,abbr
0,1,1,Goleiro,gol
1,2,2,Lateral,lat
2,3,3,Zagueiro,zag
3,4,4,Meia,mei
4,5,5,Atacante,ata
5,6,6,Técnico,tec


In [6]:
#Adiciona coluna ANO = 2019
jogadores_2019_ano = sorted_jogadores_2019_df.withColumn('year', lit(2019))
jogadores_2019_ano.toPandas()

Unnamed: 0,player_slug,player_id,player_nickname,player_team,player_position,price_cartoletas,score_mean,score_no_cleansheets_mean,diff_home_away_s,n_games,...,I_mean,FS_mean,FF_mean,G_mean,DD_mean,DP_mean,status,price_diff,last_points,year
0,rickson,100002,Rickson,263,mei,1.4,0.433333333333333,0.433333333333333,-1.38283487393904,6,...,0,1.16666666666667,0.166666666666667,0,0,0,Nulo,0.25,-0.3,2019
1,lincoln,100065,Lincoln,262,ata,5.89,4.00909090909091,4.00909090909091,1.8919422869998,11,...,0.0909090909090909,0.727272727272727,0.545454545454545,0.272727272727273,0,0,Nulo,1.32,13.7,2019
2,igor-gomes,100084,Igor Gomes,276,mei,2.47,1.87777777777778,1.87777777777778,0.413571458750421,27,...,0.037037037037037,1.14814814814815,0.296296296296296,0.0740740740740741,0,0,Nulo,-0.42,3.7,2019
3,pedrinho,100103,Pedrinho,293,ata,1.88,1.36666666666667,1.36666666666667,-1.13910991608839,6,...,0.333333333333333,0.333333333333333,0.5,0,0,0,Dúvida,0.98,6,2019
4,thuler,100125,Thuler,262,zag,3.24,1.80769230769231,-0.115384615384615,0.316909312629831,13,...,0,0.307692307692308,0,0,0,0,Nulo,-0.05,3.5,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687,chara,99891,Chará,282,ata,11.57,4.22777777777778,4.22777777777778,0.894885641247105,18,...,0.222222222222222,1.38888888888889,0.666666666666667,0.166666666666667,0,0,Contundido,-0.44,3.6,2019
688,luan-pereira,99900,Luan Pereira,314,mei,2.34,0.9,0.9,-0.689259442920505,17,...,0,0.941176470588235,0.294117647058824,0,0,0,Provável,-0.08,0.2,2019
689,lucas-campos,99903,Lucas Campos,263,ata,1.91,1.45,1.45,0.163710767695131,10,...,0,1.3,0.4,0,0,0,Nulo,0.23,2,2019
690,ezequiel,99915,Ezequiel,283,ata,3.97,1.62857142857143,1.62857142857143,-0.232526928931772,14,...,0.0714285714285714,1,0.214285714285714,0,0,0,Nulo,-0.24,0.4,2019


In [7]:
jogadores_2019_ano = jogadores_2019_ano['player_id', 'player_nickname', 'player_team', 'player_position']
jogadores_2019_ano.toPandas()

Unnamed: 0,player_id,player_nickname,player_team,player_position
0,100002,Rickson,263,mei
1,100065,Lincoln,262,ata
2,100084,Igor Gomes,276,mei
3,100103,Pedrinho,293,ata
4,100125,Thuler,262,zag
...,...,...,...,...
687,99891,Chará,282,ata
688,99900,Luan Pereira,314,mei
689,99903,Lucas Campos,263,ata
690,99915,Ezequiel,283,ata


In [8]:
jogadores_posicoes = jogadores_2019_ano.join(posicoes_df, jogadores_2019_ano.player_position == posicoes_df.abbr)
jogadores_posicoes.toPandas()

Unnamed: 0,player_id,player_nickname,player_team,player_position,_c0,Cod,Position,abbr
0,100002,Rickson,263,mei,4,4,Meia,mei
1,100065,Lincoln,262,ata,5,5,Atacante,ata
2,100084,Igor Gomes,276,mei,4,4,Meia,mei
3,100103,Pedrinho,293,ata,5,5,Atacante,ata
4,100125,Thuler,262,zag,3,3,Zagueiro,zag
...,...,...,...,...,...,...,...,...
687,99891,Chará,282,ata,5,5,Atacante,ata
688,99900,Luan Pereira,314,mei,4,4,Meia,mei
689,99903,Lucas Campos,263,ata,5,5,Atacante,ata
690,99915,Ezequiel,283,ata,5,5,Atacante,ata


In [9]:
jogadores_posicoes = jogadores_posicoes.withColumnRenamed('player_id', 'ID')
jogadores_posicoes = jogadores_posicoes.withColumnRenamed('player_nickname', 'Apelido')
jogadores_posicoes = jogadores_posicoes.withColumnRenamed('player_team', 'ClubeID')
jogadores_posicoes = jogadores_posicoes.withColumnRenamed('Cod', 'PosicaoID')

In [10]:
# Removendo colunas
jogadores = jogadores_posicoes.drop('_c0', 'player_position', 'Position', 'abbr')

In [11]:
jogadores.toPandas()

Unnamed: 0,ID,Apelido,ClubeID,PosicaoID
0,100002,Rickson,263,4
1,100065,Lincoln,262,5
2,100084,Igor Gomes,276,4
3,100103,Pedrinho,293,5
4,100125,Thuler,262,3
...,...,...,...,...
687,99891,Chará,282,5
688,99900,Luan Pereira,314,4
689,99903,Lucas Campos,263,5
690,99915,Ezequiel,283,5


In [12]:
jogadores_2019_ano = jogadores.withColumn('year', lit(2019))
jogadores_2019_ano.toPandas()

Unnamed: 0,ID,Apelido,ClubeID,PosicaoID,year
0,100002,Rickson,263,4,2019
1,100065,Lincoln,262,5,2019
2,100084,Igor Gomes,276,4,2019
3,100103,Pedrinho,293,5,2019
4,100125,Thuler,262,3,2019
...,...,...,...,...,...
687,99891,Chará,282,5,2019
688,99900,Luan Pereira,314,4,2019
689,99903,Lucas Campos,263,5,2019
690,99915,Ezequiel,283,5,2019


In [13]:
jogadores_2019_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/jogadores/')

In [14]:
#transforma arquivo em parquet
jogadores_2019_parquet = spark.read.option('basePath', '/cartola/clean/jogadores/').parquet('/cartola/clean/jogadores/*')

In [15]:
jogadores_2019_parquet.toPandas()

Unnamed: 0,ID,Apelido,ClubeID,PosicaoID,year
0,51683,Bruno Rangel,315,5,2014
1,51705,Bruno Rodrigo,283,3,2014
2,51772,Éverton Ribeiro,283,4,2014
3,51779,Pedro Botelho,282,2,2014
4,51781,Ávine,265,2,2014
...,...,...,...,...,...
4924,85930,Aylon,327,5,2018
4925,51792,Kanu,287,3,2018
4926,52190,Ralf,264,4,2018
4927,91607,Rony,293,5,2018


In [17]:
print(jogadores_2019_parquet.count())

4929


## Partidas

In [19]:
#Carrega arquivo CSV
partidas_2019_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2019/2019_partidas.csv", header=True)

In [20]:
partidas_2019_df.show(5)

+----------+---------+---------+----------+----------+-----+
|      date|home_team|away_team|home_score|away_score|round|
+----------+---------+---------+----------+----------+-----+
|2019-04-27|      282|      314|         2|         1|    1|
|2019-04-27|      315|      285|         2|         0|    1|
|2019-04-27|      262|      283|         3|         1|    1|
|2019-04-27|      276|      263|         2|         0|    1|
|2019-04-28|      293|      267|         4|         1|    1|
+----------+---------+---------+----------+----------+-----+
only showing top 5 rows



In [21]:
# Criar nome do time com a string antes do Hífen
partidas_2019_ct = partidas_2019_df.withColumn('away_team', remove_hiphen_udf(partidas_2019_df['away_team']))

partidas_2019_ct = partidas_2019_ct.withColumn('home_score', partidas_2019_ct['home_score'].cast(IntegerType()))

partidas_2019_ct = partidas_2019_ct.withColumn('away_score', partidas_2019_ct['away_score'].cast(IntegerType()))

partidas_2019_ct = partidas_2019_ct.withColumn('total_gols', partidas_2019_ct['away_score'] + partidas_2019_ct['home_score'] )

partidas_2019_ct = partidas_2019_ct.withColumn('year', lit(2019))

time_ganhador = expr(
    """IF(home_score > away_score, home_team, IF(home_score = away_score, 'empate', away_team))"""
)

partidas_2019_ct = partidas_2019_ct.withColumn('result', time_ganhador)

partidas_2019_ct.toPandas()

Unnamed: 0,date,home_team,away_team,home_score,away_score,round,total_gols,year,result
0,2019-04-27,282,314,2,1,1,3,2019,282
1,2019-04-27,315,285,2,0,1,2,2019,315
2,2019-04-27,262,283,3,1,1,4,2019,262
3,2019-04-27,276,263,2,0,1,2,2019,276
4,2019-04-28,293,267,4,1,1,5,2019,293
...,...,...,...,...,...,...,...,...,...
375,2019-12-08,290,284,3,2,38,5,2019,290
376,2019-12-08,285,282,2,1,38,3,2019,285
377,2019-12-08,283,275,0,2,38,2,2019,275
378,2019-12-08,277,262,4,0,38,4,2019,277


In [22]:
partidas_2019_ct.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/partidas/')

In [23]:
#transforma arquivo em parquet
partidas_2019_parquet = spark.read.option('basePath', '/cartola/clean/partidas/').parquet('/cartola/clean/partidas/*')

In [24]:
partidas_2019_parquet.toPandas()


Unnamed: 0,game,round,date,home_team,score,away_team,arena,home_score,away_score,total_gols,result,year
0,1,1,20/04/2014 - 18:30,flamengo,0 x 0,goiás,Mané Garrincha - Brasilia - DF,0.0,0.0,0.0,empate,2014
1,2,1,19/04/2014 - 18:30,fluminense,3 x 0,figueirense,Maracanã - Rio de Janeiro - RJ,3.0,0.0,3.0,fluminense,2014
2,3,1,20/04/2014 - 16:00,são paulo,3 x 0,botafogo,Morumbi - Sao Paulo - SP,3.0,0.0,3.0,são paulo,2014
3,4,1,20/04/2014 - 18:30,santos,1 x 1,sport,Vila Belmiro - Santos - SP,1.0,1.0,2.0,empate,2014
4,5,1,20/04/2014 - 16:00,atletico - pr,1 x 0,grêmio,Orlando Scarpelli - Florianopolis - SC,1.0,0.0,1.0,atletico - pr,2014
...,...,...,...,...,...,...,...,...,...,...,...,...
2275,,38,2019-12-08,290,,284,,3.0,2.0,5.0,290,2019
2276,,38,2019-12-08,285,,282,,2.0,1.0,3.0,285,2019
2277,,38,2019-12-08,283,,275,,0.0,2.0,2.0,275,2019
2278,,38,2019-12-08,277,,262,,4.0,0.0,4.0,277,2019


## Times

In [25]:
times_2019_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2019/rodada-1.csv", header=True)

times_2019_df.schema

StructType(List(StructField(_c0,StringType,true),StructField(atletas.nome,StringType,true),StructField(atletas.slug,StringType,true),StructField(atletas.apelido,StringType,true),StructField(atletas.foto,StringType,true),StructField(atletas.atleta_id,StringType,true),StructField(atletas.rodada_id,StringType,true),StructField(atletas.clube_id,StringType,true),StructField(atletas.posicao_id,StringType,true),StructField(atletas.status_id,StringType,true),StructField(atletas.pontos_num,StringType,true),StructField(atletas.preco_num,StringType,true),StructField(atletas.variacao_num,StringType,true),StructField(atletas.media_num,StringType,true),StructField(atletas.clube.id.full.name,StringType,true),StructField(FS,StringType,true),StructField(RB,StringType,true),StructField(PE,StringType,true),StructField(FC,StringType,true),StructField(G,StringType,true),StructField(FF,StringType,true),StructField(FT,StringType,true),StructField(FD,StringType,true),StructField(DD,StringType,true),StructFiel

In [26]:
times_2019_df = times_2019_df.withColumnRenamed('atletas.clube_id', 'Abreviacao')
times_2019_df = times_2019_df.withColumnRenamed('atletas.clube.id.full.name', 'Nome')

In [27]:
times_2019_df = times_2019_df['Abreviacao', 'Nome']
times_2019_df = times_2019_df.dropDuplicates(['Abreviacao', 'Nome'])
times_2019_df.toPandas()

Unnamed: 0,Abreviacao,Nome
0,354,Ceará
1,314,Avaí
2,315,Chapecoense
3,284,Grêmio
4,283,Cruzeiro
5,277,Santos
6,276,São Paulo
7,282,Atlético-MG
8,267,Vasco
9,262,Flamengo


In [28]:
#Ler tabela de referencia Times_ids
times_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/times_ids.csv", header=True)
times_df = times_df.withColumnRenamed('abreviacao', 'abbr')
times_df.limit(5).toPandas()

Unnamed: 0,nome.cbf,nome.cartola,nome.completo,cod.older,cod.2017,cod.2018,id,abbr,escudos.60x60,escudos.45x45,escudos.30x30
0,América - MG,América-MG,America MG,327,327,327,327,AME,https://s.glbimg.com/es/sde/f/organizacoes/201...,https://s.glbimg.com/es/sde/f/organizacoes/201...,https://s.glbimg.com/es/sde/f/organizacoes/201...
1,America - RN,Atlético-RN,America RN,200,200,1,200,OUT,,,
2,Atlético - GO,Atlético-GO,Atletico GO,201,373,373,373,ATL,,,
3,Atlético - MG,Atlético-MG,Atletico Mineiro,282,282,282,282,ATL,https://s.glbimg.com/es/sde/f/equipes/2017/11/...,https://s.glbimg.com/es/sde/f/equipes/2017/11/...,https://s.glbimg.com/es/sde/f/equipes/2017/11/...
4,Atlético - PR,Atlético-PR,Atletico Paranaense,293,293,293,293,ATL,https://s.glbimg.com/es/sde/f/equipes/2015/06/...,https://s.glbimg.com/es/sde/f/equipes/2015/06/...,https://s.glbimg.com/es/sde/f/equipes/2015/06/...


In [29]:
times_df = times_df.withColumnRenamed('nome.cartola', 'ncartola')

In [30]:
times = times_2019_df.join(times_df, times_2019_df.Nome == times_df.ncartola)

times.toPandas()

Unnamed: 0,Abreviacao,Nome,nome.cbf,ncartola,nome.completo,cod.older,cod.2017,cod.2018,id,abbr,escudos.60x60,escudos.45x45,escudos.30x30
0,354,Ceará,Ceará - CE,Ceará,Ceara SC,204,204,354,354,CEA,https://s.glbimg.com/es/sde/f/equipes/2018/05/...,https://s.glbimg.com/es/sde/f/equipes/2018/05/...,https://s.glbimg.com/es/sde/f/equipes/2018/05/...
1,314,Avaí,Avaí - SC,Avaí,Avai FC,202,314,314,314,AVA,,,
2,315,Chapecoense,Chapecoense - SC,Chapecoense,Chapecoense,315,315,315,315,CHA,https://s.glbimg.com/es/sde/f/equipes/2015/08/...,https://s.glbimg.com/es/sde/f/equipes/2015/08/...,https://s.glbimg.com/es/sde/f/equipes/2015/08/...
3,284,Grêmio,Grêmio - RS,Grêmio,Gremio Porto Alegre,284,284,284,284,GRE,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...
4,283,Cruzeiro,Cruzeiro - MG,Cruzeiro,Cruzeiro EC,283,283,283,283,CRU,https://s.glbimg.com/es/sde/f/equipes/2015/04/...,https://s.glbimg.com/es/sde/f/equipes/2015/04/...,https://s.glbimg.com/es/sde/f/equipes/2015/04/...
5,277,Santos,Santos - SP,Santos,Santos FC,277,277,277,277,SAN,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...
6,276,São Paulo,São Paulo - SP,São Paulo,Sao Paulo FC,276,276,276,276,SAO,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...
7,282,Atlético-MG,Atlético - MG,Atlético-MG,Atletico Mineiro,282,282,282,282,ATL,https://s.glbimg.com/es/sde/f/equipes/2017/11/...,https://s.glbimg.com/es/sde/f/equipes/2017/11/...,https://s.glbimg.com/es/sde/f/equipes/2017/11/...
8,267,Vasco,Vasco da Gama - RJ,Vasco,Vasco da Gama,221,267,267,267,VAS,https://s.glbimg.com/es/sde/f/equipes/2016/07/...,https://s.glbimg.com/es/sde/f/equipes/2016/07/...,https://s.glbimg.com/es/sde/f/equipes/2016/07/...
9,262,Flamengo,Flamengo - RJ,Flamengo,Flamengo RJ,262,262,262,262,FLA,https://s.glbimg.com/es/sde/f/equipes/2018/04/...,https://s.glbimg.com/es/sde/f/equipes/2018/04/...,https://s.glbimg.com/es/sde/f/equipes/2018/04/...


In [31]:
times = times['Abreviacao', 'Nome', 'id']
times.toPandas()

Unnamed: 0,Abreviacao,Nome,id
0,354,Ceará,354
1,314,Avaí,314
2,315,Chapecoense,315
3,284,Grêmio,284
4,283,Cruzeiro,283
5,277,Santos,277
6,276,São Paulo,276
7,282,Atlético-MG,282
8,267,Vasco,267
9,262,Flamengo,262


In [61]:
times_2019_df = times.dropDuplicates(['Abreviacao', 'Nome', 'id'])

In [32]:
times_2019_df = times_2019_df.withColumnRenamed('id', 'ID')
times_2019_df.toPandas()

Unnamed: 0,Abreviacao,Nome
0,354,Ceará
1,314,Avaí
2,315,Chapecoense
3,284,Grêmio
4,283,Cruzeiro
5,277,Santos
6,276,São Paulo
7,282,Atlético-MG
8,267,Vasco
9,262,Flamengo


In [33]:
#Adiciona coluna ANO = 2019
times_2019_df = times_2019_df.withColumn('year', lit(2019))
times_2019_df.toPandas()

Unnamed: 0,Abreviacao,Nome,year
0,354,Ceará,2019
1,314,Avaí,2019
2,315,Chapecoense,2019
3,284,Grêmio,2019
4,283,Cruzeiro,2019
5,277,Santos,2019
6,276,São Paulo,2019
7,282,Atlético-MG,2019
8,267,Vasco,2019
9,262,Flamengo,2019


In [34]:
times_2019_df.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/times/')
#transforma arquivo em parquet
times_2019_parquet = spark.read.option('basePath', '/cartola/clean/times/').parquet('/cartola/clean/times/*')

In [35]:
times_2019_parquet.toPandas()

Unnamed: 0,ID,Nome,Abreviacao,Slug,year
0,373,Atlético-GO,ATL,Atlético - GO,2017
1,282,Atlético-MG,ATL,Atlético - MG,2017
2,293,Atlético-PR,ATL,Atlético - PR,2017
3,314,Avaí,AVA,Avaí - SC,2017
4,265,Bahia,BAH,Bahia - BA,2017
...,...,...,...,...,...
114,,Botafogo,263,,2019
115,,Avaí,314,,2019
116,,Vasco,267,,2019
117,,Bahia,265,,2019


## scouts_raw

In [66]:
# Analisando o arquivo 2014_lances.csv vimos que não é necessário processar esse arquivo, pois as informações relevantes estão em scouts_raw.

In [36]:
scouts_raw_2019_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2019/rodada-*.csv", header=True)

In [38]:
scouts_raw_2019_ano = scouts_raw_2019_df.withColumn('year', lit(2019))

In [39]:
scouts_raw_2019_ano.schema

StructType(List(StructField(_c0,StringType,true),StructField(atletas.nome,StringType,true),StructField(atletas.slug,StringType,true),StructField(atletas.apelido,StringType,true),StructField(atletas.foto,StringType,true),StructField(atletas.atleta_id,StringType,true),StructField(atletas.rodada_id,StringType,true),StructField(atletas.clube_id,StringType,true),StructField(atletas.posicao_id,StringType,true),StructField(atletas.status_id,StringType,true),StructField(atletas.pontos_num,StringType,true),StructField(atletas.preco_num,StringType,true),StructField(atletas.variacao_num,StringType,true),StructField(atletas.media_num,StringType,true),StructField(atletas.clube.id.full.name,StringType,true),StructField(CA,StringType,true),StructField(CV,StringType,true),StructField(FC,StringType,true),StructField(FD,StringType,true),StructField(FF,StringType,true),StructField(FS,StringType,true),StructField(PE,StringType,true),StructField(RB,StringType,true),StructField(SG,StringType,true),StructFie

In [40]:
scouts_raw_2019_ano.toPandas()

Unnamed: 0,_c0,atletas.nome,atletas.slug,atletas.apelido,atletas.foto,atletas.atleta_id,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.status_id,...,DD,DP,GS,A,G,I,PP,FT,GC,year
0,1,Ebert William Amâncio,betao,Betão,https://s.glbimg.com/es/sde/f/2019/02/18/c0b80...,37646,38,314,zag,Contundido,...,,,,,,,,,,2019
1,2,Ney Franco da Silveira Júnior,ney-franco,Ney Franco,https://s.glbimg.com/es/sde/f/2019/08/16/4fdb0...,37246,38,290,tec,Provável,...,,,,,,,,,,2019
2,3,Fábio Deivson Lopes Maciel,fabio,Fábio,https://s.glbimg.com/es/sde/f/2018/05/18/d4072...,37656,38,283,gol,Provável,...,44,2,40,,,,,,,2019
3,4,Rafael Martiniano de Miranda Moura,rafael-moura,Rafael Moura,https://s.glbimg.com/es/sde/f/2019/07/16/854eb...,37655,38,290,ata,Provável,...,,,,1,9,7,,,,2019
4,5,Eduardo Luís Abonizio de Souza,edu-dracena,Edu Dracena,https://s.glbimg.com/es/sde/f/2019/06/05/4b2eb...,37657,38,275,zag,Dúvida,...,,,,,,,,,,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30576,734,Nilson Evair Loyola Morales,loyola,Loyola,https://s.glbimg.com/es/sde/f/2019/03/26/c4579...,94680,2,290,lat,Nulo,...,,,,,,,,,,2019
30577,735,Thalles Gabriel Morais dos Reis,thalles,Thalles,https://s.glbimg.com/es/sde/f/2019/03/26/56b3e...,95222,2,290,mei,Nulo,...,,,,,,,,,,2019
30578,736,Marcio Antonio de Sousa Júnior,marcinho,Marcinho,https://s.glbimg.com/es/sde/f/2019/03/26/dd575...,95332,2,290,mei,Nulo,...,,,,,,,,,,2019
30579,737,Hélio Júnio Nunes de Castro,helinho,Helinho,https://s.glbimg.com/es/sde/f/2019/04/01/a4da3...,102598,2,276,ata,Nulo,...,,,,,,,,,,2019


In [41]:
scouts_raw_2019_ano = scouts_raw_2019_ano.withColumnRenamed('atletas.atleta_id', 'AtletaID')
scouts_raw_2019_ano = scouts_raw_2019_ano.withColumnRenamed('atletas.clube_id', 'ClubeID')
scouts_raw_2019_ano = scouts_raw_2019_ano.withColumnRenamed('atletas.pontos_num', 'Pontos')
scouts_raw_2019_ano = scouts_raw_2019_ano.withColumnRenamed('atletas.preco_num', 'Preco')
scouts_raw_2019_ano = scouts_raw_2019_ano.withColumnRenamed('atletas.variacao_num', 'PrecoVariacao')
scouts_raw_2019_ano = scouts_raw_2019_ano.withColumnRenamed('atletas.preco_num', 'Preco')



In [42]:
scouts_raw_2019_ano = scouts_raw_2019_ano.drop('scout', 'atletas.apelido', 'atletas.clube.id.full.name', 'atletas.foto', 'atletas.jogos_num', 'atletas.nome', 'atletas.posicao_id', 'atletas.status_id',  )

In [43]:
scouts_raw_2019_ano = scouts_raw_2019_ano.withColumn("Pontos", scouts_raw_2019_ano["Pontos"].cast(FloatType()))

In [45]:
scouts_raw_2019_ano.toPandas()

Unnamed: 0,_c0,atletas.slug,AtletaID,atletas.rodada_id,ClubeID,Pontos,Preco,PrecoVariacao,atletas.media_num,CA,...,DD,DP,GS,A,G,I,PP,FT,GC,year
0,1,betao,37646,38,314,0.000000,3.97,0,1.91,6,...,,,,,,,,,,2019
1,2,ney-franco,37246,38,290,6.260000,10.17,0.59,3.73,,...,,,,,,,,,,2019
2,3,fabio,37656,38,283,2.000000,10.35,0.42,3.43,2,...,44,2,40,,,,,,,2019
3,4,rafael-moura,37655,38,290,22.200001,7.86,2.89,3.34,4,...,,,,1,9,7,,,,2019
4,5,edu-dracena,37657,38,275,0.000000,5.61,0,3.7,1,...,,,,,,,,,,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30576,734,loyola,94680,2,290,0.000000,2,0,0,,...,,,,,,,,,,2019
30577,735,thalles,95222,2,290,0.000000,2,0,0,,...,,,,,,,,,,2019
30578,736,marcinho,95332,2,290,0.000000,3.05,0,-0.6,,...,,,,,,,,,,2019
30579,737,helinho,102598,2,276,0.000000,6,0,0,,...,,,,,,,,,,2019


In [46]:
scouts_raw_2019_ano.schema

StructType(List(StructField(_c0,StringType,true),StructField(atletas.slug,StringType,true),StructField(AtletaID,StringType,true),StructField(atletas.rodada_id,StringType,true),StructField(ClubeID,StringType,true),StructField(Pontos,FloatType,true),StructField(Preco,StringType,true),StructField(PrecoVariacao,StringType,true),StructField(atletas.media_num,StringType,true),StructField(CA,StringType,true),StructField(CV,StringType,true),StructField(FC,StringType,true),StructField(FD,StringType,true),StructField(FF,StringType,true),StructField(FS,StringType,true),StructField(PE,StringType,true),StructField(RB,StringType,true),StructField(SG,StringType,true),StructField(DD,StringType,true),StructField(DP,StringType,true),StructField(GS,StringType,true),StructField(A,StringType,true),StructField(G,StringType,true),StructField(I,StringType,true),StructField(PP,StringType,true),StructField(FT,StringType,true),StructField(GC,StringType,true),StructField(year,IntegerType,false)))

In [47]:
scouts_raw_2019_ano = scouts_raw_2019_ano.withColumnRenamed('atletas.rodada_id', 'Rodada')
scouts_raw_2019_ano = scouts_raw_2019_ano.withColumnRenamed('atletas.media_num', 'PontosMedia')



In [48]:
scouts_raw_2019_ano = scouts_raw_2019_ano.drop('_c0', 'atletas.slug')
scouts_raw_2019_ano.toPandas()

Unnamed: 0,AtletaID,Rodada,ClubeID,Pontos,Preco,PrecoVariacao,PontosMedia,CA,CV,FC,...,DD,DP,GS,A,G,I,PP,FT,GC,year
0,37646,38,314,0.000000,3.97,0,1.91,6,1,20,...,,,,,,,,,,2019
1,37246,38,290,6.260000,10.17,0.59,3.73,,,,...,,,,,,,,,,2019
2,37656,38,283,2.000000,10.35,0.42,3.43,2,,,...,44,2,40,,,,,,,2019
3,37655,38,290,22.200001,7.86,2.89,3.34,4,1,53,...,,,,1,9,7,,,,2019
4,37657,38,275,0.000000,5.61,0,3.7,1,,4,...,,,,,,,,,,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30576,94680,2,290,0.000000,2,0,0,,,,...,,,,,,,,,,2019
30577,95222,2,290,0.000000,2,0,0,,,,...,,,,,,,,,,2019
30578,95332,2,290,0.000000,3.05,0,-0.6,,,2,...,,,,,,,,,,2019
30579,102598,2,276,0.000000,6,0,0,,,,...,,,,,,,,,,2019


In [49]:
scouts_raw_2019_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/scouts/')
#transforma arquivo em parquet
scouts_raw_2019_ano = spark.read.option('basePath', '/cartola/clean/scouts/').parquet('/cartola/clean/scouts/*')

In [50]:
scouts_raw_2019_ano.toPandas()

Unnamed: 0,AtletaID,Rodada,ClubeID,Participou,Posicao,Jogos,Pontos,PontosMedia,Preco,PrecoVariacao,...,RB,FC,GC,CA,CV,SG,DD,DP,GS,year
0,36540,0,FLA,,,,0.0,,5,0,...,,,,,,,,,,2017
1,36612,0,PAL,,,,0.0,,8,0,...,,,,,,,,,,2017
2,36943,0,ATL,,,,0.0,,10,0,...,,,,,,,,,,2017
3,37245,0,BAH,,,,0.0,,4,0,...,,,,,,,,,,2017
4,37246,0,SPO,,,,0.0,,4,0,...,,,,,,,,,,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187184,94680,2,290,,,,0.0,0,2,0,...,,,,,,,,,,2019
187185,95222,2,290,,,,0.0,0,2,0,...,,,,,,,,,,2019
187186,95332,2,290,,,,0.0,-0.6,3.05,0,...,,2,,,,,,,,2019
187187,102598,2,276,,,,0.0,0,6,0,...,,,,,,,,,,2019


In [52]:
# -        Quantas partidas resultaram em empate?
pontos_por_atleta = scouts_raw_2019_ano[scouts_raw_2019_ano['year'] == 2019]

jogadores_2019 = jogadores_2019_parquet[jogadores_2019_parquet['year'] == 2019]

pontos_por_atleta = pontos_por_atleta.groupBy("AtletaID").agg(sum("Pontos").alias("SomaPontos"))

#count_result = count_result.withColumn('total_wins', count_result['count(1)'])

pontos_por_atleta = pontos_por_atleta.sort(pontos_por_atleta.SomaPontos.desc())

#count_result = count_result.drop('count(1)')

#count_result.show(5)


pontos_por_atleta.toPandas()

Unnamed: 0,AtletaID,SomaPontos
0,83257,269.300001
1,90285,244.900002
2,82453,233.600001
3,87863,229.100004
4,81677,214.100001
...,...,...
1006,102598,-3.400000
1007,63082,-4.000000
1008,50294,-4.200000
1009,99462,-5.100000


In [53]:
scouts_atletas = pontos_por_atleta.join(jogadores_2019, pontos_por_atleta.AtletaID == jogadores_2019.ID)
scouts_atletas = scouts_atletas.sort(scouts_atletas.SomaPontos.desc())
scouts_atletas.toPandas()

Unnamed: 0,AtletaID,SomaPontos,ID,Apelido,ClubeID,PosicaoID,year
0,83257,269.300001,83257,Gabriel,262,5,2019
1,90285,244.900002,90285,Bruno Henrique,262,5,2019
2,82453,233.600001,82453,Tadeu,290,1,2019
3,87863,229.100004,87863,Arrascaeta,262,4,2019
4,81677,214.100001,81677,Carlos Sánchez,277,4,2019
...,...,...,...,...,...,...,...
687,102598,-3.400000,102598,Helinho,276,5,2019
688,63082,-4.000000,63082,Rodolfo,266,1,2019
689,50294,-4.200000,50294,Guilherme,266,4,2019
690,99462,-5.100000,99462,Wesley,314,4,2019


## DEMONSTRAÇÃO

In [None]:
partidas_2014_df = spark.read.csv("/cartola/data/2014/2014_jogadores.csv", header=True)
partidas_2014_ct = partidas_2014_df.withColumn('time', regexp_replace('home_team', ' - RJ', ''))
final_partidas = partidas_2014_ct.withColumn('time_low', lower(col('time'))).show(truncate=False)

In [None]:
with_ano_partidas = partidas_2014_ct.withColumn('ano', lit(2014)).show(truncate=False)
with_ano_partidas.show()

In [None]:
Comando para sobrescrever arquivo caso já existente.
Agrupar scouts por ID e contar (caso tenha duplicidade)

In [85]:
inner_join = partidas_ids_2014_ano.join(times_2014_ano, partidas_ids_2014_ano.Casa == times_2014_ano.ID)
inner_join.toPandas()

Unnamed: 0,ID,Rodada,Casa,Visitante,PlacarCasa,PlacarVisitante,Resultado,ano,ID.1,Nome,Abreviacao,Slug,ano.1
0,179872,1,262,290,0,0,Empate,2014,262,flamengo,FLA,flamengo,2014
1,179873,1,266,316,3,0,Casa,2014,266,fluminense,FLU,fluminense,2014
2,179874,1,276,263,3,0,Casa,2014,276,são paulo,SAO,sao-paulo,2014
3,179875,1,277,292,1,1,Empate,2014,277,santos,SAN,santos,2014
4,179876,1,293,284,1,0,Casa,2014,293,atlético-pr,CAP,atletico-pr,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,180250,29,282,315,1,0,Casa,2014,282,atlético-mg,CAM,atletico-mg,2014
376,180251,29,287,283,0,1,Visitante,2014,287,vitória,VIT,vitoria,2014
377,180252,29,285,264,1,2,Visitante,2014,285,internacional,INT,internacional,2014
378,180253,29,316,294,4,0,Casa,2014,316,figueirense,FIG,figueirense,2014


In [None]:
scouts_atletas = pontos_por_atleta.join(jogadores_2014_parquet, pontos_por_atleta.Atleta == jogadores_2014_parquet.ID)
scouts_atletas = scouts_atletas.sort(scouts_atletas.SomaPontos.desc())
scouts_atletas.toPandas()

In [None]:
# -        Quantas partidas resultaram em empate?
pontos_por_atleta = scouts_raw_2014_ano.groupBy("Atleta").agg(sum("Pontos").alias("SomaPontos"))

#count_result = count_result.withColumn('total_wins', count_result['count(1)'])

pontos_por_atleta = pontos_por_atleta.sort(pontos_por_atleta.SomaPontos.desc())

#count_result = count_result.drop('count(1)')

#count_result.show(5)


pontos_por_atleta.toPandas()