- Cleaning and joining datasets
- Creating csv files for EDA and for training

In [1]:
def hscroll(activate=True):
  """Activate/deactivate horizontal scrolling for wide output cells.

  Displays an HTML ``<style>`` element that overrides the ``white-space``
  property of every ``<pre>`` element.

  Args:
    activate: Any truthy value selects ``white-space: pre``; any falsy
      value selects ``white-space: pre-wrap``.
  """
  from IPython.display import display, HTML
  # Branch on truthiness instead of indexing a 2-tuple with `activate`:
  # the tuple lookup raised for truthy non-bool arguments such as 2 or "yes".
  style = 'pre' if activate else 'pre-wrap'
  display(HTML("<style>pre {white-space: %s !important}</style>" % style))

# Apply the style with the default argument (activate=True).
hscroll()

In [2]:
from pyspark.sql import SparkSession

# Local Spark session named "eda" with master "local[1]"; getOrCreate()
# returns an already-running session if there is one.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("eda")
    .getOrCreate()
)


### File 1: pokemon.csv

In [3]:
# Load pokemon.csv: the first row supplies the column names and the column
# types are inferred from the data.
pokemon = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/pokemon.csv")
)

In [4]:
# Show the column names and inferred types of pokemon.csv.
pokemon.printSchema()

root
 |-- #: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Type 1: string (nullable = true)
 |-- Type 2: string (nullable = true)
 |-- HP: integer (nullable = true)
 |-- Attack: integer (nullable = true)
 |-- Defense: integer (nullable = true)
 |-- Sp. Atk: integer (nullable = true)
 |-- Sp. Def: integer (nullable = true)
 |-- Speed: integer (nullable = true)
 |-- Generation: integer (nullable = true)
 |-- Legendary: boolean (nullable = true)



In [5]:
# Number of rows loaded from pokemon.csv.
pokemon.count()

800

In [6]:
# Snapshot of the original pokemon.csv column names, taken before the frame
# is joined with pokedex.
cols1 = pokemon.columns

In [7]:
from pyspark.sql.functions import countDistinct

# Count the distinct values of Name to compare against the row count.
distinct_names = countDistinct("Name")
df = pokemon.select(distinct_names)
df.show()

+--------------------+
|count(DISTINCT Name)|
+--------------------+
|                 800|
+--------------------+



### File 2: pokedex.csv

In [8]:
# Load pokedex.csv (header row, inferred types) and discard the
# german_name and japanese_name columns.
pokedex = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/pokedex.csv")
    .drop("german_name", "japanese_name")
)
pokedex.printSchema()

root
 |-- pokedex_number: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- generation: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- species: string (nullable = true)
 |-- type_number: integer (nullable = true)
 |-- type_1: string (nullable = true)
 |-- type_2: string (nullable = true)
 |-- height_m: double (nullable = true)
 |-- weight_kg: double (nullable = true)
 |-- abilities_number: integer (nullable = true)
 |-- ability_1: string (nullable = true)
 |-- ability_2: string (nullable = true)
 |-- ability_hidden: string (nullable = true)
 |-- total_points: integer (nullable = true)
 |-- hp: integer (nullable = true)
 |-- attack: integer (nullable = true)
 |-- defense: integer (nullable = true)
 |-- sp_attack: integer (nullable = true)
 |-- sp_defense: integer (nullable = true)
 |-- speed: integer (nullable = true)
 |-- catch_rate: integer (nullable = true)
 |-- base_friendship: integer (nullable = true)
 |-- base_experience: integer (nullable =

In [9]:
# Number of rows loaded from pokedex.csv.
pokedex.count()

1045

### Joining pokemon and pokedex

In [10]:
# Inner join: keep only the rows whose pokemon.csv Name equals a pokedex name.
# NOTE: this rebinds `pokemon` to the joined frame.
name_match = pokemon["Name"] == pokedex["name"]
pokemon = pokemon.join(pokedex, on=name_match, how="inner")

In [11]:
# Row count after the inner join with pokedex.
pokemon.count()

793

In [12]:
# Turn on case sensitivity while dropping columns, so that e.g. "Name" and
# "name" are treated as different columns.
# SQLContext is deprecated since Spark 3.0; the same SET statement is issued
# through the existing SparkSession instead.
spark.sql("set spark.sql.caseSensitive=true")



DataFrame[key: string, value: string]

In [13]:
# Drop repeating columns: every original pokemon.csv column except the first
# and the last one, plus pokedex_number.
cols_to_drop = cols1[1:-1] + ["pokedex_number"]
pokemon = pokemon.drop(*cols_to_drop)

In [14]:
# Schema after dropping the repeated columns.
pokemon.printSchema()

root
 |-- #: integer (nullable = true)
 |-- Legendary: boolean (nullable = true)
 |-- name: string (nullable = true)
 |-- generation: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- species: string (nullable = true)
 |-- type_number: integer (nullable = true)
 |-- type_1: string (nullable = true)
 |-- type_2: string (nullable = true)
 |-- height_m: double (nullable = true)
 |-- weight_kg: double (nullable = true)
 |-- abilities_number: integer (nullable = true)
 |-- ability_1: string (nullable = true)
 |-- ability_2: string (nullable = true)
 |-- ability_hidden: string (nullable = true)
 |-- total_points: integer (nullable = true)
 |-- hp: integer (nullable = true)
 |-- attack: integer (nullable = true)
 |-- defense: integer (nullable = true)
 |-- sp_attack: integer (nullable = true)
 |-- sp_defense: integer (nullable = true)
 |-- speed: integer (nullable = true)
 |-- catch_rate: integer (nullable = true)
 |-- base_friendship: integer (nullable = true)
 |-- base_ex

In [15]:
# Row count after dropping columns.
pokemon.count()

793

In [16]:
from pyspark.sql.functions import countDistinct

# Count the distinct "#" values to compare against the row count.
distinct_ids = countDistinct("#")
df2 = pokemon.select(distinct_ids)
df2.show()

+-----------------+
|count(DISTINCT #)|
+-----------------+
|              793|
+-----------------+



### File 3: combats.csv

In [17]:
# Load combats.csv: the first row supplies the column names and the column
# types are inferred from the data.
combats = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/combats.csv")
)

In [18]:
# Number of rows loaded from combats.csv.
combats.count()

50000

In [19]:
# Show the column names and inferred types of combats.csv.
combats.printSchema()

root
 |-- First_pokemon: integer (nullable = true)
 |-- Second_pokemon: integer (nullable = true)
 |-- Winner: integer (nullable = true)



In [20]:
from pyspark.sql.functions import countDistinct

# Count how many different values appear in First_pokemon.
distinct_first = countDistinct("First_pokemon")
df3 = combats.select(distinct_first)
df3.show()

+-----------------------------+
|count(DISTINCT First_pokemon)|
+-----------------------------+
|                          784|
+-----------------------------+



### Joining Stats of First Pokemon to each combat

In [21]:
# Rename First_pokemon to "#" so that it has the same name as the "#" column
# of `pokemon`.
combats=combats.withColumnRenamed("First_pokemon", "#")
combats.printSchema()

root
 |-- #: integer (nullable = true)
 |-- Second_pokemon: integer (nullable = true)
 |-- Winner: integer (nullable = true)



In [22]:
# Attach the stats of the first Pokemon to each combat (inner join on "#").
poke1 = combats.join(pokemon, "#", "inner")

In [23]:
# Number of combats that remain after the inner join on "#".
poke1.count()

49556

In [24]:
# Preview the first five joined rows.
poke1.show(n=5)

+---+--------------+------+---------+---------+----------+------+------------+-----------+------+------+--------+---------+----------------+---------+---------+--------------+------------+---+------+-------+---------+----------+-----+----------+---------------+---------------+-----------+---------------+----------+----------+---------------+----------+--------------+------------+-------------+----------------+-------------+-----------+-------------+--------------+--------------+--------------+---------------+-----------+------------+-------------+--------------+------------+-------------+-------------+
|  #|Second_pokemon|Winner|Legendary|     name|generation|status|     species|type_number|type_1|type_2|height_m|weight_kg|abilities_number|ability_1|ability_2|ability_hidden|total_points| hp|attack|defense|sp_attack|sp_defense|speed|catch_rate|base_friendship|base_experience|growth_rate|egg_type_number|egg_type_1|egg_type_2|percentage_male|egg_cycles|against_normal|against_fire|against_

In [25]:
# Checking: number of combats whose "#" column equals 1.
id_is_one = combats["#"] == 1
combats.filter(id_is_one).count()

70

In [26]:
# Same check on the joined frame; the count should match the one from
# `combats`. The filter uses poke1's own "#" column: the previous
# pokemon["#"] was a column object of a different DataFrame and only worked
# if Spark could resolve it against poke1.
poke1.filter(poke1["#"] == 1).count()

70

### Joining Stats of second pokemon

In [27]:
# Same stats table as `pokemon`, but with the "#" column renamed to
# "Second_pokemon".
poke2 = pokemon.withColumnRenamed("#", "Second_pokemon")
poke2.printSchema()

root
 |-- Second_pokemon: integer (nullable = true)
 |-- Legendary: boolean (nullable = true)
 |-- name: string (nullable = true)
 |-- generation: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- species: string (nullable = true)
 |-- type_number: integer (nullable = true)
 |-- type_1: string (nullable = true)
 |-- type_2: string (nullable = true)
 |-- height_m: double (nullable = true)
 |-- weight_kg: double (nullable = true)
 |-- abilities_number: integer (nullable = true)
 |-- ability_1: string (nullable = true)
 |-- ability_2: string (nullable = true)
 |-- ability_hidden: string (nullable = true)
 |-- total_points: integer (nullable = true)
 |-- hp: integer (nullable = true)
 |-- attack: integer (nullable = true)
 |-- defense: integer (nullable = true)
 |-- sp_attack: integer (nullable = true)
 |-- sp_defense: integer (nullable = true)
 |-- speed: integer (nullable = true)
 |-- catch_rate: integer (nullable = true)
 |-- base_friendship: integer (nullable = true)

In [28]:
#Renaming columns for Second Pokemon Stats
# The names are derived from `pokemon` (not from the current `poke2`) and
# applied with a single toDF call. Re-running this cell therefore cannot
# append "_2" a second time, and no per-column withColumnRenamed loop is
# needed.
cols2 = ["Second_pokemon" if c == "#" else c for c in pokemon.columns]
# Every column except the Second_pokemon key gets the "_2" suffix.
new_cols2 = [c if c == "Second_pokemon" else c + "_2" for c in cols2]

poke2 = pokemon.toDF(*new_cols2)

poke2.printSchema()

root
 |-- Second_pokemon: integer (nullable = true)
 |-- Legendary_2: boolean (nullable = true)
 |-- name_2: string (nullable = true)
 |-- generation_2: integer (nullable = true)
 |-- status_2: string (nullable = true)
 |-- species_2: string (nullable = true)
 |-- type_number_2: integer (nullable = true)
 |-- type_1_2: string (nullable = true)
 |-- type_2_2: string (nullable = true)
 |-- height_m_2: double (nullable = true)
 |-- weight_kg_2: double (nullable = true)
 |-- abilities_number_2: integer (nullable = true)
 |-- ability_1_2: string (nullable = true)
 |-- ability_2_2: string (nullable = true)
 |-- ability_hidden_2: string (nullable = true)
 |-- total_points_2: integer (nullable = true)
 |-- hp_2: integer (nullable = true)
 |-- attack_2: integer (nullable = true)
 |-- defense_2: integer (nullable = true)
 |-- sp_attack_2: integer (nullable = true)
 |-- sp_defense_2: integer (nullable = true)
 |-- speed_2: integer (nullable = true)
 |-- catch_rate_2: integer (nullable = true)
 |-

In [29]:
# Attach the "_2"-suffixed stats of the second Pokemon to each combat
# (inner join on "Second_pokemon").
poke_df = poke1.join(poke2, "Second_pokemon", "inner")

In [30]:
# Number of combats that remain after the second inner join.
poke_df.count()

49161

In [31]:
# Preview the first five rows of the fully joined frame.
poke_df.show(n=5)

+--------------+---+------+---------+----------------+----------+------+-------------+-----------+------+------+--------+---------+----------------+-----------+---------+--------------+------------+---+------+-------+---------+----------+-----+----------+---------------+---------------+-----------+---------------+----------+----------+---------------+----------+--------------+------------+-------------+----------------+-------------+-----------+-------------+--------------+--------------+--------------+---------------+-----------+------------+-------------+--------------+------------+-------------+-------------+-----------+---------+------------+--------+-----------------+-------------+--------+--------+----------+-----------+------------------+------------+-----------+----------------+--------------+----+--------+---------+-----------+------------+-------+------------+-----------------+-----------------+-------------+-----------------+------------+------------+-----------------+------

In [32]:
#Winner=1 if First Pokemon is the Winner, 0 Otherwise
# NOTE: this overwrites "Winner" in place, so a second run would compare the
# 0/1 flag (not the winner's id) with "#".
from pyspark.sql.functions import when

first_won = poke_df["Winner"] == poke_df["#"]
winner_flag = when(first_won, 1).otherwise(0)
poke_df = poke_df.withColumn("Winner", winner_flag)

In [33]:
# Rename the "#" column to First_pokemon.
poke_df=poke_df.withColumnRenamed("#", "First_pokemon")

In [34]:
# Preview the first five rows after recoding Winner and renaming "#".
poke_df.show(n=5)

+--------------+-------------+------+---------+----------------+----------+------+-------------+-----------+------+------+--------+---------+----------------+-----------+---------+--------------+------------+---+------+-------+---------+----------+-----+----------+---------------+---------------+-----------+---------------+----------+----------+---------------+----------+--------------+------------+-------------+----------------+-------------+-----------+-------------+--------------+--------------+--------------+---------------+-----------+------------+-------------+--------------+------------+-------------+-------------+-----------+---------+------------+--------+-----------------+-------------+--------+--------+----------+-----------+------------------+------------+-----------+----------------+--------------+----+--------+---------+-----------+------------+-------+------------+-----------------+-----------------+-------------+-----------------+------------+------------+--------------

### Writing dataframe to a csv for EDA

In [35]:
# Convert to pandas and write one CSV file with a header row and no index
# column.
# Spark-native alternative, kept for reference:
# poke_df.write.option("header",True).csv("data/final_data.csv")
final_pdf = poke_df.toPandas()
final_pdf.to_csv("data/final_data.csv", header=True, index=False)

### Preparing training data for prediction

In [37]:
# Remove the id and name columns from the training set.
# "#" has already been renamed to "First_pokemon", and DataFrame.drop silently
# ignores column names that do not exist, so the previous drop("#") left the
# first Pokemon's id in the training data.
training_data = poke_df.drop("Second_pokemon", "First_pokemon", "name", "name_2")

In [38]:
# Convert to pandas and write one CSV file with a header row and no index
# column.
# Spark-native alternative, kept for reference:
# training_data.write.option("header",True).csv("data/training_data.csv")
training_pdf = training_data.toPandas()
training_pdf.to_csv("data/training_data.csv", header=True, index=False)