Pokemon Dataset

In [1]:
import pandas as pd

# Load the Pokemon dataset from a CSV file.
# Other readers work the same way: pd.read_excel('path.xlsx') for Excel files,
# or pd.read_csv('path.txt', delimiter='\t') for tab-delimited text.
df = pd.read_csv('pokemon_data.csv')

# Show the first three rows as a quick look at the loaded frame.
print(df.head(n=3))

   #       Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  \
0  1  Bulbasaur  Grass  Poison  45      49       49       65       65     45   
1  2    Ivysaur  Grass  Poison  60      62       63       80       80     60   
2  3   Venusaur  Grass  Poison  80      82       83      100      100     80   

   Generation  Legendary  
0           1      False  
1           1      False  
2           1      False  


Reading Data using Pandas

In [2]:
# Column headers
print('Column Names')
print(df.columns)

# One column: select it by label, then keep its first 5 rows.
# df.Name is an equivalent attribute-style spelling of df['Name'].
print('\nFirst 5 rows of Name Column')
print(df['Name'].head(5))

# Several columns: pass a list of labels (note the double brackets) instead of one string.
print('\n3 Columns')
print(df[['Name', 'Type 1', 'HP']].head(3))

# Rows by integer position: iloc[:3] returns rows 0, 1 and 2.
print('\nUsing Integer location function')
print(df.iloc[:3])


Column Names
Index(['#', 'Name', 'Type 1', 'Type 2', 'HP', 'Attack', 'Defense', 'Sp. Atk',
       'Sp. Def', 'Speed', 'Generation', 'Legendary'],
      dtype='object')

First 5 rows of Name Column
0                Bulbasaur
1                  Ivysaur
2                 Venusaur
3    VenusaurMega Venusaur
4               Charmander
Name: Name, dtype: object

3 Columns
        Name Type 1  HP
0  Bulbasaur  Grass  45
1    Ivysaur  Grass  60
2   Venusaur  Grass  80

Using Integer location function
   #       Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  \
0  1  Bulbasaur  Grass  Poison  45      49       49       65       65     45   
1  2    Ivysaur  Grass  Poison  60      62       63       80       80     60   
2  3   Venusaur  Grass  Poison  80      82       83      100      100     80   

   Generation  Legendary  
0           1      False  
1           1      False  
2           1      False  


loc Function and Iterating through Rows

In [3]:
# iloc selects by integer position; loc selects by label or by a boolean mask.

# Read a single cell. iloc takes (row, column) in that order, so [2, 1] is
# row 2 of column 1 (the 'Name' column) -- not column 2, row 1.
print('\niloc function Row 2, Column 1')
print(df.iloc[2, 1])

# Filtering rows: df['Type 1'] == "Fire" produces one True/False value per row,
# and df.loc keeps only the rows where that value is True.
print("\nloc function")
is_fire = df['Type 1'] == "Fire"
print(df.loc[is_fire].head(2))  # only the first 2 matches, to keep the output short

print("\nItterate through rows function")
# The loop is left commented out because it would print every row of the frame.
#for index, row in df.iterrows():
#    print(index, row) # instead of printing the index and row, you can also compute something from the columns of each row


iloc function Column 2, Row 1
Venusaur

loc function
   #        Name Type 1 Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  \
4  4  Charmander   Fire    NaN  39      52       43       60       50     65   
5  5  Charmeleon   Fire    NaN  58      64       58       80       65     80   

   Generation  Legendary  
4           1      False  
5           1      False  

Itterate through rows function


Sorting/Describing Data

In [4]:
# describe() reports count, mean, std, min, quartiles and max per column.
print("useful function for general stats on data")
summary_stats = df.describe()
print(summary_stats)
print("\n")

# Sort by a single column, Z -> A, and show the top 2 rows.
print("Sorting Descending")
by_name_desc = df.sort_values(by='Name', ascending=False)
print(by_name_desc.head(2))
print("\n")

# Sort by several columns with one direction flag per column:
# 'Type 1' ascending, then 'HP' descending within each type. Top 2 rows only.
print("Sorting multiple columns")
by_type_then_hp = df.sort_values(by=['Type 1', 'HP'], ascending=[True, False])
print(by_type_then_hp.head(2))
print("\n")

useful function for general stats on data
                #          HP      Attack     Defense     Sp. Atk     Sp. Def  \
count  800.000000  800.000000  800.000000  800.000000  800.000000  800.000000   
mean   362.813750   69.258750   79.001250   73.842500   72.820000   71.902500   
std    208.343798   25.534669   32.457366   31.183501   32.722294   27.828916   
min      1.000000    1.000000    5.000000    5.000000   10.000000   20.000000   
25%    184.750000   50.000000   55.000000   50.000000   49.750000   50.000000   
50%    364.500000   65.000000   75.000000   70.000000   65.000000   70.000000   
75%    539.250000   80.000000  100.000000   90.000000   95.000000   90.000000   
max    721.000000  255.000000  190.000000  230.000000  194.000000  230.000000   

            Speed  Generation  
count  800.000000   800.00000  
mean    68.277500     3.32375  
std     29.060474     1.66129  
min      5.000000     1.00000  
25%     45.000000     2.00000  
50%     65.000000     3.00000  
75% 

Making Changes to the data


In [5]:
# Build an overall stat by adding the six base-stat columns, selected by label.
df['Total'] = df['HP'] + df['Attack'] + df['Defense'] + df['Sp. Atk'] + df['Sp. Def'] + df['Speed']

print(df.head(3))
print("\n")

# Remove the column again so the next example starts from the original layout.
df = df.drop(columns = ['Total'])
print(df.head(3))
print("\n")

# Same column built with iloc: all rows, column positions 4 through 9.
# This relies on HP..Speed sitting at positions 4-9 in the loaded file.
# axis=1 sums across each row (horizontally); axis=0 would sum down each column.
df['Total'] = df.iloc[:, 4:10].sum(axis=1)
print(df.head(3))
print("\n")

# Move the new column (currently last) so it sits right after the first four columns.
cols = list(df.columns) # column labels as a plain list
# cols[4:-1] means "everything between the first four columns and the last one",
# so no column is silently dropped if the frame has more columns than expected.
df = df[cols[0:4] + [cols[-1]] + cols[4:-1]]
print(df.head(3))

   #       Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  \
0  1  Bulbasaur  Grass  Poison  45      49       49       65       65     45   
1  2    Ivysaur  Grass  Poison  60      62       63       80       80     60   
2  3   Venusaur  Grass  Poison  80      82       83      100      100     80   

   Generation  Legendary  Total  
0           1      False    318  
1           1      False    405  
2           1      False    525  


   #       Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  \
0  1  Bulbasaur  Grass  Poison  45      49       49       65       65     45   
1  2    Ivysaur  Grass  Poison  60      62       63       80       80     60   
2  3   Venusaur  Grass  Poison  80      82       83      100      100     80   

   Generation  Legendary  
0           1      False  
1           1      False  
2           1      False  


   #       Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  \
0  1  Bulbasaur  Grass  Poison

Saving the modified data to a file (CSV, Excel, or tab-delimited text)

In [6]:
# df.to_csv('modified_data.csv', index = False) # Writes to a csv file; index = False skips writing the DataFrame index (we already have a column for the index)

# can also use df.to_excel('modified_data.xlsx', index = False)
# can also use df.to_csv('modified_data.txt', index = False, sep = '\t'). The default separator is a comma.