In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Set random seed for reproducibility
np.random.seed(42)

# Read the dataset
df = pd.read_csv('../data/pokemon.csv')

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

print("\nColumns in our dataset:")
print(df.columns.tolist())

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Let's look at the distribution of our numerical columns
numeric_columns = df.select_dtypes(include=[np.number]).columns
print("\nNumerical columns:")
print(numeric_columns)

# Basic statistics of numerical columns
print("\nBasic statistics:")
print(df[numeric_columns].describe())

# Split data into train and test (75/25 split)
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)

print("\nTraining set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

# Create visualizations of Pokemon stats
plt.figure(figsize=(12, 6))
df[['hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed']].boxplot()
plt.title('Distribution of Pokemon Base Stats')
plt.ylabel('Stat Value')
plt.xticks(rotation=45)
plt.show()

# Let's also create a correlation heatmap for the stats
plt.figure(figsize=(10, 8))
stats_cols = ['hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed']
correlation_matrix = df[stats_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation between Pokemon Stats')
plt.tight_layout()
plt.show()

# Distribution of Pokemon types
plt.figure(figsize=(12, 6))
df['type1'].value_counts().plot(kind='bar')
plt.title('Distribution of Primary Pokemon Types')
plt.xlabel('Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Legendary vs Non-Legendary Pokemon stats comparison
plt.figure(figsize=(12, 6))
df.groupby('is_legendary')[stats_cols].mean().plot(kind='bar')
plt.title('Average Stats: Legendary vs Non-Legendary Pokemon')
plt.xlabel('Legendary Status')
plt.ylabel('Average Stat Value')
plt.xticks(rotation=0)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

Dataset Shape: (801, 41)

First few rows:
                     abilities  against_bug  against_dark  against_dragon  \
0  ['Overgrow', 'Chlorophyll']          1.0           1.0             1.0   
1  ['Overgrow', 'Chlorophyll']          1.0           1.0             1.0   
2  ['Overgrow', 'Chlorophyll']          1.0           1.0             1.0   
3     ['Blaze', 'Solar Power']          0.5           1.0             1.0   
4     ['Blaze', 'Solar Power']          0.5           1.0             1.0   

   against_electric  against_fairy  against_fight  against_fire  \
0               0.5            0.5            0.5           2.0   
1               0.5            0.5            0.5           2.0   
2               0.5            0.5            0.5           2.0   
3               1.0            0.5            1.0           0.5   
4               1.0            0.5            1.0           0.5   

   against_flying  against_ghost  ...  percentage_male  pokedex_number  \
0             2.0 

KeyError: "None of [Index(['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed'], dtype='object')] are in the [columns]"

<Figure size 1200x600 with 0 Axes>