# 06 — Pandas Power Skills (1DownLabs)

Focus:
- selecting & filtering
- sorting
- groupby aggregations
- joins/merges
- pivot tables
- handling missing values
- building a mini analytics report

In [1]:
import sys
from pathlib import Path

# Resolve the directory one level above the current working directory and
# treat it as the project root; the data path further down is built from it.
project_root = Path("..").resolve()

# Expose the project root to the import system. The membership check keeps the
# cell idempotent: re-running it does not append a duplicate sys.path entry.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

import pandas as pd

# Location of the raw sales CSV inside the project tree.
raw_path = project_root.joinpath("data", "raw", "vgsales.csv")

# Load the whole file into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=raw_path)
df.head(n=5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [4]:
# Dimensions of the loaded frame, reported as a (rows, columns) tuple.
df.shape

(16598, 11)

In [5]:
# List the column labels exactly as they were read from the CSV header.
df.columns

Index(['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'],
      dtype='str')

In [6]:
# Show the dtype pandas inferred for each column.

df.dtypes

Rank              int64
Name                str
Platform            str
Year            float64
Genre               str
Publisher           str
NA_Sales        float64
EU_Sales        float64
JP_Sales        float64
Other_Sales     float64
Global_Sales    float64
dtype: object

In [7]:
# Missing-value count per column, columns with the most gaps listed first.
(
    df
    .isnull()
    .sum(axis=0)
    .sort_values(ascending=False)
)

Year            271
Publisher        58
Rank              0
Platform          0
Name              0
Genre             0
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

In [8]:
# Normalise every header: trim surrounding whitespace, lower-case it,
# and turn inner spaces into underscores.
df.columns = [
    label.strip().lower().replace(" ", "_")
    for label in df.columns
]

df.columns

Index(['rank', 'name', 'platform', 'year', 'genre', 'publisher', 'na_sales',
       'eu_sales', 'jp_sales', 'other_sales', 'global_sales'],
      dtype='str')

In [9]:
# Preview the first ten rows, restricted to a handful of descriptive columns.

df.loc[:, ['rank', 'name', 'platform', 'year', 'genre']].head(n=10)

Unnamed: 0,rank,name,platform,year,genre
0,1,Wii Sports,Wii,2006.0,Sports
1,2,Super Mario Bros.,NES,1985.0,Platform
2,3,Mario Kart Wii,Wii,2008.0,Racing
3,4,Wii Sports Resort,Wii,2009.0,Sports
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing
5,6,Tetris,GB,1989.0,Puzzle
6,7,New Super Mario Bros.,DS,2006.0,Platform
7,8,Wii Play,Wii,2006.0,Misc
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform
9,10,Duck Hunt,NES,1984.0,Shooter


In [12]:
# Keep the games whose global_sales value exceeds 2
# (the notebook treats this unit as millions).
high_sales = df.loc[df['global_sales'].gt(2)]

# Ten best sellers within that subset, highest global sales first.
(
    high_sales
    .loc[:, ['name', 'platform', 'year', 'genre', 'publisher', 'global_sales']]
    .sort_values(by='global_sales', ascending=False)
    .head(n=10)
)

Unnamed: 0,name,platform,year,genre,publisher,global_sales
0,Wii Sports,Wii,2006.0,Sports,Nintendo,82.74
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,40.24
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,35.82
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,33.0
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,31.37
5,Tetris,GB,1989.0,Puzzle,Nintendo,30.26
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,30.01
7,Wii Play,Wii,2006.0,Misc,Nintendo,29.02
8,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,28.62
9,Duck Hunt,NES,1984.0,Shooter,Nintendo,28.31


In [19]:
# Shape of the high-sales subset; the first element is the number of
# records with global_sales > 2.
high_sales.shape

(846, 11)

In [14]:
# Sorting + top-N.
# Note: `top_10` holds the *entire* frame ordered by global sales (descending);
# only the display below is limited to ten rows.

top_10 = df.sort_values(by='global_sales', ascending=False)
top_10.head(n=10).loc[:, ['name', 'platform', 'year', 'genre', 'publisher', 'global_sales']]

Unnamed: 0,name,platform,year,genre,publisher,global_sales
0,Wii Sports,Wii,2006.0,Sports,Nintendo,82.74
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,40.24
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,35.82
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,33.0
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,31.37
5,Tetris,GB,1989.0,Puzzle,Nintendo,30.26
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,30.01
7,Wii Play,Wii,2006.0,Misc,Nintendo,29.02
8,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,28.62
9,Duck Hunt,NES,1984.0,Shooter,Nintendo,28.31


In [20]:
# Number of rows with no value in the `year` column.

df['year'].isnull().sum()

np.int64(271)

In [21]:
# Build a cleaned copy: discard rows lacking a year, then store the year as an
# integer (it was read as float because of the missing values).
df_clean = (
    df
    .dropna(subset=["year"])
    .astype({"year": int})
)
df_clean.shape

(16327, 11)

In [25]:
# Per-publisher summary: summed global sales and number of titles,
# ranked from the highest total downwards.

top_publishers = (
    df_clean
    .groupby('publisher', as_index=False)
    .agg(
        total_sales=pd.NamedAgg(column='global_sales', aggfunc='sum'),
        game_count=pd.NamedAgg(column='name', aggfunc='count'),
    )
    .sort_values(by='total_sales', ascending=False)
)

top_publishers.head(n=10)

Unnamed: 0,publisher,total_sales,game_count
359,Nintendo,1784.43,696
138,Electronic Arts,1093.39,1339
21,Activision,721.41,966
455,Sony Computer Entertainment,607.28,682
524,Ubisoft,473.54,918
493,Take-Two Interactive,399.3,412
487,THQ,340.44,712
275,Konami Digital Entertainment,278.56,823
445,Sega,270.7,632
347,Namco Bandai Games,253.65,928


In [26]:
# Yearly totals for each regional sales column and for the global figure.
# Each output column is named total_<region>_sales and sums <region>_sales.
yearly_sales = (
    df_clean
    .groupby('year', as_index=False)
    .agg(**{
        f"total_{region}_sales": (f"{region}_sales", 'sum')
        for region in ('na', 'eu', 'jp', 'other', 'global')
    })
    .sort_values(by='year', ascending=True)
)

yearly_sales

Unnamed: 0,year,total_na_sales,total_eu_sales,total_jp_sales,total_other_sales,total_global_sales
0,1980,10.59,0.67,0.0,0.12,11.38
1,1981,33.4,1.96,0.0,0.32,35.77
2,1982,26.92,1.65,0.0,0.31,28.86
3,1983,7.76,0.8,8.1,0.14,16.79
4,1984,33.28,2.1,14.27,0.7,50.36
5,1985,33.73,4.74,14.56,0.92,53.94
6,1986,12.5,2.84,19.81,1.93,37.07
7,1987,8.46,1.41,11.63,0.2,21.74
8,1988,23.87,6.59,15.76,0.99,47.22
9,1989,45.15,8.44,18.36,1.5,73.45
