# 06 — Pandas Power Skills (1DownLabs)

Focus:
- selecting & filtering
- sorting
- groupby aggregations
- joins/merges
- pivot tables
- handling missing values
- building a mini analytics report

In [1]:
import sys
from pathlib import Path

project_root = Path("..").resolve()
sys.path.append(str(project_root))

import pandas as pd

raw_path = project_root / "data" / "raw" / "vgsales.csv"
df = pd.read_csv(raw_path)

df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [4]:
# Shape of the dataframe
df.shape

(16598, 11)

In [5]:
# Check Columns
df.columns

Index(['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'],
      dtype='str')

In [6]:
# Check data type of columns

df.dtypes

Rank              int64
Name                str
Platform            str
Year            float64
Genre               str
Publisher           str
NA_Sales        float64
EU_Sales        float64
JP_Sales        float64
Other_Sales     float64
Global_Sales    float64
dtype: object

In [7]:
# Check for null values
df.isna().sum().sort_values(ascending=False)

Year            271
Publisher        58
Rank              0
Platform          0
Name              0
Genre             0
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

In [8]:
# Clean column names
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

df.columns

Index(['rank', 'name', 'platform', 'year', 'genre', 'publisher', 'na_sales',
       'eu_sales', 'jp_sales', 'other_sales', 'global_sales'],
      dtype='str')

In [9]:
# Check the top 10 rows of selected columns ['rank', 'name', 'platform', 'year', 'genre']

df[['rank', 'name', 'platform', 'year', 'genre']].head(10)

Unnamed: 0,rank,name,platform,year,genre
0,1,Wii Sports,Wii,2006.0,Sports
1,2,Super Mario Bros.,NES,1985.0,Platform
2,3,Mario Kart Wii,Wii,2008.0,Racing
3,4,Wii Sports Resort,Wii,2009.0,Sports
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing
5,6,Tetris,GB,1989.0,Puzzle
6,7,New Super Mario Bros.,DS,2006.0,Platform
7,8,Wii Play,Wii,2006.0,Misc
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform
9,10,Duck Hunt,NES,1984.0,Shooter


In [12]:
#  Check top 10 games with global_sales > 2 million

high_sales = df[df['global_sales'] > 2]
high_sales[['name', 'platform', 'year', 'genre', 'publisher', 'global_sales']].sort_values('global_sales',ascending=False).head(10)

Unnamed: 0,name,platform,year,genre,publisher,global_sales
0,Wii Sports,Wii,2006.0,Sports,Nintendo,82.74
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,40.24
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,35.82
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,33.0
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,31.37
5,Tetris,GB,1989.0,Puzzle,Nintendo,30.26
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,30.01
7,Wii Play,Wii,2006.0,Misc,Nintendo,29.02
8,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,28.62
9,Duck Hunt,NES,1984.0,Shooter,Nintendo,28.31


In [19]:
# Count number of records > 2 million
high_sales.shape

(846, 11)

In [14]:
# Sorting + top-N

top_10 = df.sort_values('global_sales', ascending= False)
top_10[['name', 'platform', 'year', 'genre', 'publisher', 'global_sales']].head(10)

Unnamed: 0,name,platform,year,genre,publisher,global_sales
0,Wii Sports,Wii,2006.0,Sports,Nintendo,82.74
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,40.24
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,35.82
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,33.0
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,31.37
5,Tetris,GB,1989.0,Puzzle,Nintendo,30.26
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,30.01
7,Wii Play,Wii,2006.0,Misc,Nintendo,29.02
8,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,28.62
9,Duck Hunt,NES,1984.0,Shooter,Nintendo,28.31


In [20]:
# How many missing years?

df['year'].isna().sum()

np.int64(271)

In [21]:
# Keep only rows with a valid year
df_clean = df.dropna(subset=["year"]).copy()
df_clean["year"] = df_clean["year"].astype(int)
df_clean.shape

(16327, 11)

In [25]:
# Group by top publishers by global sales

top_publishers = (df_clean.groupby('publisher', as_index=False)
                    .agg(total_sales = ('global_sales', 'sum'),
                    game_count = ('name', 'count'))
                    .sort_values('total_sales', ascending=False)
            )

top_publishers.head(10)

Unnamed: 0,publisher,total_sales,game_count
359,Nintendo,1784.43,696
138,Electronic Arts,1093.39,1339
21,Activision,721.41,966
455,Sony Computer Entertainment,607.28,682
524,Ubisoft,473.54,918
493,Take-Two Interactive,399.3,412
487,THQ,340.44,712
275,Konami Digital Entertainment,278.56,823
445,Sega,270.7,632
347,Namco Bandai Games,253.65,928


In [28]:
# Multi aggregation by year

yearly_sales = (df_clean.groupby('year', as_index=False)
                    .agg(total_na_sales = ('na_sales', 'sum'),
                    total_eu_sales = ('eu_sales', 'sum'),
                    total_jp_sales = ('jp_sales', 'sum'),
                    total_other_sales = ('other_sales', 'sum'),
                    total_global_sales = ('global_sales', 'sum'))
                    .sort_values('year', ascending=True)
                    )

yearly_sales.head()

Unnamed: 0,year,total_na_sales,total_eu_sales,total_jp_sales,total_other_sales,total_global_sales
0,1980,10.59,0.67,0.0,0.12,11.38
1,1981,33.4,1.96,0.0,0.32,35.77
2,1982,26.92,1.65,0.0,0.31,28.86
3,1983,7.76,0.8,8.1,0.14,16.79
4,1984,33.28,2.1,14.27,0.7,50.36


In [29]:
# Pivot Sales : Genre Sales by Year

genre_by_year = pd.pivot_table(
    df_clean,
    index="year",
    columns="genre",
    values="global_sales",
    aggfunc="sum",
    fill_value=0
)

genre_by_year.head()

genre,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980,0.34,0.0,0.77,2.71,0.0,0.0,0.0,0.0,7.07,0.0,0.49,0.0
1981,14.84,0.0,0.0,0.0,6.93,2.24,0.48,0.0,10.04,0.45,0.79,0.0
1982,6.52,0.0,0.0,0.87,5.03,10.03,1.57,0.0,3.79,0.0,1.05,0.0
1983,2.86,0.4,0.0,2.14,6.93,0.78,0.0,0.0,0.48,0.0,3.2,0.0
1984,1.85,0.0,0.0,1.45,0.69,3.14,5.95,0.0,31.1,0.0,6.18,0.0


In [30]:
# “Top genre each year”

top_genre_each_year = (
    genre_by_year
    .idxmax(axis=1)
    .to_frame("top_genre")
    .join(genre_by_year.max(axis=1).to_frame("top_genre_sales"))
    .reset_index()
)

top_genre_each_year.head()

Unnamed: 0,year,top_genre,top_genre_sales
0,1980,Shooter,7.07
1,1981,Action,14.84
2,1982,Puzzle,10.03
3,1983,Platform,6.93
4,1984,Shooter,31.1


In [50]:
# Merge/join
# Let’s create a “publisher tier” table and join it.

publisher_tier = top_publishers.head(20)[["publisher", "total_sales"]].copy()

publisher_tier["tier"] = "Top 20"

publisher_tier

Unnamed: 0,publisher,total_sales,tier
359,Nintendo,1784.43,Top 20
138,Electronic Arts,1093.39,Top 20
21,Activision,721.41,Top 20
455,Sony Computer Entertainment,607.28,Top 20
524,Ubisoft,473.54,Top 20
493,Take-Two Interactive,399.3,Top 20
487,THQ,340.44,Top 20
275,Konami Digital Entertainment,278.56,Top 20
445,Sega,270.7,Top 20
347,Namco Bandai Games,253.65,Top 20


In [51]:
df_clean.head()

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
0,1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [60]:
df_clean_merged = df_clean.merge(publisher_tier[["tier","publisher"]], how='left', on='publisher')
# df_clean_merged["tier"] = df_clean_merged["tier"].fillna('Other')
df_clean_merged.head()

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales,tier
0,1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,Top 20
1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,Top 20
2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,Top 20
3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0,Top 20
4,5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,Top 20


In [69]:
'''Practice 1 — Top 10 platforms by global sales

Output columns:
                platform
                total_global_sales
                game_count'''

top_10_by_global_sales = (df_clean.groupby('platform')
                            .agg(total_global_sales = ("global_sales","sum"),
                            game_count = ("name", "count"))
                            .sort_values('platform', ascending=False))
                            

top_10_by_global_sales = top_10_by_global_sales.sort_values('total_global_sales', ascending=False).head(10)

top_10_by_global_sales


Unnamed: 0_level_0,total_global_sales,game_count
platform,Unnamed: 1_level_1,Unnamed: 2_level_1
PS2,1233.46,2127
X360,969.61,1235
PS3,949.35,1304
Wii,909.81,1290
DS,818.96,2133
PS,727.39,1189
GBA,313.56,811
PSP,291.71,1197
PS4,278.1,336
PC,255.05,943


In [83]:
'''Practice 2 — Best selling game per year

For each year, find the game with max global_sales.

Output columns:

year

name

platform

global_sales'''

df_clean['sales_rank'] =  (df_clean.groupby('year')['global_sales'].rank(method='min', ascending=False))

df_clean[df_clean['sales_rank'] == 1].sort_values('year')

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales,sales_rank
258,259,Asteroids,2600,1980,Shooter,Atari,4.0,0.26,0.0,0.05,4.31,1.0
239,240,Pitfall!,2600,1981,Platform,Activision,4.21,0.24,0.0,0.05,4.5,1.0
89,90,Pac-Man,2600,1982,Puzzle,Atari,7.28,0.45,0.0,0.08,7.81,1.0
421,422,Baseball,NES,1983,Sports,Nintendo,0.73,0.1,2.35,0.02,3.2,1.0
9,10,Duck Hunt,NES,1984,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31,1.0
1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,1.0
127,128,The Legend of Zelda,NES,1986,Action,Nintendo,3.74,0.93,1.69,0.14,6.51,1.0
251,252,Zelda II: The Adventure of Link,NES,1987,Adventure,Nintendo,2.19,0.5,1.61,0.08,4.38,1.0
22,23,Super Mario Bros. 3,NES,1988,Platform,Nintendo,9.54,3.44,3.84,0.46,17.28,1.0
5,6,Tetris,GB,1989,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26,1.0


In [98]:
'''Practice 3 — Regional mix

For the top 10 publishers by global sales, compute:
%NA
%EU
%JP
%Other'''

top_10_publisher = (df_clean.groupby('publisher').agg(total_global_sales = ("global_sales" , "sum"),
                                                        total_na_sales = ("na_sales" , "sum"),
                                                        total_eu_sales = ('eu_sales' , 'sum'),
                                                        total_jp_sales = ('jp_sales', 'sum'),
                                                        total_other_sales = ('other_sales', 'sum'))
                                                .sort_values('total_global_sales', ascending=False).head(10))
top_10_publisher

Unnamed: 0_level_0,total_global_sales,total_na_sales,total_eu_sales,total_jp_sales,total_other_sales
publisher,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nintendo,1784.43,815.75,418.3,454.99,95.19
Electronic Arts,1093.39,584.22,367.38,13.98,127.63
Activision,721.41,426.01,213.72,6.54,74.79
Sony Computer Entertainment,607.28,265.22,187.55,74.1,80.4
Ubisoft,473.54,252.81,163.03,7.33,50.16
Take-Two Interactive,399.3,220.47,117.95,5.83,55.2
THQ,340.44,208.6,94.6,5.01,32.11
Konami Digital Entertainment,278.56,88.91,68.62,90.93,29.91
Sega,270.7,108.78,81.41,56.19,24.3
Namco Bandai Games,253.65,69.38,42.61,126.84,14.64


In [100]:
top_10_publisher['%_NA'] = top_10_publisher['total_na_sales'] / top_10_publisher['total_global_sales'] 
top_10_publisher['%_EU'] = top_10_publisher['total_eu_sales'] / top_10_publisher['total_global_sales'] 
top_10_publisher['%_JP'] = top_10_publisher['total_jp_sales'] / top_10_publisher['total_global_sales'] 
top_10_publisher['%_Other'] = top_10_publisher['total_other_sales'] / top_10_publisher['total_global_sales'] 

top_10_publisher = top_10_publisher.reset_index()[['publisher' ,'%_NA', '%_EU', '%_Other', '%_JP']]

In [101]:
top_10_publisher

Unnamed: 0,publisher,%_NA,%_EU,%_Other,%_JP
0,Nintendo,0.457149,0.234417,0.053345,0.254978
1,Electronic Arts,0.53432,0.336001,0.116729,0.012786
2,Activision,0.590524,0.296253,0.103672,0.009066
3,Sony Computer Entertainment,0.436734,0.308836,0.132394,0.122019
4,Ubisoft,0.533873,0.344279,0.105926,0.015479
5,Take-Two Interactive,0.552141,0.295392,0.138242,0.014601
6,THQ,0.612736,0.277876,0.094319,0.014716
7,Konami Digital Entertainment,0.319177,0.246338,0.107374,0.326429
8,Sega,0.401847,0.300739,0.089767,0.207573
9,Namco Bandai Games,0.273527,0.167987,0.057717,0.500059


## Summary

This notebook covered core Pandas skills:
- filtering, sorting, and selecting columns
- groupby aggregations and multi-metrics
- pivot tables and derived insights
- merges/joins
- saving processed outputs + writing reports
