In [2]:
#import libraries
import pandas as pd


### Read the Data

In [3]:
# Load the Steam games CSV into a DataFrame and preview the first rows
steam_csv_path = "../data/steam_data.csv"
data = pd.read_csv(steam_csv_path)
data.head()

Unnamed: 0,appid,name,release_year,release_date,genres,categories,price,recommendations,developer,publisher
0,3057270,Seafarer's Gambit,2024,"Jul 5, 2024",Action;Adventure;Indie;RPG;Strategy,Single-player;Family Sharing,3.99,0,Bouncy Rocket Studios,Bouncy Rocket Studios
1,3822840,Capitalist Misadventures,2025,"Jul 25, 2025",Casual;Indie;Simulation;Strategy,Single-player;Save Anytime;Family Sharing,7.99,0,Caramelo Studios,Caramelo Studios
2,3216640,The Beast and the Princess,2025,"Jun 17, 2025",Adventure;Indie;Strategy,Single-player;Steam Achievements;Full controll...,12.99,0,Libragames,Libragames
3,2403620,Air Twister,2023,"Nov 10, 2023",Action;Adventure;Indie,Single-player;Steam Achievements;Full controll...,24.99,0,YS Net,ININ
4,1538040,Horde Slayer,2021,"Mar 19, 2021",Action;Adventure;Casual;Indie;RPG;Early Access,Single-player;Steam Achievements;Full controll...,3.99,0,Wagner Rodrigues,Wagner Rodrigues


In [4]:
# Summarise every column: non-null counts (to spot missing values) and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65521 entries, 0 to 65520
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   appid            65521 non-null  int64  
 1   name             65521 non-null  object 
 2   release_year     65521 non-null  int64  
 3   release_date     65521 non-null  object 
 4   genres           65455 non-null  object 
 5   categories       65514 non-null  object 
 6   price            65521 non-null  float64
 7   recommendations  65521 non-null  int64  
 8   developer        65468 non-null  object 
 9   publisher        65338 non-null  object 
dtypes: float64(1), int64(3), object(6)
memory usage: 5.0+ MB


### Clean the Genres Column

In [5]:
# Count the rows with no genre before imputing
print('The missing genres:', data['genres'].isnull().sum())
# Most frequent genre string; [0] takes the first entry of the mode() result
top_genre = data['genres'].mode()[0]
print('Top genre:', top_genre)
# Fill missing genres with the computed mode instead of a hard-coded literal,
# so the imputed value always matches the "Top genre" printed above
data["genres"] = data["genres"].fillna(top_genre)
print("The missing genres:", data["genres"].isnull().sum())


The missing genres: 66
Top genre: Casual;Indie
The missing genres: 0


### Clean the Release Date Column

In [6]:
# Rename appid -> app_id and release_date -> release_month; the date column is
# reduced to a month name below, so it is renamed to match its new content.
data = data.rename(columns={'appid': 'app_id', 'release_date': 'release_month'})

# Lookup from the first three characters of the stripped date string to the
# full month name. Any prefix not listed here becomes NaN after .map().
month_names = {
    'Jan': 'January',
    'Feb': 'February',
    'Mar': 'March',
    'Apr': 'April',
    'May': 'May',
    'Jun': 'June',
    'Jul': 'July',
    'Aug': 'August',
    'Sep': 'September',
    'Oct': 'October',
    'Nov': 'November',
    'Dec': 'December',
    # Non-month prefixes that are handled explicitly:
    # a value such as '2025' is cut to '202' by the 3-character slice, so the
    # prefix '202' is what has to be matched (matching '2025' after slicing
    # can never succeed).
    # NOTE(review): the original comment said these rows should get the most
    # common month (October), but the code assigned December. December is kept
    # here so results are unchanged -- confirm which one is intended.
    '202': 'December',
    # Prefix 'Q4 ' (with trailing space) is assigned to October.
    'Q4 ': 'October',
}

# Strip surrounding whitespace, keep the first three characters, translate
data['release_month'] = (
    data['release_month']
    .str.strip()
    .str[:3]
    .map(month_names)
)
data["release_month"].value_counts()

release_month
October      6804
December     6311
November     6239
August       5575
July         5466
May          5404
March        5354
September    5283
June         5060
April        5040
February     4660
January      4325
Name: count, dtype: int64

### Clean Price Column

In [7]:
# Report how many prices are missing
price_col = data['price']
print('Missing Values:', price_col.isna().sum())
# Median of the price column as it stands (zeros still included)
median_price = price_col.median()
print('median_price:', median_price)
# Keep every non-zero price; substitute the median where the price is zero
data['price'] = price_col.where(price_col != 0, median_price)
data['price']

Missing Values: 0
median_price: 3.99


0         3.99
1         7.99
2        12.99
3        24.99
4         3.99
         ...  
65516    14.99
65517     4.99
65518    14.99
65519    15.99
65520     0.99
Name: price, Length: 65521, dtype: float64

### Clean the Categories Column

In [8]:
# Count the rows with no category before imputing
print('Total missing category:', data['categories'].isnull().sum())
# Most frequent category string; computed once and printed as a plain value
# (printing the whole mode() Series produced a multi-line repr)
top_category = data["categories"].mode()[0]
print('Most occurring category:', top_category)
# Fill missing categories with the most frequent one
data['categories'] = data['categories'].fillna(top_category)
print("Total missing category:", data["categories"].isnull().sum())

Total missing category: 7
Most occurring category: 0    Single-player;Family Sharing
Name: categories, dtype: object
Total missing category: 0


### Clean Developer and Publisher Columns

In [9]:
# Count the rows with no developer before imputing
print("Total missing developer:", data["developer"].isnull().sum())
# Most frequent developer; computed once and reused for both print and fill
top_developer = data["developer"].mode()[0]
print("Most occurring developer:", top_developer)
# Fill with the developer mode. The previous code passed top_category here,
# which wrote a category string into the developer column.
# NOTE(review): attributing unknown games to the most common developer may
# skew per-developer statistics -- consider a placeholder such as "Unknown".
data["developer"] = data["developer"].fillna(top_developer)
print("Total missing developer:", data["developer"].isnull().sum())

Total missing developer: 53
Most occurring developer: EroticGamesClub
Total missing developer: 0


In [10]:
# Publisher: report missing values, then impute with the most frequent publisher
publisher_col = data["publisher"]
print("Total missing publisher:", publisher_col.isna().sum())
top_publisher = publisher_col.mode()[0]
print("Most occurring publisher:", top_publisher)
data["publisher"] = publisher_col.fillna(top_publisher)
print("Total missing publisher:", data["publisher"].isna().sum())

Total missing publisher: 183
Most occurring publisher: EroticGamesClub
Total missing publisher: 0
