In [184]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

In [1]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

uri = "mongodb+srv://root:root@cluster0.qyyrcuj.mongodb.net/?retryWrites=true&w=majority"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [195]:
# read data from csv
games = pd.read_csv('data/steam.csv')

In [204]:
# list first 10 rows to check data imported correctly
games.head(10)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,True,Valve,Valve,"[windows, mac, linux]",0,"[Multi-player, Online Multi-Player, Local Mult...",[Action],"[Action, FPS, Multiplayer]",0,124534,3339,17612,317,5000000.0,7.19
1,20,Team Fortress Classic,1999-04-01,True,Valve,Valve,"[windows, mac, linux]",0,"[Multi-player, Online Multi-Player, Local Mult...",[Action],"[Action, FPS, Multiplayer]",0,3318,633,277,62,2500000.0,3.99
2,30,Day of Defeat,2003-05-01,True,Valve,Valve,"[windows, mac, linux]",0,"[Multi-player, Valve Anti-Cheat enabled]",[Action],"[FPS, World War II, Multiplayer]",0,3416,398,187,34,2500000.0,3.99
3,40,Deathmatch Classic,2001-06-01,True,Valve,Valve,"[windows, mac, linux]",0,"[Multi-player, Online Multi-Player, Local Mult...",[Action],"[Action, FPS, Multiplayer]",0,1273,267,258,184,2500000.0,3.99
4,50,Half-Life: Opposing Force,1999-11-01,True,Gearbox Software,Valve,"[windows, mac, linux]",0,"[Single-player, Multi-player, Valve Anti-Cheat...",[Action],"[FPS, Action, Sci-fi]",0,5250,288,624,415,2500000.0,3.99
5,60,Ricochet,2000-11-01,True,Valve,Valve,"[windows, mac, linux]",0,"[Multi-player, Online Multi-Player, Valve Anti...",[Action],"[Action, FPS, Multiplayer]",0,2758,684,175,10,2500000.0,3.99
6,70,Half-Life,1998-11-08,True,Valve,Valve,"[windows, mac, linux]",0,"[Single-player, Multi-player, Online Multi-Pla...",[Action],"[FPS, Classic, Action]",0,27755,1100,1300,83,2500000.0,7.19
7,80,Counter-Strike: Condition Zero,2004-03-01,True,Valve,Valve,"[windows, mac, linux]",0,"[Single-player, Multi-player, Valve Anti-Cheat...",[Action],"[Action, FPS, Multiplayer]",0,12120,1439,427,43,5000000.0,7.19
8,130,Half-Life: Blue Shift,2001-06-01,True,Gearbox Software,Valve,"[windows, mac, linux]",0,[Single-player],[Action],"[FPS, Action, Sci-fi]",0,3822,420,361,205,2500000.0,3.99
9,220,Half-Life 2,2004-11-16,True,Valve,Valve,"[windows, mac, linux]",0,"[Single-player, Steam Achievements, Steam Trad...",[Action],"[FPS, Action, Sci-fi]",33,67902,2419,691,402,5000000.0,7.19


In [197]:
# check the shape of the dataframe
games.shape

(27075, 18)

In [203]:
# check the data types of the columns
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27075 entries, 0 to 27074
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   appid             27075 non-null  int64         
 1   name              27075 non-null  object        
 2   release_date      27075 non-null  datetime64[ns]
 3   english           27075 non-null  boolean       
 4   developer         27074 non-null  category      
 5   publisher         27061 non-null  category      
 6   platforms         27075 non-null  object        
 7   required_age      27075 non-null  category      
 8   categories        27075 non-null  object        
 9   genres            27074 non-null  object        
 10  steamspy_tags     27075 non-null  object        
 11  achievements      27075 non-null  int64         
 12  positive_ratings  27075 non-null  int64         
 13  negative_ratings  27075 non-null  int64         
 14  average_playtime  2707

In [199]:
# convert datatypes
games['release_date'] = pd.to_datetime(games['release_date'])
games['english'] = games['english'].astype('boolean')
games['required_age'] = games['required_age'].astype('category')
games['developer'] = games['developer'].astype('category')
games['publisher'] = games['publisher'].astype('category')

In [200]:
# split the tags, categories, genres and platforms into lists
games.steamspy_tags = games.steamspy_tags.str.split(',')
games.categories = games.categories.str.split(',')
games.genres = games.genres.str.split(',')
games.platforms = games.platforms.str.split(',')

In [215]:
# convert lists to tuples
games['genres'] = games['genres'].apply(lambda x: tuple(x))
games['categories'] = games['categories'].apply(lambda x: tuple(x))
games['steamspy_tags'] = games['steamspy_tags'].apply(lambda x: tuple(x))
games['platforms'] = games['platforms'].apply(lambda x: tuple(x))

TypeError: 'float' object is not iterable

In [202]:
def calculate_middle_owner(owners_str):
    lower, upper = map(int, owners_str.split("-"))
    return (upper - lower) / 2

games['owners'] = games['owners'].apply(calculate_middle_owner)

In [205]:
games['platforms'].explode().value_counts()

platforms
windows    27070
mac         8066
linux       5235
Name: count, dtype: int64

In [ ]:
# show how many missing values are in each column of the dataframe
games.isnull().sum()

# Francisco

In [ ]:
# calculate the percentage of missing values in the dataframe
missing_values = games.isnull().sum().sort_values(ascending=False)
percentage_missing = (missing_values / games.shape[0]) * 100
percentage_missing

# Francisco

In [ ]:
# calculate the total percentage of missing values across the dataframe
total_missing = (games.isnull().sum().sum() / (games.shape[0] * games.shape[1])) * 100
total_missing

# Francisco

In [ ]:
# find the exact lines in the dataframe that contain missing values
games[games.isnull().any(axis=1)]

# Francisco

In [ ]:
# replace the developer of The Battle of Ages with Green Desert
games.loc[23071, 'developer'] = 'Green Desert'

# Francisco

In [ ]:
# replace the missing publishers with the developer name
games.fillna({'publisher': games['developer']}, inplace=True)

# Francisco

In [37]:
# list all required ages in order from lowest to highest
games['required_age'].sort_values().unique()

# Francisco

array([ 0,  3,  7, 12, 16, 18], dtype=int64)

In [ ]:
games['required_age'].value_counts()

In [ ]:
# check row of data
games.loc[9201,:]

In [216]:
# check the number of unique values in each column
games['genres'].explode().value_counts()

genres
Indie                    19421
Action                   11903
Casual                   10210
Adventure                10032
Strategy                  5247
Simulation                5194
RPG                       4311
Early Access              2954
Free to Play              1704
Sports                    1322
Racing                    1024
Violent                    843
Massively Multiplayer      723
Gore                       537
Utilities                  146
Design & Illustration       87
Animation & Modeling        79
Education                   51
Video Production            38
Software Training           31
Audio Production            29
Web Publishing              28
Game Development            17
Photo Editing               12
Accounting                   6
Documentary                  1
Tutorial                     1
Name: count, dtype: int64

In [207]:
games['categories'].explode().value_counts()

categories
Single-player                 25678
Steam Achievements            14130
Steam Trading Cards            7918
Steam Cloud                    7219
Full controller support        5695
Partial Controller Support     4234
Multi-player                   3974
Steam Leaderboards             3439
Online Multi-Player            2487
Shared/Split Screen            2152
Stats                          1878
Co-op                          1721
Local Multi-Player             1615
Cross-Platform Multiplayer     1081
Online Co-op                   1071
Local Co-op                    1059
Includes level editor          1036
Steam Workshop                  897
Captions available              721
In-App Purchases                690
MMO                             421
VR Support                      231
Commentary available            144
Valve Anti-Cheat enabled         94
Steam Turn Notifications         63
SteamVR Collectibles             40
Includes Source SDK              35
Mods             

In [208]:
games['steamspy_tags'].explode().value_counts()

steamspy_tags
Indie                             16232
Action                            10322
Casual                             8205
Adventure                          7770
Strategy                           4173
                                  ...  
Inventory Management                  1
Parody                                1
Kickstarter                           1
Intentionally Awkward Controls        1
Logic                                 1
Name: count, Length: 337, dtype: int64

In [209]:
games['platforms'].explode().value_counts()

platforms
windows    27070
mac         8066
linux       5235
Name: count, dtype: int64

In [210]:
# plot the number of games per genre
fig = px.bar(games['genres'].explode().value_counts(), x=games['genres'].explode().value_counts().index, y=games['genres'].explode().value_counts().values)
fig.update_layout(title='Number of games per genre', xaxis_title='Genre', yaxis_title='Number of games')
fig.show()


In [211]:
# plot the price distribution, removing the outliers
fig = px.box(games[games.price < 100], y='price')
fig.update_layout(title='Price distribution', yaxis_title='Price')
fig.show()

In [212]:
# find the average playtime per genre
fig = px.bar(games.explode('genres').groupby('genres')['average_playtime'].mean().sort_values(ascending=False), x=games.explode('genres').groupby('genres')['average_playtime'].mean().sort_values(ascending=False).index, y=games.explode('genres').groupby('genres')['average_playtime'].mean().sort_values(ascending=False).values)
fig.update_layout(title='Average playtime per genre', xaxis_title='Genre', yaxis_title='Average playtime')
fig.show()

In [213]:
# find games with lowest playtime
games.sort_values(by='positive_ratings', ascending=False).head(10)


Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
25,730,Counter-Strike: Global Offensive,2012-08-21,True,"Valve,Hidden Path Entertainment",Valve,"[windows, mac, linux]",0,"[Multi-player, Steam Achievements, Full contro...","[Action, Free to Play]","[FPS, Multiplayer, Shooter]",167,2644404,402313,22494,6502,25000000.0,0.0
22,570,Dota 2,2013-07-09,True,Valve,Valve,"[windows, mac, linux]",0,"[Multi-player, Co-op, Steam Trading Cards, Ste...","[Action, Free to Play, Strategy]","[Free to Play, MOBA, Strategy]",0,863507,142079,23944,801,50000000.0,0.0
19,440,Team Fortress 2,2007-10-10,True,Valve,Valve,"[windows, mac, linux]",0,"[Multi-player, Cross-Platform Multiplayer, Ste...","[Action, Free to Play]","[Free to Play, Multiplayer, FPS]",520,515879,34036,8495,623,15000000.0,0.0
12836,578080,PLAYERUNKNOWN'S BATTLEGROUNDS,2017-12-21,True,PUBG Corporation,PUBG Corporation,[windows],0,"[Multi-player, Online Multi-Player, Stats]","[Action, Adventure, Massively Multiplayer]","[Survival, Shooter, Multiplayer]",37,496184,487076,22938,12434,25000000.0,26.99
121,4000,Garry's Mod,2006-11-29,True,Facepunch Studios,Valve,"[windows, mac, linux]",0,"[Single-player, Multi-player, Co-op, Cross-Pla...","[Indie, Simulation]","[Sandbox, Multiplayer, Funny]",29,363721,16433,12422,1875,5000000.0,6.99
2478,271590,Grand Theft Auto V,2015-04-13,True,Rockstar North,Rockstar Games,[windows],18,"[Single-player, Multi-player, Steam Achievemen...","[Action, Adventure]","[Open World, Action, Multiplayer]",77,329061,139308,9837,4834,5000000.0,24.99
1467,218620,PAYDAY 2,2013-08-13,True,OVERKILL - a Starbreeze Studio.,Starbreeze Publishing AB,"[windows, linux]",18,"[Single-player, Multi-player, Co-op, Online Co...","[Action, RPG]","[Co-op, Action, FPS]",1130,308657,56523,3975,890,5000000.0,7.49
3362,304930,Unturned,2017-07-07,True,Smartly Dressed Games,Smartly Dressed Games,"[windows, mac, linux]",0,"[Single-player, Online Multi-Player, Online Co...","[Action, Adventure, Casual, Free to Play, Indie]","[Free to Play, Survival, Zombies]",46,292574,31482,3248,413,15000000.0,0.0
1120,105600,Terraria,2011-05-16,True,Re-Logic,Re-Logic,"[windows, mac, linux]",0,"[Single-player, Multi-player, Online Multi-Pla...","[Action, Adventure, Indie, RPG]","[Sandbox, Adventure, Survival]",88,255600,7797,5585,1840,2500000.0,6.99
21,550,Left 4 Dead 2,2009-11-19,True,Valve,Valve,"[windows, mac, linux]",0,"[Single-player, Multi-player, Co-op, Steam Ach...",[Action],"[Zombies, Co-op, FPS]",70,251789,8418,1615,566,5000000.0,7.19


In [214]:
# list the games with the highest average playtime
games.sort_values(by='average_playtime', ascending=True).head(10)


Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
13537,599070,Umpire Simulator,2018-02-20,True,Beep2Bleep.com,Beep2Bleep.com,[windows],0,[Single-player],[Simulation],"[Simulation, VR, Indie]",0,3,1,0,0,10000.0,2.09
18046,733840,Anime Bubble Pop,2017-12-02,True,ThinkVirtual LLC,ThinkVirtual LLC,[windows],0,[Single-player],"[Casual, Simulation]","[Casual, Simulation, Anime]",0,7,3,0,0,10000.0,1.69
18045,733800,Frosty Nights,2017-12-08,True,Barry McCabe,Clockwork Wolf,[windows],0,"[Single-player, Full controller support]",[Strategy],"[Strategy, Survival, Horror]",0,41,9,0,0,10000.0,3.99
18043,733770,MegaRace 3,2018-01-10,True,"Cryo Interactive,Jordan Freeman Group",Microids,[windows],0,[Single-player],"[Action, Racing]","[Racing, Action]",0,2,1,0,0,10000.0,2.09
18042,733760,MegaRace 2,2017-11-10,True,"Cryo Interactive,Jordan Freeman Group",Microids,"[windows, mac, linux]",0,[Single-player],"[Action, Racing]","[Racing, Action, Retro]",0,2,0,0,0,10000.0,1.99
18041,733750,Steel Eagle,2018-01-23,True,enrju,Andrzej Cudzilo,[windows],0,"[Single-player, Steam Achievements, Steam Clou...","[Action, Indie]","[Action, Indie, Arcade]",33,1,0,0,0,10000.0,0.79
18040,733740,Sakura Cupid,2018-02-12,True,Winged Cloud,Winged Cloud,"[windows, mac, linux]",0,"[Single-player, Steam Achievements, Partial Co...",[Simulation],[Simulation],16,110,11,0,0,15000.0,7.19
18039,733710,YOUFIGHT,2017-11-02,True,OPENCAP STUDIO,OPENCAP STUDIO,"[windows, mac]",0,"[Multi-player, Online Multi-Player, Local Mult...","[Indie, Early Access]","[Early Access, Indie, Fighting]",0,2,1,0,0,10000.0,3.99
18038,733690,The Hospital: Allison's Diary,2017-11-09,True,KR Games,Star Consult S.r.l.,[windows],0,"[Single-player, Partial Controller Support]","[Adventure, Indie]","[Adventure, Indie, VR]",0,3,9,0,0,10000.0,4.79
18037,733670,Award. Room of fear,2018-03-02,True,Giks,Giks,[windows],0,[Single-player],"[Violent, Action, Indie]","[Action, Indie, Violent]",0,6,2,0,0,10000.0,1.69


In [ ]:
# plot the number of games per category
fig = px.bar(games['categories'].explode().value_counts(), x=games['categories'].explode().value_counts().index, y=games['categories'].explode().value_counts().values)
fig.update_layout(title='Number of games per category', xaxis_title='Category', yaxis_title='Number of games')
fig.show()


In [ ]:
games.sort_values(by='release_date', ascending=False)

In [ ]:
db = client['Steam']
collection = db['Games']
games.reset_index(inplace=True)
data_dict = games.to_dict("records")
# Insert collection
collection.insert_many(data_dict)

In [ ]:


# Normalize the owners, positive_ratings, and negative_ratings columns using StandardScaler
scaler = StandardScaler()
games[['middle_owners', 'positive_ratings', 'negative_ratings']] = scaler.fit_transform(games[['middle_owners', 'positive_ratings', 'negative_ratings']])

In [ ]:

# display the normalized values in a heatmap
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(games[['owners', 'positive_ratings', 'negative_ratings']].corr(), annot=True, ax=ax)
plt.show()


In [ ]:
# table of all columns with thier data type
data_types = pd.DataFrame(games.dtypes, columns=['Data Type']).reset_index()
data_types.columns = ['Column Name', 'Data Type']
data_types

In [ ]:
# get the standard stats for each column
games.describe()

In [229]:
# pie chart of english vs non english games
fig = px.pie(games, names='english', title='English vs Non-English Games')
fig.show()
