In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

In [1]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

uri = "mongodb+srv://root:root@cluster0.qyyrcuj.mongodb.net/?retryWrites=true&w=majority"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [166]:
# read data from csv
games = pd.read_csv('data/steam.csv')

In [167]:
# list first 10 rows to check data imported correctly
games.head(10)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,"windows,mac,linux",0,"Multi-player,Online Multi-Player,Local Multi-P...",Action,"Action,FPS,Multiplayer",0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,"windows,mac,linux",0,"Multi-player,Online Multi-Player,Local Multi-P...",Action,"Action,FPS,Multiplayer",0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,"windows,mac,linux",0,"Multi-player,Valve Anti-Cheat enabled",Action,"FPS,World War II,Multiplayer",0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,"windows,mac,linux",0,"Multi-player,Online Multi-Player,Local Multi-P...",Action,"Action,FPS,Multiplayer",0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,"windows,mac,linux",0,"Single-player,Multi-player,Valve Anti-Cheat en...",Action,"FPS,Action,Sci-fi",0,5250,288,624,415,5000000-10000000,3.99
5,60,Ricochet,2000-11-01,1,Valve,Valve,"windows,mac,linux",0,"Multi-player,Online Multi-Player,Valve Anti-Ch...",Action,"Action,FPS,Multiplayer",0,2758,684,175,10,5000000-10000000,3.99
6,70,Half-Life,1998-11-08,1,Valve,Valve,"windows,mac,linux",0,"Single-player,Multi-player,Online Multi-Player...",Action,"FPS,Classic,Action",0,27755,1100,1300,83,5000000-10000000,7.19
7,80,Counter-Strike: Condition Zero,2004-03-01,1,Valve,Valve,"windows,mac,linux",0,"Single-player,Multi-player,Valve Anti-Cheat en...",Action,"Action,FPS,Multiplayer",0,12120,1439,427,43,10000000-20000000,7.19
8,130,Half-Life: Blue Shift,2001-06-01,1,Gearbox Software,Valve,"windows,mac,linux",0,Single-player,Action,"FPS,Action,Sci-fi",0,3822,420,361,205,5000000-10000000,3.99
9,220,Half-Life 2,2004-11-16,1,Valve,Valve,"windows,mac,linux",0,"Single-player,Steam Achievements,Steam Trading...",Action,"FPS,Action,Sci-fi",33,67902,2419,691,402,10000000-20000000,7.19


In [168]:
# check the shape of the dataframe
games.shape

(27075, 18)

In [169]:
# check the data types of the columns
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27075 entries, 0 to 27074
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   appid             27075 non-null  int64  
 1   name              27075 non-null  object 
 2   release_date      27075 non-null  object 
 3   english           27075 non-null  int64  
 4   developer         27074 non-null  object 
 5   publisher         27061 non-null  object 
 6   platforms         27075 non-null  object 
 7   required_age      27075 non-null  int64  
 8   categories        27075 non-null  object 
 9   genres            27074 non-null  object 
 10  steamspy_tags     27075 non-null  object 
 11  achievements      27075 non-null  int64  
 12  positive_ratings  27075 non-null  int64  
 13  negative_ratings  27075 non-null  int64  
 14  average_playtime  27075 non-null  int64  
 15  median_playtime   27075 non-null  int64  
 16  owners            27075 non-null  object

In [170]:
# convert datatypes
games['release_date'] = pd.to_datetime(games['release_date'])
games['english'] = games['english'].astype('boolean')
games['required_age'] = games['required_age'].astype('category')
games['developer'] = games['developer'].astype('category')
games['publisher'] = games['publisher'].astype('category')

In [171]:
# split the tags, categories, genres and platforms into lists
games.steamspy_tags = games.steamspy_tags.str.split(',')
games.categories = games.categories.str.split(',')
games.genres = games.genres.str.split(',')
games.platforms = games.platforms.str.split(',')

In [172]:
# convert lists to tuples
games['genres'] = games['genres'].apply(lambda x: tuple(x))
games['categories'] = games['categories'].apply(lambda x: tuple(x))
games['steamspy_tags'] = games['steamspy_tags'].apply(lambda x: tuple(x))
games['platforms'] = games['platforms'].apply(lambda x: tuple(x))

TypeError: 'float' object is not iterable

In [173]:
def calculate_middle_owner(owners_str):
    lower, upper = map(int, owners_str.split("-"))
    return (upper - lower) / 2

games['owners'] = games['owners'].apply(calculate_middle_owner)

In [ ]:
games['platforms'].explode().value_counts()

In [ ]:
# show how many missing values are in each column of the dataframe
games.isnull().sum()

# Francisco

In [ ]:
# calculate the percentage of missing values in the dataframe
missing_values = games.isnull().sum().sort_values(ascending=False)
percentage_missing = (missing_values / games.shape[0]) * 100
percentage_missing

# Francisco

In [ ]:
# calculate the total percentage of missing values across the dataframe
total_missing = (games.isnull().sum().sum() / (games.shape[0] * games.shape[1])) * 100
total_missing

# Francisco

In [ ]:
# find the exact lines in the dataframe that contain missing values
games[games.isnull().any(axis=1)]

# Francisco

In [ ]:
# replace the developer of The Battle of Ages with Green Desert
games.loc[23071, 'developer'] = 'Green Desert'

# Francisco

In [ ]:
# replace the missing publishers with the developer name
games.fillna({'publisher': games['developer']}, inplace=True)

# Francisco

In [ ]:
games['required_age'].value_counts()

In [ ]:
# check row of data
games.loc[9201,:]

In [ ]:
# check the number of unique values in each column
games['genres'].explode().value_counts()

In [ ]:
games['categories'].explode().value_counts()

In [ ]:
games['steamspy_tags'].explode().value_counts()

In [ ]:
games['platforms'].explode().value_counts()

In [ ]:
# plot the number of games per genre
fig = px.bar(games['genres'].explode().value_counts(), x=games['genres'].explode().value_counts().index, y=games['genres'].explode().value_counts().values)
fig.update_layout(title='Number of games per genre', xaxis_title='Genre', yaxis_title='Number of games')
fig.show()


In [ ]:
# plot the price distribution, removing the outliers
fig = px.box(games[games.price < 100], y='price')
fig.update_layout(title='Price distribution', yaxis_title='Price')
fig.show()

In [ ]:
# find the average playtime per genre
fig = px.bar(games.explode('genres').groupby('genres')['average_playtime'].mean().sort_values(ascending=False), x=games.explode('genres').groupby('genres')['average_playtime'].mean().sort_values(ascending=False).index, y=games.explode('genres').groupby('genres')['average_playtime'].mean().sort_values(ascending=False).values)
fig.update_layout(title='Average playtime per genre', xaxis_title='Genre', yaxis_title='Average playtime')
fig.show()

In [ ]:
# find games with lowest playtime
games.sort_values(by='positive_ratings', ascending=False).head(10)


In [ ]:
# list the games with the highest average playtime
games.sort_values(by='average_playtime', ascending=True).head(10)


In [ ]:
# plot the number of games per category
fig = px.bar(games['categories'].explode().value_counts(), x=games['categories'].explode().value_counts().index, y=games['categories'].explode().value_counts().values)
fig.update_layout(title='Number of games per category', xaxis_title='Category', yaxis_title='Number of games')
fig.show()


In [ ]:
games.sort_values(by='release_date', ascending=False)

In [ ]:
db = client['Steam']
collection = db['Games']
games.reset_index(inplace=True)
data_dict = games.to_dict("records")
# Insert collection
collection.insert_many(data_dict)

In [ ]:


# Normalize the owners, positive_ratings, and negative_ratings columns using StandardScaler
scaler = StandardScaler()
games[['middle_owners', 'positive_ratings', 'negative_ratings']] = scaler.fit_transform(games[['middle_owners', 'positive_ratings', 'negative_ratings']])

In [ ]:

# display the normalized values in a heatmap
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(games[['owners', 'positive_ratings', 'negative_ratings']].corr(), annot=True, ax=ax)
plt.show()


In [ ]:
# table of all columns with thier data type
data_types = pd.DataFrame(games.dtypes, columns=['Data Type']).reset_index()
data_types.columns = ['Column Name', 'Data Type']
data_types

In [ ]:
# get the standard stats for each column
games.describe()