In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

In [None]:
import os

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# The connection string is read from the MONGODB_URI environment variable so that
# credentials never have to be typed into (and saved with) the notebook.
# When the variable is unset the value is the empty string, as before.
uri = os.environ.get("MONGODB_URI", "")

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # The failure is reported rather than raised, so the cells that do not
    # need the database can still be run.
    print(e)

In [None]:
# read the games data from data/steam.csv (path is relative to the working directory)
games = pd.read_csv('data/steam.csv')

In [None]:
# display the first 10 rows to check that the data was imported correctly
games.head(10)

In [None]:
# check the shape of the dataframe as a (rows, columns) tuple
games.shape

In [None]:
# check the data type and non-null count of every column
games.info()

In [None]:
# convert datatypes
# release_date is parsed into datetimes; the other columns are cast together
# through a single column -> dtype mapping.
games['release_date'] = pd.to_datetime(games['release_date'])
games = games.astype({
    'english': 'boolean',
    'required_age': 'category',
    'developer': 'category',
    'publisher': 'category',
})

In [None]:
# split the tags, categories, genres and platforms into lists
# Each of these columns holds a comma-separated string; it is replaced by the
# list of its parts.
for list_column in ['steamspy_tags', 'categories', 'genres', 'platforms']:
    games[list_column] = games[list_column].str.split(',')

In [None]:
# convert lists to tuples
# Every cell of these columns is passed through tuple().
for list_column in ['genres', 'categories', 'steamspy_tags', 'platforms']:
    games[list_column] = games[list_column].map(tuple)

In [None]:
def calculate_middle_owner(owners_str):
    """Return the midpoint of an owners range written as ``"lower-upper"``.

    Parameters
    ----------
    owners_str : str
        Two integer bounds separated by a hyphen, e.g. ``"20000-50000"``.

    Returns
    -------
    float
        The value halfway between the lower and the upper bound.

    Raises
    ------
    ValueError
        If the string does not split into exactly two integer bounds.
    """
    lower, upper = map(int, owners_str.split("-"))
    # The midpoint is the mean of the two bounds. (upper - lower) / 2 is only
    # half the width of the range and ignores where the range starts.
    return (lower + upper) / 2

# Replace each owners range string with the single number returned by
# calculate_middle_owner. The column is overwritten, so this cell expects the
# original string values and is not meant to be run twice.
games['owners'] = games['owners'].apply(calculate_middle_owner)

In [12]:
# count how often each platform appears; explode() gives one entry per
# (game, platform) pair, so a game is counted once for every platform it lists
games['platforms'].explode().value_counts()

platforms
windows    27070
mac         8066
linux       5235
Name: count, dtype: int64

In [ ]:
# show the number of missing values in each column of the dataframe
games.isnull().sum()

# Francisco

In [ ]:
# calculate the percentage of missing values in the dataframe
# Missing counts per column, largest first, expressed as a share of the row count.
row_count = len(games)
missing_values = games.isna().sum().sort_values(ascending=False)
percentage_missing = missing_values.div(row_count).mul(100)
percentage_missing

# Francisco

In [ ]:
# calculate the total percentage of missing values across the dataframe
# games.size is rows * columns, i.e. the total number of cells.
missing_cells = games.isna().sum().sum()
total_missing = missing_cells / games.size * 100
total_missing

# Francisco

In [ ]:
# select every row that has a missing value in at least one column
games[games.isnull().any(axis=1)]

# Francisco

In [ ]:
# replace the developer of The Battle of Ages with Green Desert
# 'developer' is cast to a categorical column earlier in the notebook. Assigning
# a label that is not one of its categories raises TypeError, so the label is
# registered as a category first when needed.
if (
    isinstance(games['developer'].dtype, pd.CategoricalDtype)
    and 'Green Desert' not in games['developer'].cat.categories
):
    games['developer'] = games['developer'].cat.add_categories(['Green Desert'])
games.loc[23071, 'developer'] = 'Green Desert'

# Francisco

In [ ]:
# replace the missing publishers with the developer name
# Both columns are categorical with different category sets, and a categorical
# cannot be filled with labels outside its own categories. The fill is therefore
# done on plain object values and the result is cast back to 'category'.
publisher_filled = games['publisher'].astype(object).fillna(
    games['developer'].astype(object)
)
games['publisher'] = publisher_filled.astype('category')

# Francisco

In [None]:
# list the distinct required ages, sorted from lowest to highest
games['required_age'].sort_values().unique()

# Francisco

In [ ]:
# count how many games fall under each required age
games['required_age'].value_counts()

In [ ]:
# inspect all columns of the row with index label 9201
games.loc[9201,:]

In [None]:
# count how many games carry each genre (a game is counted once per genre it lists)
games['genres'].explode().value_counts()

In [None]:
# count how many games carry each category (a game is counted once per category it lists)
games['categories'].explode().value_counts()

In [None]:
# count how many games carry each SteamSpy tag (a game is counted once per tag it lists)
games['steamspy_tags'].explode().value_counts()

In [None]:
# count how many games list each platform (a game is counted once per platform it lists)
games['platforms'].explode().value_counts()

In [None]:
# plot the number of games per genre
# The counts are computed once and reused for the data, the x axis and the
# y axis instead of re-running explode()/value_counts() three times.
genre_counts = games['genres'].explode().value_counts()
fig = px.bar(genre_counts, x=genre_counts.index, y=genre_counts.values)
fig.update_layout(title='Number of games per genre', xaxis_title='Genre', yaxis_title='Number of games')
fig.show()


In [None]:
# plot the price distribution, removing the outliers
# Only games priced below 100 are kept for the box plot.
games_below_100 = games.loc[games['price'] < 100]
fig = px.box(games_below_100, y='price')
fig.update_layout(title='Price distribution', yaxis_title='Price')
fig.show()

In [None]:
# find the average playtime per genre
# One row per (game, genre) pair, averaged per genre and sorted from highest
# to lowest. The aggregation is computed once and reused for the data, the
# x axis and the y axis instead of being repeated three times.
genre_playtime = (
    games.explode('genres')
    .groupby('genres')['average_playtime']
    .mean()
    .sort_values(ascending=False)
)
fig = px.bar(genre_playtime, x=genre_playtime.index, y=genre_playtime.values)
fig.update_layout(title='Average playtime per genre', xaxis_title='Genre', yaxis_title='Average playtime')
fig.show()

In [None]:
# list the 10 games with the most positive ratings
# NOTE(review): this comment previously said "lowest playtime", but the sort is on
# positive_ratings in descending order - confirm which of the two was intended.
games.sort_values(by='positive_ratings', ascending=False).head(10)


In [None]:
# list the games with the highest average playtime
# Descending order puts the longest average playtimes first; an ascending sort
# would list the games with the lowest playtime instead.
games.sort_values(by='average_playtime', ascending=False).head(10)


In [ ]:
# plot the number of games per category
# The counts are computed once and reused for the data, the x axis and the
# y axis instead of re-running explode()/value_counts() three times.
category_counts = games['categories'].explode().value_counts()
fig = px.bar(category_counts, x=category_counts.index, y=category_counts.values)
fig.update_layout(title='Number of games per category', xaxis_title='Category', yaxis_title='Number of games')
fig.show()


In [ ]:
# show the games ordered from the most recent release date to the oldest
games.sort_values(by='release_date', ascending=False)

In [ ]:
# Export the games to the 'Games' collection of the 'Steam' database.
db = client['Steam']
collection = db['Games']
# reset_index() returns a copy whose former index is an ordinary column, so it
# is part of every record. Unlike reset_index(inplace=True), `games` itself is
# left unchanged: no extra column leaks into the later cells, and the cell can
# be re-run without reset_index piling up further index columns.
data_dict = games.reset_index().to_dict("records")
# Insert collection
collection.insert_many(data_dict)

In [ ]:


# Normalize the owners, positive_ratings, and negative_ratings columns using StandardScaler
# The numeric owner estimate is stored in 'owners'. No cell creates a
# 'middle_owners' column, so selecting it raised KeyError.
scaled_columns = ['owners', 'positive_ratings', 'negative_ratings']
scaler = StandardScaler()
# The standardized values overwrite the original columns in place.
games[scaled_columns] = scaler.fit_transform(games[scaled_columns])

In [ ]:

# display the pairwise correlations between owners, positive_ratings and
# negative_ratings as an annotated heatmap (.corr() is plotted, not the raw values)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(games[['owners', 'positive_ratings', 'negative_ratings']].corr(), annot=True, ax=ax)
plt.show()


In [ ]:
# table of all columns with their data type
# The dtypes Series (indexed by column name) is turned into a two-column frame.
data_types = (
    games.dtypes
    .rename_axis('Column Name')
    .reset_index(name='Data Type')
)
data_types

In [ ]:
# get the summary statistics that describe() reports with its default arguments
games.describe()

In [18]:
# pie chart of english vs non english games, sliced by the values of the 'english' column
fig = px.pie(games, names='english', title='English vs Non-English Games')
fig.show()


In [27]:
# pie chart of the number of games on each platform
# explode() yields one entry per (game, platform) pair, so a game that lists
# several platforms contributes to several slices.
fig = px.pie(games, names=games['platforms'].explode(), title='Number of games on each platform')
fig.show()