# Import the libraries that are necesaries

In [24]:
import pandas as pd
import os
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

### Read the CSVs

In [25]:
# Specify the directory containing the CSV files
directory_path = '../data/clean/'

# Specify the order of CSV files
csv_order = ['game.csv', 'languages.csv', 'platform.csv', 'metacritic.csv', 'user_feedback.csv',
             'playtime.csv', 'development.csv', 'categorization.csv', 'media.csv']

# Initialize an empty DataFrame to store the merged result
game_data = pd.DataFrame()

# Loop through each CSV file, read it into a DataFrame, and merge it with the existing DataFrame
for csv_file in csv_order:
    file_path = os.path.join(directory_path, csv_file)
    df = pd.read_csv(file_path)

    # Merge based on the 'appid' column
    if game_data.empty:
        game_data = df
    else:
        game_data = pd.merge(game_data, df, on='appid', how='outer')

In [26]:
game_data

Unnamed: 0,appid,name,release_date,estimated_owners,peak_ccu,required_age,price,dlc_count,about_the_game,reviews,...,average_playtime_two_weeks,median_playtime_forever,median_playtime_two_weeks,developers,publishers,categories,genres,tags,screenshots,movies
0,20200,Galactic Bowling,2008-10-21,10000.0,0.0,0.0,19.99,0.0,Galactic Bowling is an exaggerated and stylize...,unknown,...,0.0,0.0,0.0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,2017-10-12,10000.0,0.0,0.0,0.99,0.0,THE LAW!! Looks to be a showdown atop a train....,unknown,...,0.0,0.0,0.0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,2021-11-17,10000.0,0.0,0.0,4.99,0.0,Jolt Project: The army now has a new robotics ...,unknown,...,0.0,0.0,0.0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",unknown,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
3,1355720,Henosis™,2020-07-23,10000.0,0.0,0.0,5.99,0.0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,unknown,...,0.0,0.0,0.0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
4,1139950,Two Weeks in Painland,2020-02-03,10000.0,0.0,0.0,0.00,0.0,ABOUT THE GAME Play as a hacker who has arrang...,unknown,...,0.0,0.0,0.0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85098,2669080,Mannerheim's Saloon Car,2024-01-02,0.0,0.0,0.0,0.00,0.0,Marshal Mannerheim’s Saloon Car is the train c...,unknown,...,0.0,0.0,0.0,Xamk Game Studios,"Sodan ja rauhan keskus Muisti, Päämajamuseo","Single-player,Tracked Controller Support,VR Only","Adventure,Simulation",unknown,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
85099,2736910,Beer Run,2024-01-03,0.0,0.0,0.0,0.00,0.0,Beer Run is an Indie game created to steal bee...,unknown,...,0.0,0.0,0.0,955 Games,955 Games,Single-player,"Casual,Indie",unknown,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
85100,2743220,My Friend The Spider,2024-01-04,0.0,0.0,0.0,0.00,0.0,A small 'horror' narrative game about isolatio...,unknown,...,0.0,0.0,0.0,MCA,MCA,Single-player,"Adventure,Simulation",unknown,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
85101,2293130,Path of Survivors,2024-01-08,0.0,0.0,0.0,3.99,0.0,Path of Survivors is a multi-class auto-battle...,unknown,...,0.0,0.0,0.0,Limited Input,Limited Input,"Single-player,Steam Achievements,Partial Contr...","Action,Casual,Indie,RPG,Simulation",unknown,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


Normalize the dataset

In [29]:
# Columns to convert to lists
columns_to_convert = ['supported_languages', 'full_audio_languages', 'categories', 'genres', 'tags']

# Create a copy of the original DataFrame
df_normalized = game_data.copy()

# Fill NaN values with an empty string
df_normalized[columns_to_convert] = df_normalized[columns_to_convert].fillna('')

# Normalize numerical features using StandardScaler
numerical_columns = df_normalized.select_dtypes(include=['float64', 'int64']).columns.difference(['appid'])
scaler = StandardScaler()
df_normalized[numerical_columns] = scaler.fit_transform(df_normalized[numerical_columns])

# One-hot encode categorical features
categorical_columns = df_normalized.select_dtypes(include=['bool']).columns.difference(['appid'])
df_normalized = pd.get_dummies(df_normalized, columns=categorical_columns, drop_first=True)

# Use CountVectorizer to convert text columns to bag-of-words representation
count_vectorizer = CountVectorizer()
for column in columns_to_convert:
    column_bow = count_vectorizer.fit_transform(df_normalized[column])
    df_normalized = pd.concat([df_normalized, pd.DataFrame(column_bow.toarray(), columns=count_vectorizer.get_feature_names_out([column]))], axis=1)
    df_normalized.drop(column, axis=1, inplace=True)

# Convert 'appid' column to integers
df_normalized['appid'] = df_normalized['appid'].astype(int)

df_normalized

Unnamed: 0,appid,name,release_date,estimated_owners,peak_ccu,required_age,price,dlc_count,about_the_game,reviews,...,western,wholesome,with,word,workshop,world,wrestling,written,your,zombies
0,20200,Galactic Bowling,2008-10-21,-0.073843,-0.02496,-0.13872,1.035098,-0.039604,Galactic Bowling is an exaggerated and stylize...,unknown,...,0,0,0,0,0,0,0,0,0,0
1,655370,Train Bandit,2017-10-12,-0.073843,-0.02496,-0.13872,-0.501820,-0.039604,THE LAW!! Looks to be a showdown atop a train....,unknown,...,1,0,0,0,0,0,0,0,0,0
2,1732930,Jolt Project,2021-11-17,-0.073843,-0.02496,-0.13872,-0.178258,-0.039604,Jolt Project: The army now has a new robotics ...,unknown,...,0,0,0,0,0,0,0,0,0,0
3,1355720,Henosis™,2020-07-23,-0.073843,-0.02496,-0.13872,-0.097368,-0.039604,HENOSIS™ is a mysterious 2D Platform Puzzler w...,unknown,...,0,0,0,0,0,0,0,0,0,0
4,1139950,Two Weeks in Painland,2020-02-03,-0.073843,-0.02496,-0.13872,-0.581902,-0.039604,ABOUT THE GAME Play as a hacker who has arrang...,unknown,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85098,2669080,Mannerheim's Saloon Car,2024-01-02,-0.083573,-0.02496,-0.13872,-0.581902,-0.039604,Marshal Mannerheim’s Saloon Car is the train c...,unknown,...,0,0,0,0,0,0,0,0,0,0
85099,2736910,Beer Run,2024-01-03,-0.083573,-0.02496,-0.13872,-0.581902,-0.039604,Beer Run is an Indie game created to steal bee...,unknown,...,0,0,0,0,0,0,0,0,0,0
85100,2743220,My Friend The Spider,2024-01-04,-0.083573,-0.02496,-0.13872,-0.581902,-0.039604,A small 'horror' narrative game about isolatio...,unknown,...,0,0,0,0,0,0,0,0,0,0
85101,2293130,Path of Survivors,2024-01-08,-0.083573,-0.02496,-0.13872,-0.259149,-0.039604,Path of Survivors is a multi-class auto-battle...,unknown,...,0,0,0,0,0,0,0,0,0,0


In [30]:
df_normalized.columns

Index(['appid', 'name', 'release_date', 'estimated_owners', 'peak_ccu',
       'required_age', 'price', 'dlc_count', 'about_the_game', 'reviews',
       ...
       'western', 'wholesome', 'with', 'word', 'workshop', 'world',
       'wrestling', 'written', 'your', 'zombies'],
      dtype='object', length=886)