# Preprocessing 

## Importing Necessary Libraries

In [1]:
#!pip install plotly

In [35]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from collections import Counter
import re
import plotly.figure_factory as ff
import plotly.express as px

## Loading Data

In [3]:
df = pd.read_csv('games_info.csv')
game_details = pd.read_csv('game_details.csv')

## Initial Data Exploration

In [4]:
df.head()

Unnamed: 0,NAME,STORE_GENRE,RATING_SCORE,N_SUPPORTED_LANGUAGES,DEVELOPERS,SUPPORTED_PLATFORMS,POSITIVE_REVIEWS,NEGATIVE_REVIEWS,TECHNOLOGIES,RELEASE_DATE,TOTAL_TWITCH_PEAK,PRICE,N_DLC,24_HOUR_PEAK
0,Dragon's Dogma: Dark Arisen,"Store Genres Action (1), Adventure (25), RPG (3)",review_score 8,7 Languages,Capcom,Windows,39271,5015,,15 January 2016 – 16:48:00 UTC (9 years ago),"27,368\nall-time peak 9 years ago17 January 2016",$22.49,3.0,"1,064\n24-hour peak"
1,Forza Horizon 5,"Store Genres Action (1), Adventure (25), Racin...",review_score 8,24 Languages,Playground Games,Windows,156683,21075,,9 November 2021 – 04:58:00 UTC (3 years ago),"81,096\nall-time peak 3 years ago13 November 2021",$32.78,53.0,"16,131\n24-hour peak"
2,Thrive,"Store Genres Casual (4), Indie (23), Simulatio...",review_score 8,1 Languages,Revolutionary Games Studio,Windows Linux,872,70,Engine.Godot,26 November 2021 – 10:54:48 UTC (3 years ago),145\nall-time peak 2 years ago24 July 2022,$2.99,,21\n24-hour peak
3,Layers of Fear (2016),"Store Genres Adventure (25), Indie (23)",review_score 8,12 Languages,Bloober Team SA,Windows macOS Linux,22766,2819,Engine.Unity,15 February 2016 – 21:53:00 UTC (8 years ago),"145,384\nall-time peak 6 years ago13 June 2018",$10.19,3.0,31\n24-hour peak
4,TaskPals,"Store Genres Indie (23), Free to Play (37)",review_score 8,1 Languages,lazarche,Windows,834,66,"Detected Technologies (?), SDK.SteamworksNET",25 May 2023 – 10:22:41 UTC (12 months ago),"3,778\nall-time peak 10 months ago18 July 2023",,4.0,318\n24-hour peak


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2259 entries, 0 to 2258
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   NAME                   2259 non-null   object 
 1   STORE_GENRE            2221 non-null   object 
 2   RATING_SCORE           2225 non-null   object 
 3   N_SUPPORTED_LANGUAGES  2226 non-null   object 
 4   DEVELOPERS             2258 non-null   object 
 5   SUPPORTED_PLATFORMS    2197 non-null   object 
 6   POSITIVE_REVIEWS       2259 non-null   int64  
 7   NEGATIVE_REVIEWS       2259 non-null   int64  
 8   TECHNOLOGIES           1845 non-null   object 
 9   RELEASE_DATE           2255 non-null   object 
 10  TOTAL_TWITCH_PEAK      2259 non-null   object 
 11  PRICE                  1507 non-null   object 
 12  N_DLC                  789 non-null    float64
 13  24_HOUR_PEAK           2208 non-null   object 
dtypes: float64(1), int64(2), object(11)
memory usage: 247.2+

In [6]:
df.describe()

Unnamed: 0,POSITIVE_REVIEWS,NEGATIVE_REVIEWS,N_DLC
count,2259.0,2259.0,789.0
mean,20878.47,1270.529438,5.904943
std,75400.8,5824.763668,17.345104
min,34.0,0.0,1.0
25%,724.5,33.0,1.0
50%,2509.0,136.0,2.0
75%,10431.0,602.0,4.0
max,1257272.0,190953.0,379.0


In [7]:
df.describe(include='object')

Unnamed: 0,NAME,STORE_GENRE,RATING_SCORE,N_SUPPORTED_LANGUAGES,DEVELOPERS,SUPPORTED_PLATFORMS,TECHNOLOGIES,RELEASE_DATE,TOTAL_TWITCH_PEAK,PRICE,24_HOUR_PEAK
count,2259,2221,2225,2226,2258,2197,1845,2255,2259,1507,2208
unique,2088,325,3,36,1668,6,234,2071,2066,125,643
top,TOEM,"Store Genres Action (1), Adventure (25), Indie...",review_score 8,1 Languages,Square Enix,Windows,Engine.Unity,17 September 2021 – 13:00:40 UTC (3 years ago),2 years ago,$19.99,4\n24-hour peak
freq,4,113,1355,583,16,1294,764,4,8,206,81


In [8]:
game_details.head()

Unnamed: 0,name,price,release
0,Portal 2,$8.23,2011
1,People Playground,$9.99,2019
2,Stardew Valley,$14.99,2016
3,Vampire Survivors,$4.99,2022
4,Hades,$24.99,2020


In [9]:
game_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2950 entries, 0 to 2949
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     2950 non-null   object
 1   price    2950 non-null   object
 2   release  2950 non-null   object
dtypes: object(3)
memory usage: 69.3+ KB


In [10]:
game_details.describe(include='object')

Unnamed: 0,name,price,release
count,2950,2950,2950
unique,2562,170,29
top,Walking Simulator,-,2023
freq,2,543,522


## Removing Duplicated Games

In [11]:
df_copy = df.copy()
duplicate_rows = df[df.duplicated(subset=['NAME'])]
print("Duplicate Rows based on the 'NAME' column:")
duplicate_rows
num_duplicates = duplicate_rows.shape[0]
print(f"Number of duplicate rows based on the 'NAME' column: {num_duplicates}")
df = df_copy.drop_duplicates(subset=['NAME'], keep='first')

# df_copy = df.copy()
# for col in df_copy.columns:
#     if df_copy[col].apply(lambda x: isinstance(x, list)).any():
#         df_copy[col] = df_copy[col].apply(tuple)
# duplicate_rows = df_copy[df_copy.duplicated()]
# num_duplicates = duplicate_rows.shape[0]
# print(f"Number of duplicate rows: {num_duplicates}")
# df = df_copy.drop_duplicates(keep='first')
# duplicate_rows

Duplicate Rows based on the 'NAME' column:
Number of duplicate rows based on the 'NAME' column: 171


In [12]:
duplicate_rows.head()

Unnamed: 0,NAME,STORE_GENRE,RATING_SCORE,N_SUPPORTED_LANGUAGES,DEVELOPERS,SUPPORTED_PLATFORMS,POSITIVE_REVIEWS,NEGATIVE_REVIEWS,TECHNOLOGIES,RELEASE_DATE,TOTAL_TWITCH_PEAK,PRICE,N_DLC,24_HOUR_PEAK
61,The Night of the Scissors,"Store Genres Action (1), Adventure (25), Indie...",review_score 8,10 Languages,Tomás Esconjaureguy,Windows macOS Linux,343,14,Engine.Unity,13 May 2022 – 15:20:15 UTC (2 years ago),11\nall-time peak 2 years ago14 May 2022,$2.99,,2\n24-hour peak
132,If My Heart Had Wings -Flight Diary-,"Store Genres Adventure (25), Casual (4), Simul...",review_score 8,3 Languages,MoeNovel,Windows,452,23,,28 February 2019 – 01:56:15 UTC (5 years ago),83\nall-time peak 5 years ago28 February 2019,,1.0,2\n24-hour peak
133,KNIGHTS,"Store Genres Casual (4), Indie (23), Strategy (2)",review_score 8,29 Languages,Arzola's,Windows macOS Linux,701,43,Engine.Unity,30 May 2016 – 16:50:50 UTC (8 years ago),653\nall-time peak 16 months ago16 February 2023,,,1\n24-hour peak
139,Command & Conquer: Red Alert™ 2 and Yuri’s Rev...,"Store Genres Action (1), RPG (3), Strategy (2)",review_score 8,5 Languages,EA Los Angeles,Windows,3651,333,,7 March 2024 – 18:50:00 UTC (3 months ago),"6,617\nall-time peak 3 months ago10 March 2024",$19.88,,"1,245\n24-hour peak"
156,Metro Exodus,Store Genres Action (1),review_score 8,15 Languages,4A Games,Windows macOS Linux,94019,11112,Engine.4A_Engine,15 February 2020 – 05:00:00 UTC (4 years ago),"15,375\nall-time peak 4 years ago16 February 2020",,6.0,"1,203\n24-hour peak"


In [13]:
del df_copy
del duplicate_rows

## Merging Game Information and Price DataFrames

In [14]:
for index, row in df[df['PRICE'].isnull()].iterrows():
    game_name = row['NAME']
    price = game_details[game_details['name'] == game_name]['price'].values
    
    if (len(price) > 0):
        df.at[index, 'PRICE'] = price[0]

In [15]:
del game_details

## Handling Missing Values

In [16]:
df.replace('N/A', pd.NA, inplace=True)

In [17]:
def missing_percentage_table(df):
    missing_percentage = df.isnull().mean() * 100
    missing_percentage_table = pd.DataFrame({'Column': missing_percentage.index, 'Missing Percentage': missing_percentage.values})
    missing_percentage_table = missing_percentage_table.sort_values(by='Missing Percentage', ascending=False)
    return missing_percentage_table

missing_percentage_table = missing_percentage_table(df)

In [18]:
missing_percentage_table

Unnamed: 0,Column,Missing Percentage
12,N_DLC,65.038314
8,TECHNOLOGIES,18.582375
11,PRICE,12.068966
5,SUPPORTED_PLATFORMS,2.729885
13,24_HOUR_PEAK,2.250958
1,STORE_GENRE,1.724138
2,RATING_SCORE,1.532567
3,N_SUPPORTED_LANGUAGES,1.484674
9,RELEASE_DATE,0.191571
4,DEVELOPERS,0.047893


## Cleaning `N_DLC` Column

In [19]:
df['N_DLC'] = pd.to_numeric(df['N_DLC'], errors='coerce') 
df['N_DLC'] = df['N_DLC'].fillna(-1)  
df['N_DLC'] = df['N_DLC'].astype(int) 
df['N_DLC'] = df['N_DLC'].replace(-1, pd.NA)
df['N_DLC']

0          3
1         53
2       <NA>
3          3
4          4
        ... 
2253    <NA>
2254    <NA>
2256    <NA>
2257    <NA>
2258       1
Name: N_DLC, Length: 2088, dtype: object

In [20]:
df.drop(columns=['N_DLC'], inplace=True)

## Extracting and Handling 'PUBLISH_YEAR' from 'RELEASE_DATE'

This section extracts the year from the 'RELEASE_DATE' column to create a new 'PUBLISH_YEAR' column.
It replaces null values in 'PUBLISH_YEAR' with the median valueand missing data by converting them to a consistent format.

In [21]:
if 'RELEASE_DATE' in df.columns:
    df['PUBLISH_YEAR'] = df['RELEASE_DATE'].str.extract(r'(\d{4})')

    df['PUBLISH_YEAR'] = pd.to_numeric(df['PUBLISH_YEAR'], errors='coerce')

    df['PUBLISH_YEAR'] = df['PUBLISH_YEAR'].fillna(-1)
    df['PUBLISH_YEAR'] = df['PUBLISH_YEAR'].astype(int)
    df['PUBLISH_YEAR'] = df['PUBLISH_YEAR'].replace(-1, pd.NA)

    df = df.drop('RELEASE_DATE', axis=1)

### Replace null values in 'PUBLISH_YEAR' with the median of the column

In [22]:
null_count_before = df['PUBLISH_YEAR'].isnull().sum()
null_count_before

4

In [23]:
median_publish_year = df['PUBLISH_YEAR'].median()
df['PUBLISH_YEAR'] = df['PUBLISH_YEAR'].fillna(median_publish_year)

  df['PUBLISH_YEAR'] = df['PUBLISH_YEAR'].fillna(median_publish_year)


In [24]:
null_count_after = df['PUBLISH_YEAR'].isnull().sum()
null_count_after

0

In [25]:
df['PUBLISH_YEAR'] = df['PUBLISH_YEAR'].astype(int)

## Removing Rows with Missing 'DEVELOPERS' Data

In [26]:
df['DEVELOPERS']

0                           Capcom
1                 Playground Games
2       Revolutionary Games Studio
3                  Bloober Team SA
4                         lazarche
                   ...            
2253                       Eggcode
2254                    Oslo Albet
2256                      TRAGsoft
2257           Gray Matter Studios
2258               League of Geeks
Name: DEVELOPERS, Length: 2088, dtype: object

In [27]:
df.dropna(subset=['DEVELOPERS'], inplace=True)

## Cleaning and Converting 'N_SUPPORTED_LANGUAGES' Data

This section handles the `N_SUPPORTED_LANGUAGES` column by filling missing values with a default number (-1) and converting the column to an integer type after removing any extraneous text.

In [28]:
df['N_SUPPORTED_LANGUAGES'] = df['N_SUPPORTED_LANGUAGES'].fillna(1)

In [29]:
df['N_SUPPORTED_LANGUAGES'] = df['N_SUPPORTED_LANGUAGES'].astype(str).str.replace(' Languages', '').astype(int)

In [30]:
df['N_SUPPORTED_LANGUAGES'].isnull().mean() * 100

0.0

In [31]:
placeholder_value = 'review_score -1'
df['RATING_SCORE'] = df['RATING_SCORE'].fillna(placeholder_value)

In [32]:
df['RATING_SCORE'] = df['RATING_SCORE'].str.replace('review_score ', '').astype(float).astype(int)  

In [33]:
mean_rating_score = df.loc[df['RATING_SCORE'] != -1, 'RATING_SCORE'].mean()  
df.loc[df['RATING_SCORE'] == -1, 'RATING_SCORE'] = mean_rating_score

  df.loc[df['RATING_SCORE'] == -1, 'RATING_SCORE'] = mean_rating_score


## One-Hot Encoding 'SUPPORTED_PLATFORMS' Column

This section transforms the 'SUPPORTED_PLATFORMS' column into separate binary columns for each platform using one-hot encoding, which facilitates easier analysis and modeling.

In [36]:
all_platforms = df['SUPPORTED_PLATFORMS'].dropna().str.split().sum()
platform_counts = Counter(all_platforms)
most_common_platform = platform_counts.most_common(1)[0][0]

most_common_platform

'Windows'

In [38]:
df.loc[df['SUPPORTED_PLATFORMS'].isnull(), 'SUPPORTED_PLATFORMS'] = most_common_platform

In [39]:
# Split the 'SUPPORTED_PLATFORMS' column into separate columns for each platform
platform_dummies = df['SUPPORTED_PLATFORMS'].str.get_dummies(sep=' ')

# Concatenate the dummy columns with the original DataFrame
df = pd.concat([df, platform_dummies], axis=1)

In [40]:
df.drop(columns=['SUPPORTED_PLATFORMS'], inplace=True)

## Cleaning 'STORE_GENRE' Column

This section focuses on cleaning the 'STORE_GENRE' column. It fills missing values with a placeholder, removes unnecessary text, and splits the genre strings for better usability.

In [41]:
def clean_store_genre(genre_string):
    genres = genre_string.split(', ')
    cleaned_genres = []
    for genre in genres:
        cleaned_genre = genre.split(' (')[0]
        cleaned_genres.append(cleaned_genre)
    return cleaned_genres

In [42]:
df['STORE_GENRE'] = df['STORE_GENRE'].fillna('Store Genres')
df['STORE_GENRE'] = df['STORE_GENRE'].str.replace('Store Genres', '')
df['STORE_GENRE'] = df['STORE_GENRE'].apply(clean_store_genre)

## Cleaning '24_HOUR_PEAK' Column


This section handles the cleaning and conversion of the '24_HOUR_PEAK' column. It fills missing values, splits the strings to extract numerical values, removes commas, and converts the column to an integer type.

In [43]:
df['24_HOUR_PEAK']

0        1,064\n24-hour peak
1       16,131\n24-hour peak
2           21\n24-hour peak
3           31\n24-hour peak
4          318\n24-hour peak
                ...         
2253        17\n24-hour peak
2254        13\n24-hour peak
2256       254\n24-hour peak
2257        61\n24-hour peak
2258       106\n24-hour peak
Name: 24_HOUR_PEAK, Length: 2087, dtype: object

In [44]:
df['24_HOUR_PEAK'] = df['24_HOUR_PEAK'].fillna(("0\n24-hour peak"))
df['24_HOUR_PEAK'] = df['24_HOUR_PEAK'].apply(lambda x: int(x.split('\n')[0].replace(',', ''))).astype(int)

In [45]:
df['24_HOUR_PEAK']

0        1064
1       16131
2          21
3          31
4         318
        ...  
2253       17
2254       13
2256      254
2257       61
2258      106
Name: 24_HOUR_PEAK, Length: 2087, dtype: int32

## Cleaning 'TECHNOLOGIES' Column

This section addresses the 'TECHNOLOGIES' column by filling missing values with an empty string and splitting the comma-separated technologies into lists for easier analysis.

In [46]:
df['TECHNOLOGIES']

0                                                NaN
1                                                NaN
2                                       Engine.Godot
3                                       Engine.Unity
4       Detected Technologies (?), SDK.SteamworksNET
                            ...                     
2253                                    Engine.Unity
2254                                             NaN
2256                                  Engine.Solar2D
2257             AntiCheat.PunkBuster, Engine.idTech
2258                                    Engine.Unity
Name: TECHNOLOGIES, Length: 2087, dtype: object

In [47]:
df['TECHNOLOGIES'] = df['TECHNOLOGIES'].fillna('')  # Replace NaN with empty string
df['TECHNOLOGIES'] = df['TECHNOLOGIES'].apply(lambda x: x.split(', ') if x else [])

In [48]:
df['TECHNOLOGIES']

0                                                   []
1                                                   []
2                                       [Engine.Godot]
3                                       [Engine.Unity]
4       [Detected Technologies (?), SDK.SteamworksNET]
                             ...                      
2253                                    [Engine.Unity]
2254                                                []
2256                                  [Engine.Solar2D]
2257             [AntiCheat.PunkBuster, Engine.idTech]
2258                                    [Engine.Unity]
Name: TECHNOLOGIES, Length: 2087, dtype: object

## Cleaning 'TOTAL_TWITCH_PEAK' Column

This section cleans the 'TOTAL_TWITCH_PEAK' column by splitting it into two new columns: 'TWITCH_PEAK_HOUR' and 'TWITCH_PEAK_YEAR'. It handles missing values and converts the data to appropriate numeric types.

In [None]:
df['TOTAL_TWITCH_PEAK']

In [None]:
if 'TOTAL_TWITCH_PEAK' in df.columns:
    # Extract and convert the TWITCH_PEAK_HOUR
    df['TWITCH_PEAK_HOUR'] = df['TOTAL_TWITCH_PEAK'].apply(lambda x: x.split('\n')[0].replace(',', ''))
    df['TWITCH_PEAK_HOUR'] = pd.to_numeric(df['TWITCH_PEAK_HOUR'], errors='coerce')
    df['TWITCH_PEAK_HOUR'] = df['TWITCH_PEAK_HOUR'].fillna(-1)
    df['TWITCH_PEAK_HOUR'] = df['TWITCH_PEAK_HOUR'].astype(int)
   # df['TWITCH_PEAK_HOUR'] = df['TWITCH_PEAK_HOUR'].replace(-1, pd.NA)

    # Extract and convert the TWITCH_PEAK_YEAR
    df['TWITCH_PEAK_YEAR'] = df['TOTAL_TWITCH_PEAK'].apply(lambda x: x.split()[-1])
    df['TWITCH_PEAK_YEAR'] = pd.to_numeric(df['TWITCH_PEAK_YEAR'], errors='coerce')
    df['TWITCH_PEAK_YEAR'] = df['TWITCH_PEAK_YEAR'].fillna(-1)
    df['TWITCH_PEAK_YEAR'] = df['TWITCH_PEAK_YEAR'].astype(int)
    #df['TWITCH_PEAK_YEAR'] = df['TWITCH_PEAK_YEAR'].replace(-1, pd.NA)

    # Drop the original column
    df.drop(columns=['TOTAL_TWITCH_PEAK'], inplace=True)

In [None]:
df.info()

## Calculating 'TOTAL_REVIEW' Column

This section calculates a new 'TOTAL_REVIEW' column, which represents the proportion of positive reviews out of the total number of reviews (positive and negative). This helps in understanding the overall review distribution for each game.

In [54]:
df['TOTAL_REVIEW'] = df['POSITIVE_REVIEWS'] / (df['NEGATIVE_REVIEWS'] + df['POSITIVE_REVIEWS'])
df['TOTAL_REVIEW'].head()

0    0.886759
1    0.881440
2    0.925690
3    0.889818
4    0.926667
Name: TOTAL_REVIEW, dtype: float64

## Filling Missing Values in the RATING_SCORE Column

In [56]:
mean_rating = df['RATING_SCORE'].mean()
df.loc[df['RATING_SCORE'].isnull(), 'RATING_SCORE'] = mean_rating

## Handling 'PRICE' Column with Missing Values and Formatting

This section addresses the missing values and formatting issues in the 'PRICE' column. It involves filling missing prices from a secondary dataset, replacing placeholders and formatting symbols, and converting the column to a numeric type.

In [58]:
df['PRICE'] = df['PRICE'].replace('-', '$0')

In [59]:
df[df['PRICE'].isnull() == True]

Unnamed: 0,NAME,STORE_GENRE,RATING_SCORE,N_SUPPORTED_LANGUAGES,DEVELOPERS,POSITIVE_REVIEWS,NEGATIVE_REVIEWS,TECHNOLOGIES,TOTAL_TWITCH_PEAK,PRICE,24_HOUR_PEAK,PUBLISH_YEAR,Linux,Windows,macOS,TOTAL_REVIEW
4,TaskPals,"[ Indie, Free to Play]",8.0,1,lazarche,834,66,"[Detected Technologies (?), SDK.SteamworksNET]","3,778\nall-time peak 10 months ago18 July 2023",,318,2023,0,1,0,0.926667
5,U-ena -遠花火の少女-,"[ Adventure, Casual, Free to Play]",8.0,3,HemiolaStudio,582,41,"[Engine.Lightvn, Detected Technologies (?), En...",2 years ago,,0,2022,0,1,0,0.934189
9,Rocket League,"[ Action, Indie, Racing, Sports]",8.0,13,Psyonix LLC,508263,70214,"[Engine.Unreal, SDK.EpicOnlineServices]","147,632\nall-time peak 4 years ago27 September...",,32851,2015,1,1,1,0.878623
10,Wurroom,"[ Adventure, Casual, Free to Play, Indie]",8.0,1,Michael Rfdshir,603,43,"[Engine.Unity, Detected Technologies (?), Engi...",5 years ago,,0,2019,0,1,0,0.933437
11,Kitten adventures in city park,"[ Adventure, Indie]",8.0,6,Lokator Studio,541,37,"[Engine.PyGame, Engine.RenPy, Detected Technol...",30\nall-time peak 3 years ago27 November 2021,,1,2017,1,1,1,0.935986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,WTF Do You Know?,"[ Indie, RPG, Simulation]",8.0,1,Randumb Studios,122,6,[Engine.RPGMaker],10\nall-time peak 2 years ago2 May 2022,,1,2022,0,1,0,0.953125
2238,From Madness with Love,"[ Adventure, Casual, Indie]",8.0,4,Jamsanpoid,223,17,[Engine.TyranoBuilder],132\nall-time peak 11 months ago18 July 2023,,14,2023,0,1,1,0.929167
2240,The Forest Quartet,"[ Adventure, Indie]",8.0,14,Mads & Friends,205,15,[Engine.Unity],10\nall-time peak 2 years ago9 December 2022,,3,2022,0,1,0,0.931818
2241,Purple Place - Classic Games,"[ Casual, Indie, Simulation]",8.0,26,Sam Cohle,1223,150,[Engine.Unity],58\nall-time peak 7 months ago22 October 2023,,17,2023,1,1,1,0.890750


In [60]:
null_count = df['PRICE'].isnull().sum()

print("Null count in PRICE column:", null_count)

Null count in PRICE column: 252


In [61]:
# Drop rows with null values in the 'PRICE' column
#df.dropna(subset=['PRICE'], inplace=True)
df['PRICE'] = df['PRICE'].fillna('$0')

In [62]:
df[df['PRICE'].isnull() == True]

Unnamed: 0,NAME,STORE_GENRE,RATING_SCORE,N_SUPPORTED_LANGUAGES,DEVELOPERS,POSITIVE_REVIEWS,NEGATIVE_REVIEWS,TECHNOLOGIES,TOTAL_TWITCH_PEAK,PRICE,24_HOUR_PEAK,PUBLISH_YEAR,Linux,Windows,macOS,TOTAL_REVIEW


In [63]:
null_count = df['PRICE'].isnull().sum()

print("Null count in PRICE column:", null_count)

Null count in PRICE column: 0


In [64]:
#df['PRICE'] = df['PRICE'].str.replace('€', '.')

#### Extract the price from the "price" column using regex


In [65]:
price_pattern = r'\$([\d.]+)'

df['PRICE'] = df['PRICE'].str.extract(price_pattern)

In [66]:
df.head()

Unnamed: 0,NAME,STORE_GENRE,RATING_SCORE,N_SUPPORTED_LANGUAGES,DEVELOPERS,POSITIVE_REVIEWS,NEGATIVE_REVIEWS,TECHNOLOGIES,TOTAL_TWITCH_PEAK,PRICE,24_HOUR_PEAK,PUBLISH_YEAR,Linux,Windows,macOS,TOTAL_REVIEW
0,Dragon's Dogma: Dark Arisen,"[ Action, Adventure, RPG]",8.0,7,Capcom,39271,5015,[],"27,368\nall-time peak 9 years ago17 January 2016",22.49,1064,2016,0,1,0,0.886759
1,Forza Horizon 5,"[ Action, Adventure, Racing, Simulation, Sports]",8.0,24,Playground Games,156683,21075,[],"81,096\nall-time peak 3 years ago13 November 2021",32.78,16131,2021,0,1,0,0.88144
2,Thrive,"[ Casual, Indie, Simulation, Early Access]",8.0,1,Revolutionary Games Studio,872,70,[Engine.Godot],145\nall-time peak 2 years ago24 July 2022,2.99,21,2021,1,1,0,0.92569
3,Layers of Fear (2016),"[ Adventure, Indie]",8.0,12,Bloober Team SA,22766,2819,[Engine.Unity],"145,384\nall-time peak 6 years ago13 June 2018",10.19,31,2016,1,1,1,0.889818
4,TaskPals,"[ Indie, Free to Play]",8.0,1,lazarche,834,66,"[Detected Technologies (?), SDK.SteamworksNET]","3,778\nall-time peak 10 months ago18 July 2023",0.0,318,2023,0,1,0,0.926667


In [67]:
df[df['PRICE'].isnull() == True]

Unnamed: 0,NAME,STORE_GENRE,RATING_SCORE,N_SUPPORTED_LANGUAGES,DEVELOPERS,POSITIVE_REVIEWS,NEGATIVE_REVIEWS,TECHNOLOGIES,TOTAL_TWITCH_PEAK,PRICE,24_HOUR_PEAK,PUBLISH_YEAR,Linux,Windows,macOS,TOTAL_REVIEW
511,Don't Pee,"[ Adventure, Simulation]",8.000000,1,Naughty Clogs,163,11,[Engine.Unreal],23\nall-time peak 4 months ago11 February 2024,,2,2024,0,1,0,0.936782
512,Ratopia,"[ Adventure, Casual, Indie, Simulation, Strate...",8.000000,8,Cassel Games,2043,275,[Engine.Unity],"5,772\nall-time peak 7 months ago14 November 2023",,256,2023,0,1,1,0.881363
513,Command & Conquer™ Generals Zero Hour,"[ Action, Strategy]",8.000000,9,EA Los Angeles,1384,177,[],"2,353\nall-time peak 2 months ago17 March 2024",,1200,2024,0,1,0,0.886611
516,Team Fortress Classic,[ Action],8.000000,9,Valve,7079,1070,[Engine.GoldSource],"1,897\nall-time peak 20 years ago9 August 2004",,95,1999,1,1,1,0.868696
517,Make Way,"[ Action, Casual, Indie, Racing]",8.000000,10,Ice BEAM,418,42,[Engine.Unity],347\nall-time peak last month28 April 2024,,155,2023,0,1,0,0.908696
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,ISEKAI QUEST,[],8.391241,1,Studio Ginkgo,3235,438,[Engine.Unity],4 years ago,,0,2020,0,1,1,0.880751
625,Street Legal Racing: Redline v2.3.1,[],8.391241,1,Invictus Games Ltd.,3304,448,[],7 years ago,,0,2016,0,1,0,0.880597
627,Dead Grid,"[ Indie, RPG, Strategy, Early Access]",8.000000,1,ATOM VOID,264,21,[Engine.Unity],194\nall-time peak 2 years ago26 March 2022,,4,2022,0,1,0,0.926316
629,Nexomon,"[ Adventure, Casual, Indie, RPG, Simulation]",8.000000,1,VEWO Interactive Inc.,1314,159,[Engine.Unity],208\nall-time peak 4 years ago12 July 2020,,17,2020,0,1,1,0.892057


In [68]:
df.dropna(subset=['PRICE'], inplace=True)

In [69]:
df['PRICE'] = df['PRICE'].astype(float)

## Final Dataset Preparation

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2008 entries, 0 to 2258
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   NAME                   2008 non-null   object 
 1   STORE_GENRE            2008 non-null   object 
 2   RATING_SCORE           2008 non-null   float64
 3   N_SUPPORTED_LANGUAGES  2008 non-null   int32  
 4   DEVELOPERS             2008 non-null   object 
 5   POSITIVE_REVIEWS       2008 non-null   int64  
 6   NEGATIVE_REVIEWS       2008 non-null   int64  
 7   TECHNOLOGIES           2008 non-null   object 
 8   TOTAL_TWITCH_PEAK      2008 non-null   object 
 9   PRICE                  2008 non-null   float64
 10  24_HOUR_PEAK           2008 non-null   int32  
 11  PUBLISH_YEAR           2008 non-null   int32  
 12  Linux                  2008 non-null   int64  
 13  Windows                2008 non-null   int64  
 14  macOS                  2008 non-null   int64  
 15  TOTAL_REV

In [71]:
df.to_csv('preprocessed_game_info.csv', index=False)