# Editing Data

Imports for the project

In [51]:
import pandas as pd
import matplotlib as mpl
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Load the CSV into a DataFrame

In [52]:
# Read the raw games table; the path is resolved relative to the working directory.
df = pd.read_csv("games.csv")

In [53]:
df.shape

(61326, 18)

Create a working copy of the data (without the first column), so the raw DataFrame `df` stays untouched as a backup

In [54]:
# Working copy of the raw frame minus its first column (position 0);
# `.copy()` keeps `df` itself unmodified by the cleaning steps that follow.
# NOTE(review): the skipped column is presumably a saved row index - confirm against games.csv.
new_df = df.iloc[:,1:].copy()

In [55]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61326 entries, 0 to 61325
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Game            61326 non-null  object 
 1   Pos             61326 non-null  int64  
 2   Genre           61326 non-null  object 
 3   Console         61326 non-null  object 
 4   Publisher       61326 non-null  object 
 5   Developer       61300 non-null  object 
 6   VGChartz Score  1216 non-null   float64
 7   Critic Score    6554 non-null   float64
 8   User Score      391 non-null    float64
 9   Total Shipped   3842 non-null   object 
 10  Total Sales     18918 non-null  object 
 11  NA Sales        12639 non-null  object 
 12  PAL Sales       12824 non-null  object 
 13  Japan Sales     6794 non-null   object 
 14  Other_Sales     15163 non-null  object 
 15  Release_Date    56668 non-null  object 
 16  Last_Update     15078 non-null  object 
dtypes: float64(3), int64(1), object

When both the 'Total Shipped' and 'Total Sales' fields are null, we delete the row:
our prediction question needs a value in at least one of these two fields

In [56]:
# Keep only rows that have at least one of the two sales totals.
new_df = new_df.dropna(subset=['Total Shipped', 'Total Sales'], how='all')

After checking the data we saw only a small number of null values,
so we delete rows where both 'Release_Date' and 'Last_Update' are null, as well as rows with a null 'Developer' - this does not harm our data processing

In [57]:
# Drop rows that have neither date field, then rows without a developer.
new_df = (
    new_df
    .dropna(subset=['Release_Date', 'Last_Update'], how='all')
    .dropna(subset=['Developer'])
)

'Game' and 'Console' are critical fields for our prediction question,
so rows where either of them is missing are irrelevant and are dropped

In [58]:
# A row is removed when either key field is missing (default how='any').
new_df = new_df.dropna(subset=['Game', 'Console'])

Merge 'Total Shipped' and 'Total Sales' into a single 'Total_Sales' column (every remaining row has a value in at least one of them); the two source fields are deleted afterwards

In [59]:
# Merge the two totals into one column by coalescing: take 'Total Shipped' and
# fall back to 'Total Sales' where it is missing. Concatenating the two strings
# would yield a garbled value such as "1.00m2.00m" for any row where both
# fields are filled in, which would then be parsed into a wrong number.
# NOTE(review): preferring 'Total Shipped' when both exist is a choice - confirm.
new_df['Total_Sales'] = new_df['Total Shipped'].fillna(new_df['Total Sales'])

Delete the 2 fields after merging

In [60]:
# The two source columns are redundant once 'Total_Sales' exists.
new_df = new_df.drop(columns=['Total Sales', 'Total Shipped'])

Calculate the number of duplicate rows by 'Game' & 'Console'

In [61]:
new_df.duplicated(['Game','Console']).sum()

5160

Count the nulls in each row and sort by that count, so that when dropping duplicates we keep, for each 'Game' & 'Console' pair, the row with the fewest null values

In [62]:
# De-duplicate on (Game, Console), keeping the row with the fewest missing
# fields: add a temporary per-row null count, sort ascending by it so the most
# complete row comes first, drop later duplicates, then restore the original
# row order, renumber the index, and discard the helper column.
new_df = (
    new_df
    .assign(sum_of_null=new_df.isna().sum(axis=1))
    .sort_values(by=['sum_of_null'])
    .drop_duplicates(['Game', 'Console'], keep='first')
    .sort_index()
    .reset_index(drop=True)
    .drop(columns='sum_of_null')
)

Checking there are no duplicates

In [63]:
new_df.duplicated(['Game','Console']).sum()

0

Due to the large number of missing values, we drop the three score columns, replace null regional sales with 0, and fill missing release dates from 'Last_Update'

In [64]:
# Remove the score columns and zero-fill the regional sales columns.
new_df = (
    new_df
    .drop(columns=['VGChartz Score', 'Critic Score', 'User Score'])
    .fillna({'NA Sales': 0, 'PAL Sales': 0, 'Japan Sales': 0, 'Other_Sales': 0})
)
# Where the release date is unknown, fall back to the last-update date, which
# is not needed as a separate column afterwards.
new_df['Release_Date'] = new_df['Release_Date'].fillna(new_df['Last_Update'])
new_df = new_df.drop(columns='Last_Update')

Convert the sales columns from strings to integers, for example "51.00m" -> 51,000,000

In [65]:
def func(x):
    """Convert a sales string expressed in millions into an integer count.

    Parameters
    ----------
    x : object
        A string such as "51.00m" (optional trailing 'm'/'M', optional
        thousands separators), or any non-string placeholder such as 0 or NaN.

    Returns
    -------
    int
        The value scaled to units, e.g. "51.00m" -> 51000000. Non-string
        input and empty strings yield 0.

    Raises
    ------
    ValueError
        If a non-empty string does not contain a parseable number.
    """
    if not isinstance(x, str):
        return 0
    text = x.strip().replace(',', '')
    if text[-1:] in ('m', 'M'):
        text = text[:-1]
    if not text:
        # An empty string is a blank placeholder, not a malformed number.
        return 0
    # Parse the decimal value itself instead of stripping non-digits, so the
    # result does not depend on the number of decimal places ("1.5m", "2m").
    # round() absorbs binary floating-point error before truncating to int.
    return int(round(float(text) * 1000000))
# Run every sales column through func(), then normalise the remaining
# space-separated column names to snake_case. Keys that are absent from the
# frame are ignored by rename().
new_df = (
    new_df
    .assign(**{
        col: new_df[col].map(func)
        for col in ('NA Sales', 'PAL Sales', 'Japan Sales', 'Other_Sales', 'Total_Sales')
    })
    .rename(columns={
        'NA Sales': 'NA_Sales',
        'PAL Sales': 'PAL_Sales',
        'Japan Sales': 'Japan_Sales',
        'User Score': 'User_Score',
        'Critic Score': 'Critic_Score',
        'VGChartz Score': 'VGChartz_Score',
    })
)

In [66]:
new_df['Total_Sales'].median()

170000.0

Replace the month names with numeric values and split the 'Release_Date' field into separate month and year columns for the machine learning phase

In [67]:
# Three-letter English month abbreviation -> month number (1-12).
test = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
        start=1,
    )
}


def str_to_month(date):
    """Return the month number for a space-separated date string.

    The second space-delimited token of ``date`` is looked up in ``test``,
    e.g. "22nd Mar 05" -> 3. An unknown abbreviation raises ``KeyError``.
    """
    return test[date.split(" ")[1]]

def str_to_year(date, pivot=30):
    """Return the four-digit year from a space-separated date string.

    Parameters
    ----------
    date : str
        A date whose third space-delimited token is the year, e.g.
        "22nd Mar 05".
    pivot : int, optional
        Two-digit years greater than ``pivot`` are placed in the 1900s; the
        rest are placed in the 2000s. Defaults to 30.

    Returns
    -------
    int
        The full year, e.g. "22nd Mar 05" -> 2005 and "1st Oct 81" -> 1981.
    """
    year = int(date.split(" ")[2])
    if year >= 100:
        # Already a full year; adding a century would corrupt it.
        return year
    if year > pivot:
        return year + 1900
    return year + 2000
    
# Derive numeric month and year columns from the textual release date, then
# discard the original string column.
new_df = (
    new_df
    .assign(
        Relese_Month=new_df['Release_Date'].map(str_to_month),
        Relese_Year=new_df['Release_Date'].map(str_to_year),
    )
    .drop(columns='Release_Date')
)

Delete all games released before 1980

In [68]:
# Remove every row whose release year is earlier than 1980.
new_df = new_df.drop(index=new_df.index[new_df['Relese_Year'] < 1980])

Add a 'Hit_Games' column and set it to 1 for every game whose total sales are at least 1m (0 otherwise)

In [69]:
def hit_games(sales):
    """Return 1 when ``sales`` is at least 1,000,000, otherwise 0."""
    return 1 if sales >= 1000000 else 0
# Binary target column: flag each game by its total sales.
new_df['Hit_Games'] = new_df['Total_Sales'].map(hit_games)

In [70]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17487 entries, 0 to 17509
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Game          17487 non-null  object
 1   Pos           17487 non-null  int64 
 2   Genre         17487 non-null  object
 3   Console       17487 non-null  object
 4   Publisher     17487 non-null  object
 5   Developer     17487 non-null  object
 6   NA_Sales      17487 non-null  int64 
 7   PAL_Sales     17487 non-null  int64 
 8   Japan_Sales   17487 non-null  int64 
 9   Other_Sales   17487 non-null  int64 
 10  Total_Sales   17487 non-null  int64 
 11  Relese_Month  17487 non-null  int64 
 12  Relese_Year   17487 non-null  int64 
 13  Hit_Games     17487 non-null  int64 
dtypes: int64(9), object(5)
memory usage: 2.0+ MB


In [71]:
new_df.shape

(17487, 14)

In [72]:
new_df

Unnamed: 0,Game,Pos,Genre,Console,Publisher,Developer,NA_Sales,PAL_Sales,Japan_Sales,Other_Sales,Total_Sales,Relese_Month,Relese_Year,Hit_Games
0,God of War,1,Action,Series,Sony Computer Entertainment,SIE Santa Monica Studio,0,0,0,0,51000000,3,2005,1
1,Warriors,2,Action,Series,KOEI,Omega Force,0,0,0,0,47820000,6,1997,1
2,Devil May Cry,3,Action,Series,Capcom,Capcom,0,0,0,0,25200000,10,2001,1
3,Dynasty Warriors,4,Action,Series,KOEI,Omega Force,0,0,0,0,21150000,6,1997,1
4,Frogger,5,Action,Series,Konami,Konami,0,0,0,0,20000000,10,1981,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17505,Wand of Fortune 2 FD: Kimi ni Sasageru Epilogue,461,Visual+Novel,PSP,Idea Factory,Idea Factory,0,0,30000,0,30000,11,2012,0
17506,Infinite Stratos 2: Ignition Hearts,462,Visual+Novel,PSV,5pb,5pb. Games,0,0,30000,0,30000,2,2014,0
17507,Shin Hayarigami,463,Visual+Novel,PS3,Nippon Ichi Software,Nippon Ichi Software,0,0,40000,0,40000,8,2014,0
17508,Root Letter,464,Visual+Novel,PS4,PQube,Kadokawa Games,0,0,30000,0,30000,11,2016,0


In [73]:
new_df.describe(include = "all")

Unnamed: 0,Game,Pos,Genre,Console,Publisher,Developer,NA_Sales,PAL_Sales,Japan_Sales,Other_Sales,Total_Sales,Relese_Month,Relese_Year,Hit_Games
count,17487,17487.0,17487,17487,17487,17487,17487.0,17487.0,17487.0,17487.0,17487.0,17487.0,17487.0,17487.0
unique,12894,,18,47,1078,3297,,,,,,,,
top,Minecraft,,Action,PC,Ubisoft,Unknown,,,,,,,,
freq,11,,3212,2604,833,490,,,,,,,,
mean,,2107.135701,,,,,138874.0,79487.05,27123.01,26922.86,1302763.0,7.099159,2008.092755,0.173615
std,,1972.811022,,,,,378199.6,296874.6,94984.65,100798.6,9792054.0,3.477465,6.875727,0.378789
min,,1.0,,,,,0.0,0.0,0.0,0.0,0.0,1.0,1980.0,0.0
25%,,387.0,,,,,0.0,0.0,0.0,0.0,50000.0,4.0,2004.0,0.0
50%,,1561.0,,,,,10000.0,0.0,0.0,0.0,170000.0,8.0,2009.0,0.0
75%,,3299.0,,,,,130000.0,40000.0,10000.0,20000.0,550000.0,10.0,2013.0,0.0


In [74]:
# Write the cleaned frame to disk. No `index` argument is passed, so pandas'
# default applies and the row index is saved as the first, unnamed column.
new_df.to_csv("clean_data.csv")

In [None]:
##################################################################