In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy import stats as st

# Introducción




# Abre el archivo de datos y estudia la información general **(Parte 1)**

In [2]:
# Load the games dataset from the raw CSV hosted on GitHub (needs network access).
url = 'https://raw.githubusercontent.com/Davichobacter/data_science_tt/refs/heads/main/sprint_6/games.csv'

df = pd.read_csv(url)

In [4]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df.sample(15, random_state=42)

Unnamed: 0,Name,Platform,Year_of_Release,Genre,NA_sales,EU_sales,JP_sales,Other_sales,Critic_Score,User_Score,Rating
7256,KORG DS-10 Synthesizer,DS,2008.0,Misc,0.19,0.01,0.0,0.02,82.0,8.2,E
5857,The Dukes of Hazzard II: Daisy Dukes It Out,PS,,Racing,0.17,0.11,0.0,0.02,53.0,tbd,E
1708,Mario & Sonic at the London 2012 Olympic Games,3DS,2012.0,Sports,0.18,0.64,0.27,0.09,,,
1762,Wario Land: Shake It!,Wii,2008.0,Platform,0.59,0.31,0.15,0.1,,,
2879,Power Rangers: Dino Thunder,PS2,2004.0,Action,0.35,0.27,0.0,0.09,49.0,6.2,E
5783,Demolition Racer,PS,1998.0,Racing,0.17,0.12,0.0,0.02,,,
10629,L'Aigle de Guerre,GBA,2001.0,Strategy,0.0,0.0,0.1,0.0,,,
4371,F.E.A.R. 3,X360,2011.0,Shooter,0.3,0.11,0.01,0.04,75.0,7,M
15569,UFO: Trilogy,PC,2007.0,Strategy,0.0,0.02,0.0,0.0,,,
3251,Bolt,DS,2008.0,Adventure,0.35,0.21,0.0,0.06,,,


In [5]:
# Summarize column dtypes and non-null counts of the frame as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16713 non-null  object 
 1   Platform         16715 non-null  object 
 2   Year_of_Release  16446 non-null  float64
 3   Genre            16713 non-null  object 
 4   NA_sales         16715 non-null  float64
 5   EU_sales         16715 non-null  float64
 6   JP_sales         16715 non-null  float64
 7   Other_sales      16715 non-null  float64
 8   Critic_Score     8137 non-null   float64
 9   User_Score       10014 non-null  object 
 10  Rating           9949 non-null   object 
dtypes: float64(6), object(5)
memory usage: 1.4+ MB


# Descripción del Dataset

Basado en la información general obtenida en la **Parte 1**, el dataset contiene información sobre videojuegos. Las columnas presentes son:

- `Name`: Nombre del juego (Object)
- `Platform`: Plataforma del juego (Object)
- `Year_of_Release`: Año de lanzamiento (Float64)
- `Genre`: Género del juego (Object)
- `NA_sales`: Ventas en Norteamérica (Float64)
- `EU_sales`: Ventas en Europa (Float64)
- `JP_sales`: Ventas en Japón (Float64)
- `Other_sales`: Ventas en otras regiones (Float64)
- `Critic_Score`: Puntuación de los críticos (Float64)
- `User_Score`: Puntuación de los usuarios (Object)
- `Rating`: Clasificación ESRB (Object)

Se han identificado valores nulos en las columnas `Name`, `Year_of_Release`, `Genre`, `Critic_Score`, `User_Score` y `Rating`. La columna `User_Score` tiene el tipo de dato 'Object' y contiene el valor 'tbd', por lo que deberá convertirse a tipo numérico después de tratar ese valor especial.

# Sugerencias para la Preparación de Datos

Para preparar los datos adecuadamente, se sugieren los siguientes pasos:

1.  **Manejar valores nulos**:
    *   Identificar y decidir cómo manejar los valores nulos en las columnas. Para `Year_of_Release`, se podría considerar imputar con la mediana o moda, o incluso eliminar filas si el número de nulos es pequeño. Para las columnas de puntuación (`Critic_Score`, `User_Score`) y `Rating`, que tienen un número considerable de valores nulos, se deberá evaluar si es mejor imputar (considerando la distribución) o si es aceptable mantener los nulos para ciertos análisis, o incluso eliminar la columna si la cantidad de nulos es muy alta y no hay una estrategia de imputación clara. Las filas con valores nulos en `Name` y `Genre` probablemente puedan eliminarse dado su bajo recuento.
2.  **Convertir tipos de datos**:
    *   Convertir la columna `Year_of_Release` a tipo entero (`int`) ya que representa años.
    *   Convertir la columna `User_Score` a tipo numérico (`float`). Es necesario manejar el valor 'tbd' antes de la conversión, posiblemente reemplazándolo por `NaN`.
3.  **Manejar duplicados**:
    *   Verificar si existen filas duplicadas en el dataset y eliminarlas si las hay para asegurar la unicidad de las observaciones.
4.  **Calcular ventas totales**:
    *   Crear una nueva columna que represente las ventas totales sumando las ventas de todas las regiones (`NA_sales`, `EU_sales`, `JP_sales`, `Other_sales`). Esta columna será útil para los análisis posteriores.

# Prepara los datos **(Parte 2)**

## Reemplaza los nombres de las columnas

In [7]:
def cols_names_lower(df):
    """Lower-case every column label of ``df``.

    The labels are replaced on the frame that was passed in (no copy is
    made), and that same frame is returned.
    """
    lowered_labels = []
    for label in df.columns:
        lowered_labels.append(label.lower())
    df.columns = lowered_labels
    return df

In [8]:
# Apply the lower-casing helper and confirm the new column labels.
df = cols_names_lower(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             16713 non-null  object 
 1   platform         16715 non-null  object 
 2   year_of_release  16446 non-null  float64
 3   genre            16713 non-null  object 
 4   na_sales         16715 non-null  float64
 5   eu_sales         16715 non-null  float64
 6   jp_sales         16715 non-null  float64
 7   other_sales      16715 non-null  float64
 8   critic_score     8137 non-null   float64
 9   user_score       10014 non-null  object 
 10  rating           9949 non-null   object 
dtypes: float64(6), object(5)
memory usage: 1.4+ MB


## Tratar los valores nulos

### ['name']

In [12]:
# Show the rows that have no game name.
df.loc[df['name'].isna()]

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
659,,GEN,1993.0,,1.78,0.53,0.0,0.08,,,
14244,,GEN,1993.0,,0.0,0.0,0.03,0.0,,,


In [13]:
# Drop the rows without a name; this modifies df in place, so re-running the
# earlier cells is required to get the removed rows back.
df.dropna(subset=['name'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16713 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             16713 non-null  object 
 1   platform         16713 non-null  object 
 2   year_of_release  16444 non-null  float64
 3   genre            16713 non-null  object 
 4   na_sales         16713 non-null  float64
 5   eu_sales         16713 non-null  float64
 6   jp_sales         16713 non-null  float64
 7   other_sales      16713 non-null  float64
 8   critic_score     8137 non-null   float64
 9   user_score       10014 non-null  object 
 10  rating           9949 non-null   object 
dtypes: float64(6), object(5)
memory usage: 1.5+ MB


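Se eliminaron las filas con valores nulos en la columna `name`, ya que solo eran dos y no proporcionaban información útil para el análisis. Esas mismas dos filas eran las únicas con `genre` nulo, por lo que esta columna también queda sin valores ausentes.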

### [year_of_release]

In [15]:
# Show the rows whose release year is missing.
df.loc[df['year_of_release'].isna()]

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
183,Madden NFL 2004,PS2,,Sports,4.26,0.26,0.01,0.71,94.0,8.5,E
377,FIFA Soccer 2004,PS2,,Sports,0.59,2.36,0.04,0.51,84.0,6.4,E
456,LEGO Batman: The Videogame,Wii,,Action,1.80,0.97,0.00,0.29,74.0,7.9,E10+
475,wwe Smackdown vs. Raw 2006,PS2,,Fighting,1.57,1.02,0.00,0.41,,,
609,Space Invaders,2600,,Shooter,2.36,0.14,0.00,0.03,,,
...,...,...,...,...,...,...,...,...,...,...,...
16373,PDC World Championship Darts 2008,PSP,,Sports,0.01,0.00,0.00,0.00,43.0,tbd,E10+
16405,Freaky Flyers,GC,,Racing,0.01,0.00,0.00,0.00,69.0,6.5,T
16448,Inversion,PC,,Shooter,0.01,0.00,0.00,0.00,59.0,6.7,M
16458,Hakuouki: Shinsengumi Kitan,PS3,,Adventure,0.01,0.00,0.00,0.00,,,


In [20]:
# Summary statistics of the known release years (NaN values are excluded).
df['year_of_release'].describe()


Unnamed: 0,year_of_release
count,16444.0
mean,2006.486256
std,5.875525
min,1980.0
25%,2003.0
50%,2007.0
75%,2010.0
max,2016.0


In [21]:
# Most frequent release year across the whole dataset.
df['year_of_release'].mode()

Unnamed: 0,year_of_release
0,2008.0


In [17]:
def mode_platform(df, platform):
    """Return the most frequent release year recorded for ``platform``.

    Args:
        df: DataFrame with ``platform`` and ``year_of_release`` columns.
        platform: Platform label to look up.

    Returns:
        The modal ``year_of_release`` among the rows of that platform (the
        smallest one when several years tie), or NaN when the platform has
        no rows or none of its rows has a known year.
    """
    years = df.loc[df['platform'] == platform, 'year_of_release']
    modes = years.mode()  # NaN is ignored; tied modes come back sorted
    if modes.empty:
        # Indexing an empty result with [0] would raise KeyError; report the
        # year as unknown instead so callers can leave the value missing.
        return float('nan')
    return modes.iloc[0]

In [18]:
# Spot-check the helper on a single platform.
mode_platform(df, 'X360')

np.float64(2011.0)

In [22]:
# Fill each missing release year with the most frequent year of the game's
# platform. The mode is looked up once per platform (instead of once per
# missing row) and written back with one masked assignment per platform.
missing_year = df['year_of_release'].isna()
for platform in df.loc[missing_year, 'platform'].unique():
    platform_rows = missing_year & (df['platform'] == platform)
    df.loc[platform_rows, 'year_of_release'] = mode_platform(df, platform)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16713 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             16713 non-null  object 
 1   platform         16713 non-null  object 
 2   year_of_release  16713 non-null  float64
 3   genre            16713 non-null  object 
 4   na_sales         16713 non-null  float64
 5   eu_sales         16713 non-null  float64
 6   jp_sales         16713 non-null  float64
 7   other_sales      16713 non-null  float64
 8   critic_score     8137 non-null   float64
 9   user_score       10014 non-null  object 
 10  rating           9949 non-null   object 
dtypes: float64(6), object(5)
memory usage: 2.0+ MB


In [23]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df.sample(15, random_state=42)

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
10035,Pocket Soccer League: Calciobit,3DS,2012.0,Sports,0.0,0.0,0.11,0.0,,,
13201,Plants vs. Zombies,PC,2009.0,Strategy,0.0,0.04,0.0,0.01,87.0,8.9,E10+
9060,Cake Mania 3,DS,2009.0,Simulation,0.13,0.0,0.0,0.01,,tbd,E
5128,Slime MoriMori Dragon Quest: Shougeki No Shipp...,GBA,2003.0,Adventure,0.0,0.0,0.36,0.01,,,
12585,Power Pro Success Legends,PSP,2010.0,Sports,0.0,0.0,0.06,0.0,,,
9097,Yu-Gi-Oh! Monster Capture GB,GB,2000.0,Role-Playing,0.0,0.0,0.14,0.0,,,
10840,Ultimate Marvel vs. Capcom 3,PS3,2011.0,Fighting,0.0,0.04,0.04,0.01,80.0,6.2,T
11279,Tokimeki Memorial: Taisen Pazurudama,SAT,1996.0,Puzzle,0.0,0.0,0.08,0.0,,,
13083,Ontamarama,DS,2007.0,Action,0.05,0.0,0.0,0.0,71.0,tbd,E
14450,Bad Apple Wars,PSV,2015.0,Action,0.0,0.0,0.03,0.0,,,


## Convierte los datos en los tipos necesarios.


### year_of_release

In [10]:
# Run only after the missing years have been filled in: astype(int) raises
# IntCastingNaNError while the column still contains NaN.
df['year_of_release'] = df['year_of_release'].astype(int)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer