In [36]:
import pandas as pd
import numpy as np


In [37]:
df = pd.read_csv('source_clean_dataset.csv')
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


Reducimos a 200 filas

In [38]:
df = df.sample(n=200, random_state=66).copy()

1. Missing data: Remove or leave blank some values randomly.



In [39]:
# Introducir np.nan en 1% de la columna Year
sample_nan_year = df['Year'].sample(frac=0.02, random_state=42).index
df.loc[sample_nan_year, 'Year'] = np.nan

In [40]:
# Introducir el string 'Unknown' en otro 0.5% de la columna Year
sample_unknown_year = df['Year'].sample(frac=0.02, random_state=42).index
df.loc[sample_unknown_year, 'Year'] = 'Unknown'

  df.loc[sample_unknown_year, 'Year'] = 'Unknown'


In [41]:
# Introducir string vacío ("") en 1% de la columna Publisher
sample_empty_pub = df['Publisher'].sample(frac=0.01, random_state=42).index
df.loc[sample_empty_pub, 'Publisher'] = ""


In [42]:
# Introducir string "N/A" en otro 0.5% de la columna Publisher
sample_na_pub = df['Publisher'].sample(frac=0.01, random_state=42).index
df.loc[sample_na_pub, 'Publisher'] = "N/A"


2. Duplicated rows: Add exact or partial duplicates.



In [43]:
sample_exact_duplicates = df.sample(frac=0.2, random_state=123)
df = pd.concat([df, sample_exact_duplicates], ignore_index=True)


3. Outliers: Insert extreme values in numeric columns.



In [44]:
#Global_Sales
sample_outliers_sales = df.sample(n=3, random_state=789).index

df.loc[sample_outliers_sales, 'Global_Sales'] = 9999.0
df.loc[sample_outliers_sales, 'NA_Sales'] = 5000.0 


#Year
sample_outliers_year = df.sample(n=5, random_state=321).index

df.loc[sample_outliers_year, 'Year'] = 2050.0

4. Format inconsistencies: Change date formats, numeric formats, or units.



In [45]:
sample_unit_error = df['Global_Sales'].sample(frac=0.02, random_state=111).index

# Multiplicamos por 1000, cambiando la unidad de millones a miles
df.loc[sample_unit_error, 'EU_Sales'] = df.loc[sample_unit_error, 'EU_Sales'] * 1000

5. Typographical errors: Introduce spelling mistakes in categorical columns.



In [46]:
# Reemplazamos TODAS las instancias de 'Sports' por 'Sprots'
df.loc[df['Genre'] == 'Sports', 'Genre'] = 'Sprots'

6. Extra categories: Add unusual values in categorical columns to simulate errors.



In [47]:
sample_publisher_errors = df['Publisher'].sample(frac=0.04, random_state=555).index

error_choices = ['PERROSANXE', 'FRIJOL', 'FACHOESFERA', 'GILIPROGRES']
df.loc[sample_publisher_errors, 'Publisher'] = np.random.choice(error_choices, size=len(sample_publisher_errors))

7. Incorrect data types: Store numbers as strings or vice versa.



In [48]:
sample_rank_string = df['Rank'].sample(frac=0.1, random_state=777).index

def rank_to_string_with_suffix(rank):
    # Aseguramos que solo aplicamos esto a números válidos
    if pd.notna(rank) and isinstance(rank, (int, float)):
        rank_int = int(rank)
        if rank_int % 10 == 1 and rank_int % 100 != 11:
            suffix = 'st'
        elif rank_int % 10 == 2 and rank_int % 100 != 12:
            suffix = 'nd'
        elif rank_int % 10 == 3 and rank_int % 100 != 13:
            suffix = 'rd'
        else:
            suffix = 'th'
        return str(rank_int) + suffix
    return rank

df.loc[sample_rank_string, 'Rank'] = df.loc[sample_rank_string, 'Rank'].apply(rank_to_string_with_suffix)

 '4631st' '7057th' '11456th' '3104th' '16143rd' '16246th' '10562nd'
 '8456th' '1491st' '718th' '2549th' '11111th' '14588th' '12709th' '7834th'
 '12196th' '3245th']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[sample_rank_string, 'Rank'] = df.loc[sample_rank_string, 'Rank'].apply(rank_to_string_with_suffix)


8. No default codification of file (utf-8?)



In [49]:
sample_encoding_error = df['Name'].sample(frac=0.02, random_state=888).index

def introduce_encoding_chars(name):
    if pd.notna(name):
        name = str(name).replace('o', 'ó', 1)
        name = name.replace('e', 'é', 1)
    return name

df.loc[sample_encoding_error, 'Name'] = df.loc[sample_encoding_error, 'Name'].apply(introduce_encoding_chars)

9. Incorrect headers



In [50]:
new_columns = {}
for col in df.columns:
    if col in ['Rank', 'Name']:
        new_columns[col] = f"{col}_"
    elif col == 'Platform':
        new_columns[col] = f"{col.lower()}!!!!!!!!!!!"
    else:
        new_columns[col] = col

df.rename(columns=new_columns, inplace=True)

print(list(df.columns))

['Rank_', 'Name_', 'platform!!!!!!!!!!!', 'Year', 'Genre', 'Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']


10. Extra punctuation symbols (1000€)

In [51]:
sample_format_error = df['EU_Sales'].sample(frac=0.3, random_state=222).index

def format_as_yen(value):
    if pd.notna(value):
        return "¥" + f"{value:.2f}"
    return value

df.loc[sample_format_error, 'EU_Sales'] = df.loc[sample_format_error, 'EU_Sales'].apply(format_as_yen)    

 '¥0.03' '¥0.06' '¥0.03' '¥0.02' '¥0.00' '¥0.04' '¥0.00' '¥0.01' '¥0.00'
 '¥0.04' '¥0.00' '¥0.01' '¥0.70' '¥10.00' '¥0.06' '¥0.10' '¥0.13' '¥0.00'
 '¥0.25' '¥0.05' '¥0.05' '¥0.00' '¥0.42' '¥0.01' '¥0.04' '¥0.07' '¥0.34'
 '¥0.00' '¥0.00' '¥0.07' '¥0.07' '¥0.00' '¥0.04' '¥0.00' '¥0.19' '¥0.04'
 '¥0.02' '¥0.06' '¥0.06' '¥3.90' '¥0.06' '¥0.01' '¥0.20' '¥0.03' '¥0.85'
 '¥0.20' '¥0.30' '¥0.01' '¥0.00' '¥0.85' '¥0.00' '¥1.16' '¥0.05' '¥0.08'
 '¥0.10' '¥0.00' '¥0.01' '¥0.00' '¥0.06' '¥0.00' '¥0.01' '¥0.07' '¥0.28']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[sample_format_error, 'EU_Sales'] = df.loc[sample_format_error, 'EU_Sales'].apply(format_as_yen)


In [52]:
df.to_csv('dirty_dataset.csv', index=False, encoding='latin-1')