ACTIVITIES DATA TREATMENT

In [18]:
import pandas as pd

# Load the activities table; the checks below use its `source` and `sourceID` columns.
data = pd.read_csv('Tables/Activities.csv')

# Count the distinct IDs attached to each source name; a count above 1 means
# the same name is mapped to several IDs.
source_to_sourceID = data.groupby('source')['sourceID'].nunique()
inconsistent_sources = source_to_sourceID[source_to_sourceID > 1]

# Same check in the other direction: an ID that carries several source names.
sourceID_to_source = data.groupby('sourceID')['source'].nunique()
inconsistent_sourceIDs = sourceID_to_source[sourceID_to_source > 1]

# Collect source values that are not pure-ASCII text. A missing value typically
# comes back from .unique() as a float NaN, which has no .isascii(); the
# isinstance guard reports such values here instead of crashing with
# AttributeError.
unique_sources = data['source'].unique()
encoding_issues = [
    s for s in unique_sources
    if not isinstance(s, str) or not s.isascii()
]


print("Inconsistent sources:")
print(inconsistent_sources)

print("\nInconsistent IDs:")
print(inconsistent_sourceIDs)

print("\nInvalid characters:")
print(encoding_issues)

Inconsistent sources:
Series([], Name: sourceID, dtype: int64)

Inconsistent IDs:
Series([], Name: source, dtype: int64)

Invalid characters:
[]


BILLIONAIRES DATA TREATMENT

In [24]:
import pandas as pd
from unidecode import unidecode

# Load the billionaires table and list the text columns to audit.
data = pd.read_csv('Tables/Billionaires.csv')
text_columns = ["last_name","first_name","industry"]

# For every audited column, keep the distinct values that are either not
# strings at all or contain at least one non-ASCII character.
invalid_characters = {
    column: [
        entry for entry in data[column].unique()
        if not (isinstance(entry, str) and entry.isascii())
    ]
    for column in text_columns
}

# Report the findings, one line per column.
print("\nCharacter inconsistencies")
for column, flagged in invalid_characters.items():
    message = f"{column}: {flagged}" if flagged else f"{column}: No encoding errors found"
    print(message)

def replace_invalid_chars(value):
    """Return `value` transliterated by `unidecode` when it is a string.

    Any non-string input (numbers, None, NaN, ...) is handed back unchanged,
    so the function can be applied to columns that contain missing values.
    """
    return unidecode(value) if isinstance(value, str) else value

# Run replace_invalid_chars over every audited text column, overwriting it in place.
for column in text_columns:
    cleaned_column = data[column].apply(replace_invalid_chars)
    data[column] = cleaned_column

# Second audit: repeat the invalid-character check on the cleaned columns.
# A value is flagged when it is not a string or is not pure ASCII.
invalid_characters = {
    column: [
        entry for entry in data[column].unique()
        if not (isinstance(entry, str) and entry.isascii())
    ]
    for column in text_columns
}

# Report the findings of the second audit, one line per column.
print("\nCharacter inconsistencies")
for column, flagged in invalid_characters.items():
    message = f"{column}: {flagged}" if flagged else f"{column}: No encoding errors found"
    print(message)
# End of second audit.

# Rewrite 'birth_date' as YYYY-MM-DD text: parse the MM/DD/YYYY strings into
# timestamps first, then format those timestamps back into strings.
birth_dates = pd.to_datetime(data['birth_date'], format='%m/%d/%Y')
data['birth_date'] = birth_dates.dt.strftime('%Y-%m-%d')

# Preview the converted column.
data[['birth_date']].head()

# Persisting step, left disabled.
# NOTE(review): enabling it overwrites the file this cell reads from; a later
# re-run would then presumably fail to parse the already-converted dates with
# the MM/DD/YYYY format. Confirm before enabling, or write to a new path.
#data.to_csv('Tables/Billionaires.csv', index=False)

#print("updated csv file")


Character inconsistencies
last_name: ['Saadé', 'Saadé Zeenny', 'Bolloré', 'Seràgnoli', 'Piñera', 'Péladeau', 'Lê-Quôc', 'Österberg', 'Lindén Urnes', 'Käärmann', 'Laliberté', 'Lao Hernández', 'Hagströmer', 'Montipò']
first_name: ['François', 'Germán', 'Marie-Hélène', 'Stéphane', 'Réal']
industry: No encoding errors found

Character inconsistencies
last_name: No encoding errors found
first_name: No encoding errors found
industry: No encoding errors found


Unnamed: 0,birth_date
0,1949-03-05
1,1971-06-28
2,1964-01-12
3,1944-08-17
4,1930-08-30


CITIES DATA TREATMENT

In [26]:
import pandas as pd
from unidecode import unidecode

data = pd.read_csv('Tables/Cities.csv')

# Distinct city names that are not strings, or that differ from their
# unidecode transliteration.
invalid_characters_in_name = [
    city for city in data['name'].unique()
    if not (isinstance(city, str) and city == unidecode(city))
]

print("invalid characters found in name")
print(invalid_characters_in_name)

# Distinct-value counts in both directions: names per cityID and cityIDs per name.
cityID_to_name = data.groupby('cityID')['name'].nunique()
name_to_cityID = data.groupby('name')['cityID'].nunique()

# Keep only the entries whose count exceeds 1, i.e. the non-unique mappings.
cityID_issues = cityID_to_name.loc[cityID_to_name.gt(1)]
name_issues = name_to_cityID.loc[name_to_cityID.gt(1)]

# Print each heading followed by the offending entries.
for heading, offenders in (
    ("\nCityIDs with more than 1 name", cityID_issues),
    ("\nNames with more than 1 CityID", name_issues),
):
    print(heading)
    print(offenders)


invalid characters found in name
[]

CityIDs with more than 1 name
Series([], Name: name, dtype: int64)

Names with more than 1 CityID
Series([], Name: cityID, dtype: int64)


COUNTRIES DATA TREATMENT

In [29]:
import pandas as pd
from unidecode import unidecode

data = pd.read_csv('Tables/Countries.csv')

text_columns = ['name', 'continent']

# Per audited column, the distinct values that are not strings or that differ
# from their unidecode transliteration.
invalid_characters = {
    column: [
        entry for entry in data[column].unique()
        if not (isinstance(entry, str) and entry == unidecode(entry))
    ]
    for column in text_columns
}

print("invalid characters")
for column, flagged in invalid_characters.items():
    print(f"{column}: {flagged}")

# Country names that repeat: duplicated() marks every occurrence after the first.
repeated_mask = data['name'].duplicated()
duplicate_names = data.loc[repeated_mask, 'name']
print("\nduplicate names found")
print(duplicate_names.tolist())


invalid characters
name: []
continent: []

duplicate names found
[]


SOURCES OF WEALTH DATA TREATMENT

In [36]:
import pandas as pd
from unidecode import unidecode

data = pd.read_csv('Tables/SourcesOfWealth.csv')

source_values = data['source']

# Distinct source values that are not strings or that differ from their
# unidecode transliteration, stored under the column name.
invalid_characters = {
    'source': [
        entry for entry in source_values.unique()
        if not (isinstance(entry, str) and entry == unidecode(entry))
    ],
}

print("Invalid characters:")
for column, flagged in invalid_characters.items():
    print(f"{column}: {flagged}")

# Sources that repeat: duplicated() marks every occurrence after the first.
duplicate_sources = source_values[source_values.duplicated()]
print("\nDuplicate sources found:")
print(duplicate_sources.tolist())

Invalid characters:
source: []

Duplicate sources found:
[]
