In [None]:
!pip install word2number
from word2number import w2n
import pandas as pd
import numpy as np
import re



In [None]:
# Read the raw, uncleaned animal dataset from the working directory and
# preview its first rows.
RAW_DATA_PATH = "Messy Animal Data - Animal Dataset (1).csv"
df = pd.read_csv(RAW_DATA_PATH)
df.head()

Unnamed: 0,ID,Animal,Height (cm),Weight (kg),Color,Lifespan (years),Diet,Habitat,Predators,Average Speed (km/h),Countries Found,Conservation Status,Family,Gestation Period (days),Top Speed (km/h),Social Structure,Offspring per Birth
0,1,Aardvark,105-130,40-65,Grey,20-30,,"Savannas, Grasslands","Lions, Hyenas",40,Africa,Least Concern,Orycteropodidae,210-240,40,Solitary,1
1,2,Aardwolf,40-50,8-14,Yellow-brown,10-12,,"Grasslands, Savannas","Lions, Leopards",24-30,Eastern and Southern Africa,Least Concern,Hyaenidae,90,40,Solitary,2-5
2,3,African Elephant,270-310,2700-6000,Grey,60-70,Herbivore,"Savannah, Forest","Lions, Hyenas",25,Africa,Vulnerable,Elephantidae,640-660,40,Herd-based,1
3,4,African Lion,80-110,120-250,Tan,10-14,Carnivore,"Grasslands, Savannas","Hyenas, Crocodiles",58,Africa,Vulnerable,Felidae,98-105,80,Group-based,2-4 (usually)
4,5,African Wild Dog,75-80,18-36,Multicolored,10-12,Carnivore,Savannahs,"Lions, Hyenas",fifty-six,Sub-Saharan Africa,Endangered,Canidae,70,56,Group-based,10-12


In [None]:
def has_numeric_part(input):
    """Report whether ``input`` starts or ends with a digit.

    A value is considered to have a numeric part when its first or its last
    character can be converted to a number. If neither end is a digit, nothing
    adjacent to a range hyphen can be a digit either, so the text is treated as
    having no numeric part.

    Args:
        input: The text to inspect. The name shadows the builtin ``input``; it
            is kept so existing keyword callers continue to work.

    Returns:
        bool: True if the first or last element converts with ``float``.
        False otherwise, including for empty strings and for values that
        cannot be indexed at all (for example NaN or None).
    """
    # Look at both ends of the value: index 0 and index -1.
    for position in (0, -1):
        try:
            float(input[position])
        except (ValueError, TypeError, IndexError):
            # ValueError: the character is not a digit.
            # TypeError:  the value is not subscriptable (NaN, None, ...).
            # IndexError: the value is empty, e.g. one side of "-5".
            continue
        return True

    return False

def get_numeric_part(input):
    """Extract the number at the start of ``input`` or, failing that, at its end.

    Examples: ``"40"`` -> 40.0, ``"24 km/h"`` -> 24.0, ``"2.5"`` -> 2.5,
    ``"Up to 70"`` -> 70.0.

    Args:
        input: The text to parse. The name shadows the builtin ``input``; it is
            kept so existing keyword callers continue to work.

    Returns:
        float: The leading number if the text starts with one, otherwise the
        trailing number.

    Raises:
        ValueError: If the text neither starts nor ends with a number.
        TypeError: If ``input`` is not a string.
    """
    # A number at the very start wins: digits with an optional fraction, or a
    # bare fraction such as ".5".
    leading = re.match(r"\d+(?:\.\d+)?|\.\d+", input)
    if leading is not None:
        return float(leading.group())

    # Otherwise take the whole number that ends the text. Matching it as a
    # single unit (rather than converting digit by digit) keeps multi-digit
    # and decimal values such as "Up to 70" or "Up to 2.5" intact.
    trailing = re.search(r"\d+(?:\.\d+)?\Z", input)
    if trailing is not None:
        return float(trailing.group())

    raise ValueError(f"could not find a numeric part in {input!r}")


def clean_speed_column(speed):
    """Convert one raw speed cell into a single float.

    Handled shapes:
      * a plain number ("40")              -> 40.0
      * a hyphenated range ("24-30")       -> the mean of both ends, 27.0
      * a spelled-out number ("fifty-six") -> 56.0, via ``w2n.word_to_num``
      * a missing cell (NaN / None / NA)   -> ``pd.NA``
      * anything else ("Not Applicable")   -> ``pd.NA``, and the value is printed

    Args:
        speed: The raw cell value; normally a string, or a missing value.

    Returns:
        float or pd.NA: Always a float when a number could be recovered, so the
        cleaned column does not mix ints and floats.
    """
    if not isinstance(speed, str):
        # Blank CSV cells arrive as NaN (a float), which has no ``.split``.
        # Values that are already numeric are passed through as floats, which
        # also makes re-running the cleaning step on cleaned data harmless.
        return pd.NA if pd.isna(speed) else float(speed)

    parts = [part.strip() for part in speed.split("-")]

    # Only a single value or a "low-high" pair is read as numeric. An empty
    # piece (from "-5" or "5-") disqualifies the numeric reading.
    looks_numeric = len(parts) <= 2 and all(
        part and has_numeric_part(part) for part in parts
    )
    if looks_numeric:
        try:
            return sum(get_numeric_part(part) for part in parts) / len(parts)
        except ValueError:
            # The digits could not be extracted after all; fall through and
            # let the word parser have a try before giving up.
            pass

    try:
        # word_to_num yields an int for whole numbers; normalise to float.
        return float(w2n.word_to_num(speed))
    except (ValueError, TypeError):
        print(f"Something went wrong with the following query: {speed}")
        return pd.NA

# Overwrite the raw speed text with the value produced by clean_speed_column.
speed_column = "Average Speed (km/h)"
df[speed_column] = df[speed_column].apply(clean_speed_column)

Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Varies
Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Not Applicable
Something went wrong with the following query: Varies
Something went wrong with the following query: Not Applicable
Something went wrong wit

In [None]:
def clean_diet_column(diet):
    """Collapse a free-text diet description into one of three categories.

    Matching is case-insensitive. A description that names exactly one
    category is assigned to it; a description that names several categories
    (for example both a plant-eating and an animal-eating keyword) or none at
    all is classified as "Omnivore".

    Args:
        diet: The raw diet text, or a missing value.

    Returns:
        str or pd.NA: "Omnivore", "Carnivore" or "Herbivore"; ``pd.NA`` when
        the input is missing.

    Raises:
        TypeError: If ``diet`` is neither a string nor a missing value.
    """
    diet_keywords = {
        # Most scavengers are carnivores, but not ALL scavengers are
        # necessarily carnivores, so they are grouped with omnivores.
        "Omnivore": ["Omnivore", "Scavenger"],
        "Carnivore": ["Carnivore", "Insectivore", "Piscivore"],
        "Herbivore": ["Herbivore"],
    }

    if pd.isna(diet):
        return pd.NA
    if not isinstance(diet, str):
        raise TypeError(
            f"diet must be a string or a missing value, got {type(diet).__name__}"
        )

    # Compare case-insensitively so "herbivore" is not silently defaulted.
    normalized = diet.casefold()
    matched = [
        diet_type
        for diet_type, keywords in diet_keywords.items()
        if any(keyword.casefold() in normalized for keyword in keywords)
    ]

    # Exactly one category named: use it.
    if len(matched) == 1:
        return matched[0]

    # Several categories named: either "Omnivore" is among them, or the text
    # names both animal and plant eating, which is an omnivore.
    # No category named: all such edge cases in this dataset happen to be
    # omnivores. Cross-referencing the text against lists of "plant" and
    # "animal" foods would be a more principled fallback.
    return "Omnivore"

# Overwrite the raw diet text with the category produced by clean_diet_column.
diet_column = "Diet"
df[diet_column] = df[diet_column].apply(clean_diet_column)

In [None]:
# Display the leading rows of the frame so the cleaned speed and diet columns
# can be spot-checked by eye.
PREVIEW_ROWS = 100
df.head(PREVIEW_ROWS)

Unnamed: 0,ID,Animal,Height (cm),Weight (kg),Color,Lifespan (years),Diet,Habitat,Predators,Average Speed (km/h),Countries Found,Conservation Status,Family,Gestation Period (days),Top Speed (km/h),Social Structure,Offspring per Birth
0,1,Aardvark,105-130,40-65,Grey,20-30,,"Savannas, Grasslands","Lions, Hyenas",40.0,Africa,Least Concern,Orycteropodidae,210-240,40,Solitary,1
1,2,Aardwolf,40-50,8-14,Yellow-brown,10-12,,"Grasslands, Savannas","Lions, Leopards",27.0,Eastern and Southern Africa,Least Concern,Hyaenidae,90,40,Solitary,2-5
2,3,African Elephant,270-310,2700-6000,Grey,60-70,Herbivore,"Savannah, Forest","Lions, Hyenas",25.0,Africa,Vulnerable,Elephantidae,640-660,40,Herd-based,1
3,4,African Lion,80-110,120-250,Tan,10-14,Carnivore,"Grasslands, Savannas","Hyenas, Crocodiles",58.0,Africa,Vulnerable,Felidae,98-105,80,Group-based,2-4 (usually)
4,5,African Wild Dog,75-80,18-36,Multicolored,10-12,Carnivore,Savannahs,"Lions, Hyenas",56,Sub-Saharan Africa,Endangered,Canidae,70,56,Group-based,10-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,Iberian Lynx,60-70,9-14,Yellowish-brown,10-13,Carnivore,Mediterranean Scrublands,"Humans, Wildcats",75.0,"Spain, Portugal",Endangered,Felidae,63-67,70-80,Solitary,1-4
96,97,Indri,Up to 70,Up to 6,"Black, White",15-18,Herbivore,Rainforests,"Fossa, Birds of Prey",25.5,Madagascar,Critically Endangered,Indridae,120-140,24-27,Solitary,1
97,98,Japanese Giant Hornet,35-55,Up to 6,"Yellow, Black",3-6,Carnivore,"Forests, Mountains","Birds, Bees",49.0,"Japan, China, Southeast Asia",Not Evaluated,Vespidae,1 week,Not Applicable,Colony-based,Thousands
98,99,Japanese Macaque,50-60,10-14,Brown,20-30,Omnivore,"Forests, Mountains","Humans, Leopards",30,"Japan, China, North Korea",Least Concern,Cercopithecidae,173-194,30,Social groups,1


In [None]:
# Keep only the rows in which every required field is present, then write the
# result to disk.
required_columns = ["Animal", "Diet", "Average Speed (km/h)"]
cleaned_df = df.dropna(subset=required_columns)
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default — confirm that is wanted in the exported file.
cleaned_df.to_csv("Freddy Yu - Cleaned Animal Data.csv")