# 📥 Data Loading & Cleaning

## Step 1: Load GBIF occurrence data

In [2]:
import pandas as pd

# Read the tab-delimited GBIF occurrence export into a DataFrame.
# low_memory=False has pandas parse the file in one pass instead of in
# internal chunks, which avoids mixed-type inference within a column.
df = pd.read_csv(
    "occurrence.txt",
    sep="\t",
    low_memory=False,
)

# Display the leading rows as a quick sanity check of the load
df.head()



Unnamed: 0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,publishedByGbifRegion,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,iucnRedListCategory
0,3863936945,,,en,CC_BY_NC_4_0,,,,,,...,AFRICA,KEN,Kenya,KEN.19_1,Kwale,KEN.19.2_1,Lungalunga,KEN.19.2.3_1,Pongwe/Kikoneni,LC
1,3863936944,,,en,CC_BY_NC_4_0,,,,,,...,AFRICA,KEN,Kenya,KEN.19_1,Kwale,KEN.19.2_1,Lungalunga,KEN.19.2.3_1,Pongwe/Kikoneni,LC
2,3863936943,,,en,CC_BY_NC_4_0,,,,,,...,AFRICA,KEN,Kenya,KEN.19_1,Kwale,KEN.19.1_1,Kinango,KEN.19.1.2_1,Kinango,LC
3,3863936942,,,en,CC_BY_NC_4_0,,,,,,...,AFRICA,KEN,Kenya,KEN.19_1,Kwale,KEN.19.2_1,Lungalunga,KEN.19.2.3_1,Pongwe/Kikoneni,LC
4,3863936941,,,en,CC_BY_NC_4_0,,,,,,...,AFRICA,KEN,Kenya,KEN.19_1,Kwale,KEN.19.2_1,Lungalunga,KEN.19.2.3_1,Pongwe/Kikoneni,LC


## Step 2: Inspect and select relevant columns

In [3]:
# Columns retained for the HWC analysis: taxon, date, coordinates,
# administrative area, count, and record type/status.
columns_to_keep = [
    'scientificName',
    'eventDate',
    'decimalLatitude',
    'decimalLongitude',
    'county',
    'stateProvince',
    'individualCount',
    'basisOfRecord',
    'occurrenceStatus',
]

# Take an independent copy so later cleaning steps never touch the raw frame.
# A KeyError is raised here if any listed column is absent from `df`.
df_clean = df.loc[:, columns_to_keep].copy()

# Show the first rows of the reduced frame
df_clean.head()


Unnamed: 0,scientificName,eventDate,decimalLatitude,decimalLongitude,county,stateProvince,individualCount,basisOfRecord,occurrenceStatus
0,"Coleura afra (Peters, 1852)",2012-09-23,-4.61532,39.35317,Kwale,Kenyan Coast,1,PRESERVED_SPECIMEN,PRESENT
1,"Coleura afra (Peters, 1852)",2012-09-23,-4.61532,39.35317,Kwale,Kenyan Coast,1,PRESERVED_SPECIMEN,PRESENT
2,"Coleura afra (Peters, 1852)",2012-09-30,-4.08171,39.48295,Kwale,Kenyan Coast,1,PRESERVED_SPECIMEN,PRESENT
3,"Coleura afra (Peters, 1852)",2012-09-23,-4.61532,39.35317,Kwale,Kenyan Coast,1,PRESERVED_SPECIMEN,PRESENT
4,"Coleura afra (Peters, 1852)",2012-09-23,-4.61532,39.35317,Kwale,Kenyan Coast,1,PRESERVED_SPECIMEN,PRESENT


## Step 3: Handle missing values / duplicates

In [4]:
# Keep only records that can be placed on a map: rows missing either
# coordinate are dropped, then exact duplicate rows are removed. The result
# is rebound to `df_clean` instead of being mutated with inplace=True, so the
# cell reads as a single pipeline.
#
# NOTE(review): drop_duplicates() compares only the columns kept in Step 2.
# Rows 0, 1, 3 and 4 of the raw preview have distinct gbifID values but are
# identical in every retained column, so separate GBIF records like these are
# collapsed into one row here -- confirm that is intended for the HWC analysis.
df_clean = (
    df_clean
    .dropna(subset=['decimalLatitude', 'decimalLongitude'])
    .drop_duplicates()
)

# Optional: Filter by country if necessary.
# 'countryCode' is not one of the columns kept in df_clean, and `df` still
# contains the rows removed above, so the mask is built from the raw frame and
# aligned to the surviving index (assumes the raw file has a 'countryCode'
# column -- TODO confirm):
# df_clean = df_clean[df.loc[df_clean.index, 'countryCode'] == 'KE']


## Step 4: Save cleaned dataset

In [5]:
# Single source of truth for the output file name, so the path that is
# written and the path reported in the message below cannot drift apart.
OUTPUT_CSV = "cleaned_gbif_data.csv"

# Save to CSV for use in the next notebooks; index=False leaves the row
# index out of the file.
df_clean.to_csv(OUTPUT_CSV, index=False)

print(f"Cleaned dataset saved as '{OUTPUT_CSV}'")


Cleaned dataset saved as 'cleaned_gbif_data.csv'
