<a href="https://colab.research.google.com/github/BillyMuthiani/Machine_Learning-_and-_deep_learning/blob/main/Global_Crocodile_species_using_xgboost_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#This dataset provides detailed information on all recognized crocodile species across the globe. It includes taxonomic classification, geographic distribution, habitat details, population estimates, and conservation statuses. The dataset is designed to support tasks such as clustering, classification, ecological modeling, and conservation analysis. It can also be used for machine learning applications in species recognition, predictive habitat modeling, and biodiversity research.

In [2]:
import pandas as pd
import numpy as np


In [4]:
df=pd.read_csv('/content/crocodile_dataset.csv')
df.head(10)

Unnamed: 0,Observation ID,Common Name,Scientific Name,Family,Genus,Observed Length (m),Observed Weight (kg),Age Class,Sex,Date of Observation,Country/Region,Habitat Type,Conservation Status,Observer Name,Notes
0,1,Morelet's Crocodile,Crocodylus moreletii,Crocodylidae,Crocodylus,1.9,62.0,Adult,Male,31-03-2018,Belize,Swamps,Least Concern,Allison Hill,Cause bill scientist nation opportunity.
1,2,American Crocodile,Crocodylus acutus,Crocodylidae,Crocodylus,4.09,334.5,Adult,Male,28-01-2015,Venezuela,Mangroves,Vulnerable,Brandon Hall,Ago current practice nation determine operatio...
2,3,Orinoco Crocodile,Crocodylus intermedius,Crocodylidae,Crocodylus,1.08,118.2,Juvenile,Unknown,07-12-2010,Venezuela,Flooded Savannas,Critically Endangered,Melissa Peterson,Democratic shake bill here grow gas enough ana...
3,4,Morelet's Crocodile,Crocodylus moreletii,Crocodylidae,Crocodylus,2.42,90.4,Adult,Male,01-11-2019,Mexico,Rivers,Least Concern,Edward Fuller,Officer relate animal direction eye bag do.
4,5,Mugger Crocodile (Marsh Crocodile),Crocodylus palustris,Crocodylidae,Crocodylus,3.75,269.4,Adult,Unknown,15-07-2019,India,Rivers,Vulnerable,Donald Reid,Class great prove reduce raise author play mov...
5,6,Mugger Crocodile (Marsh Crocodile),Crocodylus palustris,Crocodylidae,Crocodylus,2.64,137.4,Adult,Male,08-06-2023,India,Reservoirs,Vulnerable,Randy Brown,Source husband at tree note responsibility def...
6,7,Siamese Crocodile,Crocodylus siamensis,Crocodylidae,Crocodylus,2.85,157.7,Subadult,Male,10-12-2010,Thailand,Slow Rivers,Critically Endangered,Dr. Marvin Thomas Jr.,Much section investment on gun young catch man...
7,8,Congo Dwarf Crocodile,Osteolaemus osborni,Crocodylidae,Osteolaemus,0.35,4.7,Juvenile,Unknown,03-08-2008,Central African Republic,Forest Swamps,Data Deficient,Terri Frazier,Race Mr environment political born itself law ...
8,9,West African Crocodile,Crocodylus suchus,Crocodylidae,Crocodylus,3.05,201.2,Adult,Male,16-04-2020,Sudan,Lakes,Least Concern,Deborah Mason,Medical blood personal success medical current...
9,10,Morelet's Crocodile,Crocodylus moreletii,Crocodylidae,Crocodylus,3.39,197.2,Adult,Male,21-05-2016,Mexico,Lagoons,Least Concern,Tamara George,Affect upon these story film around there wate...


In [5]:
df.describe(include='all')

Unnamed: 0,Observation ID,Common Name,Scientific Name,Family,Genus,Observed Length (m),Observed Weight (kg),Age Class,Sex,Date of Observation,Country/Region,Habitat Type,Conservation Status,Observer Name,Notes
count,1000.0,1000,1000,1000,1000,1000.0,1000.0,1000,1000,1000,1000,1000,1000,1000,1000
unique,,18,18,1,3,,,4,3,936,47,29,5,989,1000
top,,New Guinea Crocodile,Crocodylus novaeguineae,Crocodylidae,Crocodylus,,,Adult,Unknown,03-11-2010,Papua New Guinea,Rivers,Least Concern,Christina Davis,Ok community right then police day so store co...
freq,,68,68,1000,784,,,510,354,3,97,165,384,2,1
mean,500.5,,,,,2.41511,155.7719,,,,,,,,
std,288.819436,,,,,1.097542,175.186788,,,,,,,,
min,1.0,,,,,0.14,4.4,,,,,,,,
25%,250.75,,,,,1.6375,53.225,,,,,,,,
50%,500.5,,,,,2.43,100.6,,,,,,,,
75%,750.25,,,,,3.01,168.875,,,,,,,,


In [6]:
#checking missing values
df.isnull().sum()


Unnamed: 0,0
Observation ID,0
Common Name,0
Scientific Name,0
Family,0
Genus,0
Observed Length (m),0
Observed Weight (kg),0
Age Class,0
Sex,0
Date of Observation,0


In [15]:
#checking for duplicate rows

df.duplicated().sum()


np.int64(0)

In [12]:
#remove duplicate rows
df.drop_duplicates(inplace=True)

In [13]:
#remove missing values
df.dropna(inplace=True)

In [16]:
#handle outliers for numerical columns
numerical_columns = df.select_dtypes(include=[np.number]).columns

In [17]:
#encode categorical features
categorical_columns = df.select_dtypes(include=['object']).columns

In [19]:
#encode target variable
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
target_variable='Scientific Name'
df[target_variable] = label_encoder.fit_transform(df[target_variable])

In [20]:
#split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X = df.drop(target_variable, axis=1)
y = df[target_variable]

In [22]:
# Encode categorical features using one-hot encoding
X = pd.get_dummies(X, columns=[col for col in categorical_columns if col != target_variable])

In [23]:
#train  xgboost model
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [24]:
#evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00         8
           2       1.00      1.00      1.00         7
           3       1.00      1.00      1.00         9
           4       1.00      1.00      1.00        13
           5       1.00      1.00      1.00        17
           6       1.00      1.00      1.00         9
           7       1.00      1.00      1.00        17
           8       1.00      1.00      1.00         8
           9       1.00      1.00      1.00        10
          10       1.00      1.00      1.00         7
          11       1.00      1.00      1.00        17
          12       1.00      1.00      1.00         9
          13       1.00      1.00      1.00        10
          14       1.00      1.00      1.00         8
          15       1.00      1.00      1.00        11
          16       1.00      1.00      1.00

In [None]:
#plot visualizations
import matplotlib.pyplot as plt
import seaborn as sns

