In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib




In [None]:
import os

# Report the directory the kernel is currently running in.
current_dir = os.getcwd()
print(current_dir)

In [None]:


# Load the raw export, decoding it as ISO-8859-1 (common for Excel exports).
raw_csv_path = r"C:\Users\SUBHO\Desktop\fish_data_cleaned.csv"
df = pd.read_csv(raw_csv_path, encoding="ISO-8859-1")

# Peek at the first record to confirm the file parsed.
print(df.head(1))


In [None]:
# List every column name of the loaded frame.
print(list(df.columns))

In [None]:
# Columns retained for the analysis, grouped by theme.
# The list order is also the column order of the saved CSV.
important_columns = [
    # record and event
    "occurrenceID", "eventDate", "individualCount", "sex", "lifeStage",
    # location
    "waterBody", "country", "stateProvince", "county", "locality",
    "decimalLatitude", "decimalLongitude",
    "minimumDepthInMeters", "maximumDepthInMeters",
    # taxonomy
    "scientificName", "kingdom", "phylum", "class", "order", "family",
    "genus", "specificEpithet",
    # identification metadata
    "identifiedBy", "dateIdentified", "basisOfRecord",
]

# Project the frame down to the selected columns (KeyError if any is absent).
df_filtered = df.loc[:, important_columns]

# Persist the reduced dataset without the index column.
filtered_csv_path = r"C:\Users\SUBHO\Desktop\fish_data_filtered.csv"
df_filtered.to_csv(filtered_csv_path, index=False)

print("✅ Filtered dataset saved successfully!")
print(df_filtered.head())

In [None]:
# Show the column names of df as a plain Python list.
column_names = df.columns.to_list()
print(column_names)

In [None]:
# Reload the filtered dataset from disk.
# This file is written by DataFrame.to_csv, which encodes as UTF-8 by default,
# so it must be decoded as UTF-8 here. Decoding it as ISO-8859-1 never raises
# but silently turns every non-ASCII character (accented names, localities, ...)
# into mojibake.
df = pd.read_csv(r"C:\Users\SUBHO\Desktop\fish_data_filtered.csv", encoding="utf-8", low_memory=False)


In [None]:
# Dimensions (rows, columns) of the reloaded frame.
frame_shape = df.shape
print("Shape:", frame_shape)

# Preview the leading five rows.
print(df.head())

# Data types of the first 20 columns.
column_types = df.dtypes
print(column_types.head(20))

In [None]:
# 1. Treat the literal string "Unknown" as missing data.
df.replace("Unknown", np.nan, inplace=True)

# 2. Negative individual counts are invalid -> mark them as missing.
negative_count = df["individualCount"].lt(0)
df.loc[negative_count, "individualCount"] = np.nan

# 3. Parse eventDate; values that cannot be parsed become NaT (errors="coerce").
parsed_dates = pd.to_datetime(df["eventDate"], errors="coerce")
df["eventDate"] = parsed_dates

# 4. Per-column count of missing values.
missing_per_column = df.isnull().sum()
print("Missing values per column:\n", missing_per_column)

# 5. Observed coordinate ranges (before any range filtering).
latitudes = df["decimalLatitude"]
longitudes = df["decimalLongitude"]
print("\nLatitude range:", latitudes.min(), "to", latitudes.max())
print("Longitude range:", longitudes.min(), "to", longitudes.max())

In [None]:
# Discard rows lacking the species name or either coordinate.
required_cols = ["scientificName", "decimalLatitude", "decimalLongitude"]
df = df.dropna(subset=required_cols)

In [None]:
# Keep only rows whose coordinates are physically valid:
# latitude in [-90, 90] and longitude in [-180, 180] (bounds inclusive).
valid_lat = df["decimalLatitude"].between(-90, 90)
valid_lon = df["decimalLongitude"].between(-180, 180)

# .copy() makes the filtered result an independent frame, so the column
# assignments performed on df in later cells do not raise SettingWithCopyWarning.
df = df[valid_lat & valid_lon].copy()

In [None]:
# Coordinate ranges after filtering.
for axis_label, axis_col in (("Latitude", "decimalLatitude"), ("Longitude", "decimalLongitude")):
    print(f"{axis_label} range:", df[axis_col].min(), "to", df[axis_col].max())

# First five rows of the key columns.
key_columns = ["decimalLatitude", "decimalLongitude", "scientificName"]
print(df[key_columns].head())

# Optional sanity check: count rows still outside the valid coordinate ranges.
lat_out_of_range = (df["decimalLatitude"] < -90) | (df["decimalLatitude"] > 90)
lon_out_of_range = (df["decimalLongitude"] < -180) | (df["decimalLongitude"] > 180)
invalid_coords = df[lat_out_of_range | lon_out_of_range]
print("Number of invalid coordinates remaining:", len(invalid_coords))


In [None]:
# Categorical columns whose gaps get an explicit "Unknown" label (optional step).
categorical_cols = ["sex", "lifeStage", "waterBody", "country", "stateProvince"]
filled_categoricals = df[categorical_cols].fillna("Unknown")
df[categorical_cols] = filled_categoricals

# Count columns are deliberately not filled here; their gaps may stay NaN.


In [None]:
# Show the first ten rows of the categorical columns and their missing-value counts.
categorical_view = df[categorical_cols]
print(categorical_view.head(10))
print("Missing values in categorical columns:\n", categorical_view.isnull().sum())



In [None]:
# Summary statistics of the individual counts.
individual_counts = df["individualCount"]
print(individual_counts.describe())

# Number of records with no count.
print("Missing values in individualCount:", individual_counts.isnull().sum())


In [None]:
# Replace missing counts with 0 by assigning the filled column back to the frame.
filled_counts = df["individualCount"].fillna(0)
df["individualCount"] = filled_counts

# Confirm nothing is left unfilled.
remaining_missing = df["individualCount"].isnull().sum()
print("Missing values after filling:", remaining_missing)


In [None]:
# Counts above 100 are treated as extreme outliers and set to NaN
# (the rows themselves are kept).
is_outlier = df["individualCount"].gt(100)
df.loc[is_outlier, "individualCount"] = np.nan


In [None]:
# Negative depths are invalid; mark them as missing in both depth columns.
for depth_col in ("minimumDepthInMeters", "maximumDepthInMeters"):
    df.loc[df[depth_col] < 0, depth_col] = np.nan


In [None]:
# Count rows that still carry a negative depth value.
neg_min = df.loc[df["minimumDepthInMeters"].lt(0)]
neg_max = df.loc[df["maximumDepthInMeters"].lt(0)]

print("Negative values in minimumDepthInMeters:", len(neg_min))
print("Negative values in maximumDepthInMeters:", len(neg_max))


In [None]:
# Summary statistics for the two depth columns.
depth_cols = ["minimumDepthInMeters", "maximumDepthInMeters"]
print(df[depth_cols].describe())


In [None]:
# Rows whose recorded minimum depth exceeds the recorded maximum depth.
inverted = df["minimumDepthInMeters"] > df["maximumDepthInMeters"]
invalid_depth = df[inverted]
print("Rows where minDepth > maxDepth:", len(invalid_depth))

# Fix by exchanging the two values (optional).
# .values yields a plain array, so the assignment is positional rather than
# aligned on column labels -- that is what makes the swap take effect.
swapped_values = df.loc[inverted, ["maximumDepthInMeters", "minimumDepthInMeters"]].values
df.loc[inverted, ["minimumDepthInMeters", "maximumDepthInMeters"]] = swapped_values


In [None]:
# Exchange min and max depth wherever min > max.
min_col, max_col = "minimumDepthInMeters", "maximumDepthInMeters"
swap_idx = df[min_col] > df[max_col]
# .values drops the labels so the reversed columns are written positionally.
reversed_pair = df.loc[swap_idx, [max_col, min_col]].values
df.loc[swap_idx, [min_col, max_col]] = reversed_pair

# Re-check: no row should have min > max any more.
invalid_depth = df[df[min_col] > df[max_col]]
print("Rows where minDepth > maxDepth after swap:", len(invalid_depth))


In [None]:
# Columns whose completeness is reported below.
important_cols = [
    "occurrenceID",
    "eventDate",
    "individualCount",
    "sex",
    "lifeStage",
    "waterBody",
    "country",
    "stateProvince",
    "decimalLatitude",
    "decimalLongitude",
    "minimumDepthInMeters",
    "maximumDepthInMeters",
    "scientificName",
]

# Missing-value count for each of those columns.
missing_important = df[important_cols].isna().sum()
print("Missing values in important columns:\n", missing_important)


In [None]:
# Dimensions (rows, columns) of the frame at this point.
cleaned_shape = df.shape
print("Data shape after cleaning:", cleaned_shape)


In [None]:
# Subset with a scientific name present, for species-level analysis.
has_species = df["scientificName"].notna()
df_species = df[has_species]
print("Shape for species-level analysis:", df_species.shape)

# Subset with a parsed event date present, for date-based analysis.
has_date = df["eventDate"].notna()
df_dates = df[has_date]
print("Shape for date-based analysis:", df_dates.shape)



In [None]:
# Write the cleaned dataset to disk, omitting the index column.
cleaned_csv_path = r"C:\Users\SUBHO\Desktop\fish_data_cleaned_final.csv"
df.to_csv(cleaned_csv_path, index=False)


In [None]:
# Plot at most 5000 randomly chosen records to keep rendering fast.
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so n is capped at the frame size.
# A fixed random_state makes the figure reproducible across runs.
n_points = min(5000, len(df_species))
df_plot = df_species.sample(n=n_points, random_state=42)

plt.figure(figsize=(10,6))
sns.scatterplot(
    x="decimalLongitude",
    y="decimalLatitude",
    hue="scientificName",  # one colour per species
    data=df_plot,
    legend=False,          # legend suppressed; one entry per species would be drawn
    alpha=0.6
)
# The title reports the number of points actually drawn.
plt.title(f"Species Distribution Map (Sampled {n_points} points)")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

In [None]:
# Total individuals per species, keeping the 20 largest totals.
species_totals = df_species.groupby("scientificName")["individualCount"].sum()
top_species = species_totals.sort_values(ascending=False).head(20)

# Horizontal bar chart: species on the y axis, summed counts on the x axis.
plt.figure(figsize=(12,6))
ax = sns.barplot(x=top_species.values, y=top_species.index, palette="viridis")
ax.set_title("Top 20 Species by Individual Count")
ax.set_xlabel("Total Individuals")
ax.set_ylabel("Species")
plt.show()


In [None]:
# Overlaid histograms (with KDE curves) of the minimum and maximum depth columns.
plt.figure(figsize=(12,5))
depth_series = (
    ("minimumDepthInMeters", "skyblue", "Min Depth"),
    ("maximumDepthInMeters", "salmon", "Max Depth"),
)
for depth_column, depth_color, depth_label in depth_series:
    # Missing depths are dropped before plotting.
    sns.histplot(df_species[depth_column].dropna(), bins=50, kde=True,
                 color=depth_color, label=depth_label)
plt.title("Depth Distribution of Species")
plt.xlabel("Depth (meters)")
plt.ylabel("Frequency")
plt.legend()
plt.show()


In [None]:
# Occurrence counts for the 20 most frequently recorded countries.
occurrences_by_country = df_species["country"].value_counts()
country_counts = occurrences_by_country.head(20)

# Horizontal bar chart: countries on the y axis, record counts on the x axis.
plt.figure(figsize=(12,6))
ax = sns.barplot(x=country_counts.values, y=country_counts.index, palette="coolwarm")
ax.set_title("Top 20 Countries by Number of Species Occurrences")
ax.set_xlabel("Number of Occurrences")
ax.set_ylabel("Country")
plt.show()


In [None]:
# Derive the observation year from eventDate.
# df_dates is a boolean-filtered subset of another frame, so assigning a column
# on it in place raises pandas' SettingWithCopyWarning. .assign returns a new
# frame carrying the extra column instead, and is safe to re-run.
df_dates = df_dates.assign(year=df_dates["eventDate"].dt.year)

# Number of records per year.
yearly_counts = df_dates.groupby("year").size()

plt.figure(figsize=(12,5))
sns.lineplot(x=yearly_counts.index, y=yearly_counts.values)
plt.title("Number of Observations per Year")
plt.xlabel("Year")
plt.ylabel("Number of Records")
plt.show()


In [None]:
# Number of distinct species recorded in each country, largest first.
species_per_country = (
    df_species
    .groupby("country")["scientificName"]
    .nunique()
    .sort_values(ascending=False)
)
print(species_per_country.head(20))


In [None]:
# Scatter of minimum depth against individual count on a random subset.
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so n is capped at the frame size.
# A fixed random_state makes the figure reproducible across runs.
depth_sample = df_species.sample(n=min(5000, len(df_species)), random_state=42)

sns.scatterplot(
    x="minimumDepthInMeters",
    y="individualCount",
    hue="scientificName",  # one colour per species
    data=depth_sample,
    legend=False           # legend suppressed; one entry per species would be drawn
)
plt.title("Depth vs Species Abundance")
plt.xlabel("Minimum Depth (m)")
plt.ylabel("Individual Count")
plt.show()


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib




In [None]:
# Keep only the records of species that occur more than 5 times.
species_counts = df['scientificName'].value_counts()
frequent_species = species_counts[species_counts > 5].index
is_frequent = df['scientificName'].isin(frequent_species)
df_filtered = df[is_frequent]
# NOTE(review): df_filtered is not referenced by the later cells visible in this
# notebook -- confirm whether this filter is still needed.

In [None]:
# -----------------------------
# 1. Import libraries
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
import joblib

# -----------------------------
# 2. Load cleaned data
# -----------------------------
df = pd.read_csv(r"C:\Users\SUBHO\Desktop\fish_data_cleaned_final.csv", low_memory=False)

# -----------------------------
# 3. Feature selection
# -----------------------------
features = [
    "decimalLatitude",
    "decimalLongitude",
    "minimumDepthInMeters",
    "maximumDepthInMeters",
    "waterBody",
    "country",
    "sex",
    "lifeStage"
]
target = "scientificName"

# -----------------------------
# 4. Remove species with only 1 occurrence
# -----------------------------
# The stratified split below needs at least 2 samples of every class.
species_counts = df[target].value_counts()
# .copy() makes the filtered frame independent, so the column assignments in
# step 5 do not raise SettingWithCopyWarning.
df = df[df[target].isin(species_counts[species_counts >= 2].index)].copy()

# -----------------------------
# 5. Encode categorical features
# -----------------------------
categorical_cols = ["waterBody", "country", "sex", "lifeStage"]
# Keep every fitted encoder: the model is trained on the integer codes, so the
# same string -> code mapping is required to encode new inputs at prediction time.
feature_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) turns missing values into the string "nan", which gets its own code.
    df[col] = le.fit_transform(df[col].astype(str))
    feature_encoders[col] = le

# Encode target after filtering, so class codes are contiguous (0..n_classes-1)
le_target = LabelEncoder()
df[target] = le_target.fit_transform(df[target].astype(str))

# -----------------------------
# 6. Split data
# -----------------------------
X = df[features]
y = df[target]

# Stratify on the target so each species keeps its proportion in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 7. Train XGBoost classifier
# -----------------------------
model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    eval_metric="mlogloss"  # removed deprecated use_label_encoder
)
model.fit(X_train, y_train)

# -----------------------------
# 8. Save model and label encoders
# -----------------------------
joblib.dump(model, "xgb_species_model.pkl")
joblib.dump(le_target, "label_encoder.pkl")
# Feature encoders are saved too; without them the saved model cannot be applied
# to raw (string-valued) categorical inputs.
joblib.dump(feature_encoders, "feature_encoders.pkl")

# -----------------------------
# 9. Optional: Check test accuracy
# -----------------------------
accuracy = model.score(X_test, y_test)
print("XGBoost test accuracy:", accuracy)