In [4]:
import pandas as pd

# Load the merged dataset
data_path = "/Users/willwu/Documents/GitHub/Zooplankton/plankton_data/Merged_Master_All_Clean.csv"  # Update path if necessary
df = pd.read_csv(data_path)

# Display dataset info
print("Dataset Shape:", df.shape)
df.head()

Dataset Shape: (400955, 52)


Unnamed: 0,tifffile,csvfile,Year,SAM,Month,Day,repnum,Loc,SITE,DOY,...,Sigma.Intensity,Roughness,Transparency,Image.File,Particle.ID,Image.Height,Image.Width,Image.X,Image.Y,Filename
0,04072021_Huron_6_2mm_Rep4_AD_000023.tif,20210407_FISHI_006_2mm_Rep4_VC_data.csv,2021,6,4,7,4,FISHI,STA,97,...,18.4704,1.1331,0.7957,04072021_Huron_6_2mm_Rep4_AD_000001.tif,1.0,295.0,276.0,0.0,0.0,20210407_FISHI_006_2mm_Rep4_VC_data.csv
1,04072021_Huron_6_2mm_Rep4_AD_000023.tif,20210407_FISHI_006_2mm_Rep4_VC_data.csv,2021,6,4,7,4,FISHI,STA,97,...,16.2199,1.0927,0.7494,04072021_Huron_6_2mm_Rep4_AD_000001.tif,6.0,263.0,69.0,532.0,0.0,20210407_FISHI_006_2mm_Rep4_VC_data.csv
2,04072021_Huron_6_2mm_Rep4_AD_000023.tif,20210407_FISHI_006_2mm_Rep4_VC_data.csv,2021,6,4,7,4,FISHI,STA,97,...,12.1651,1.3904,0.7751,04072021_Huron_6_2mm_Rep4_AD_000001.tif,10.0,166.0,106.0,862.0,0.0,20210407_FISHI_006_2mm_Rep4_VC_data.csv
3,04072021_Huron_6_2mm_Rep4_AD_000023.tif,20210407_FISHI_006_2mm_Rep4_VC_data.csv,2021,6,4,7,4,FISHI,STA,97,...,26.3646,1.0824,0.7866,04072021_Huron_6_2mm_Rep4_AD_000001.tif,12.0,253.0,78.0,993.0,0.0,20210407_FISHI_006_2mm_Rep4_VC_data.csv
4,04072021_Huron_6_2mm_Rep4_AD_000023.tif,20210407_FISHI_006_2mm_Rep4_VC_data.csv,2021,6,4,7,4,FISHI,STA,97,...,22.0291,1.0913,0.7755,04072021_Huron_6_2mm_Rep4_AD_000001.tif,14.0,173.0,79.0,0.0,297.0,20210407_FISHI_006_2mm_Rep4_VC_data.csv


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assume df is your DataFrame and 'target' is your target column
# And your selected features are as defined
features = ["WaterT", "AvgDepth", "PRECIP", "gdd2", "DOY", "YPerchDen", 
            "BurbotDen", "OtherFishDen", "distshore", "Area..ABD.", "Aspect.Ratio", 
            "Circularity", "Perimeter", "Diameter..ABD.", "Diameter..ESD."]

# Extract features and target
X = df[features]
y = df['Class']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Apply PCA to reduce dimensionality to 10 components
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Create and train the Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_pca, y_train)

# Make predictions and evaluate the model
y_pred = rf.predict(X_test_pca)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8416904640171591
Classification Report:
               precision    recall  f1-score   support

    Bosmina_1       0.78      0.82      0.80      1390
      Bubbles       0.94      0.86      0.90       945
   Calanoid_1       0.80      0.85      0.83     16075
   Chironomid       0.78      0.56      0.65       303
   Chydoridae       0.00      0.00      0.00        18
   CopepodSpp       0.51      0.34      0.41      1725
   CountGT500       0.69      0.33      0.44      2575
      Cyclo_2       0.59      0.44      0.50      3248
  Cyclopoid_1       0.77      0.87      0.81     16973
      Daphnia       0.50      0.11      0.18        83
       Floc_1       0.93      0.94      0.93     19836
Herpacticoida       0.31      0.04      0.07       288
     LargeZ-1       0.95      0.98      0.96     15751
        other       0.48      0.22      0.30       981

     accuracy                           0.84     80191
    macro avg       0.64      0.52      0.56     80191
 weighted a

In [7]:
from imblearn.over_sampling import SMOTE
# 1. Standardize the features so that PCA and SMOTE work effectively.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 2. Split the dataset into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 3. Apply SMOTE on the training data to balance the minority classes.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# 4. Apply PCA to reduce the feature space to 10 principal components.
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_smote)
X_test_pca = pca.transform(X_test)  # Use the same PCA transformation for test data.

# 5. Train the Random Forest classifier on the PCA-transformed training data.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_pca, y_train_smote)

# 6. Make predictions and evaluate the model.
y_pred = rf.predict(X_test_pca)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.805339751343667
Classification Report:
               precision    recall  f1-score   support

    Bosmina_1       0.72      0.81      0.76      1390
      Bubbles       0.87      0.91      0.89       945
   Calanoid_1       0.84      0.78      0.81     16075
   Chironomid       0.47      0.75      0.58       303
   Chydoridae       0.07      0.28      0.11        18
   CopepodSpp       0.30      0.45      0.36      1725
   CountGT500       0.41      0.48      0.44      2575
      Cyclo_2       0.43      0.56      0.49      3248
  Cyclopoid_1       0.81      0.72      0.76     16973
      Daphnia       0.13      0.30      0.18        83
       Floc_1       0.95      0.90      0.93     19836
Herpacticoida       0.14      0.34      0.19       288
     LargeZ-1       0.96      0.97      0.97     15751
        other       0.26      0.42      0.32       981

     accuracy                           0.81     80191
    macro avg       0.53      0.62      0.56     80191
 weighted av

In [None]:
import pandas as pd

merged_df = pd.read_csv("/Users/willwu/Documents/GitHub/Zooplankton/plankton_data/Merged_Master_all.csv")

features = [
    "Loc",
    "SITE",
    "DOY",
    "gdd2",
    "WaterT",
    "LAT0",
    "LAT1",
    "LON0",
    "LON1",
    "avgdepth",
    "XANGLE",
    "PRECIP",
    "XWAVEHT",
    "wind_direction", "wind_speed",
    "CLOUD_PC",
    "AvgDepth",
    "Area..ABD.",
    "Aspect.Ratio",
    "Circularity",
    "Compactness",
    "Convexity",
    "Elongation",
    "Diameter..ABD.",
    "Diameter..ESD.",
    "Perimeter",
    "Intensity",
    "Sigma.Intensity",
    "Roughness",
    "Transparency"
]

# One-hot encode the 'Loc' and 'SITE' columns
merged_df = pd.get_dummies(merged_df, columns=['Loc', 'SITE'], prefix=['Loc', 'SITE'])

# Remove the original categorical columns from the features list
if "Loc" in features:
    features.remove("Loc")
if "SITE" in features:
    features.remove("SITE")

# Add the new one-hot encoded columns for both 'Loc' and 'SITE'
loc_dummy_cols = [col for col in merged_df.columns if col.startswith("Loc_")]
site_dummy_cols = [col for col in merged_df.columns if col.startswith("SITE_")]
features.extend(loc_dummy_cols)
features.extend(site_dummy_cols)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


# Extract features and target
X = merged_df[features]
y = merged_df['Class']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Apply PCA 
pca = PCA()
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Create and train the Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_pca, y_train)

# Make predictions and evaluate the model
y_pred = rf.predict(X_test_pca)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))