In [1]:
# Step 1: Import libraries
import pandas as pd

# Step 2: Load dataset
df = pd.read_csv("data.csv")

# Step 3: Display basic info
print("🧾 Dataset Info:\n")
print(df.info())

# Step 4: Display summary statistics
print("\n📊 Summary Statistics:\n")
print(df.describe(include='all').transpose())

# Step 5: Display first few rows
print("\n🔹 Sample Rows:\n")
print(df.head())

# Step 6: Check for missing values
print("\n❗ Missing Values:\n")
print(df.isnull().sum())

# Step 7: If you already have the 'Percent_Bleaching' column,
# classify severity and check distribution
if 'Percent_Bleaching' in df.columns:
    def classify_bleaching(p):
        if p >= 60:
            return "Severe"
        elif p >= 15:
            return "Moderate"
        else:
            return "Mild"
    
    df["Bleaching_Level"] = df["Percent_Bleaching"].apply(classify_bleaching)
    
    print("\n🌊 Bleaching Level Distribution:\n")
    print(df["Bleaching_Level"].value_counts())

# Step 8: Optional — show correlations
print("\n📈 Correlation Matrix (numerical columns only):\n")
print(df.corr(numeric_only=True))


🧾 Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1252 entries, 0 to 1251
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Site_ID              1252 non-null   int64  
 1   Latitude_Degrees     1252 non-null   float64
 2   Longitude_Degrees    1252 non-null   float64
 3   Ocean_Name           1252 non-null   object 
 4   Ecoregion_Name       1252 non-null   object 
 5   Exposure             1252 non-null   object 
 6   Turbidity            1252 non-null   float64
 7   Cyclone_Frequency    1252 non-null   float64
 8   Date_Month           1252 non-null   int64  
 9   Date_Year            1252 non-null   int64  
 10  Percent_Bleaching    1252 non-null   float64
 11  Temperature_Mean     1252 non-null   object 
 12  Windspeed            1252 non-null   object 
 13  SSTA                 1252 non-null   object 
 14  SSTA_DHW             1252 non-null   object 
 15  TSA                  

In [2]:
# ✅ STEP 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# ✅ STEP 2: Load dataset
df = pd.read_csv("data.csv")

# ✅ STEP 3: Convert numeric-like columns to floats
numeric_cols = ['Temperature_Mean', 'Windspeed', 'SSTA', 'SSTA_DHW', 'TSA', 'TSA_DHWMean']

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # convert invalid entries to NaN

# ✅ STEP 4: Drop rows with missing values in those important columns
df.dropna(subset=numeric_cols, inplace=True)

# ✅ STEP 5: Encode categorical columns
# Identify which columns are categorical
cat_cols = ['Ocean', 'Location_Continent', 'Exposure']

# Use one-hot encoding for categorical variables
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# ✅ STEP 6: Create target variable
# If you already have 'Bleaching_Level' column, skip this step
if 'Percent_Bleaching' in df.columns and 'Bleaching_Level' not in df.columns:
    def classify_bleaching(p):
        if p >= 60:
            return "Severe"
        elif p >= 15:
            return "Moderate"
        else:
            return "Mild"
    df['Bleaching_Level'] = df['Percent_Bleaching'].apply(classify_bleaching)

# ✅ STEP 7: Encode target variable numerically
label_encoder = LabelEncoder()
df['Bleaching_Level_Encoded'] = label_encoder.fit_transform(df['Bleaching_Level'])

# ✅ STEP 8: Define features (X) and target (y)
X = df.drop(columns=['Percent_Bleaching', 'Bleaching_Level', 'Bleaching_Level_Encoded'])
y = df['Bleaching_Level_Encoded']

# ✅ STEP 9: Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ✅ STEP 10: Show preprocessing summary
print("✅ Preprocessing complete!\n")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print("\nTarget label mapping:")
for i, cls in enumerate(label_encoder.classes_):
    print(f"{i} -> {cls}")

print("\nSample features:\n", X_train.head())


ModuleNotFoundError: No module named 'sklearn'

In [3]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.7 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.7 MB 399.6 kB/s eta 0:00:21
   -- ---------


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# ✅ STEP 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# ✅ STEP 2: Load dataset
df = pd.read_csv("data.csv")

# ✅ STEP 3: Convert numeric-like columns to floats
numeric_cols = ['Temperature_Mean', 'Windspeed', 'SSTA', 'SSTA_DHW', 'TSA', 'TSA_DHWMean']

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # convert invalid entries to NaN

# ✅ STEP 4: Drop rows with missing values in those important columns
df.dropna(subset=numeric_cols, inplace=True)

# ✅ STEP 5: Encode categorical columns
# Identify which columns are categorical
cat_cols = ['Ocean', 'Location_Continent', 'Exposure']

# Use one-hot encoding for categorical variables
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# ✅ STEP 6: Create target variable
# If you already have 'Bleaching_Level' column, skip this step
if 'Percent_Bleaching' in df.columns and 'Bleaching_Level' not in df.columns:
    def classify_bleaching(p):
        if p >= 60:
            return "Severe"
        elif p >= 15:
            return "Moderate"
        else:
            return "Mild"
    df['Bleaching_Level'] = df['Percent_Bleaching'].apply(classify_bleaching)

# ✅ STEP 7: Encode target variable numerically
label_encoder = LabelEncoder()
df['Bleaching_Level_Encoded'] = label_encoder.fit_transform(df['Bleaching_Level'])

# ✅ STEP 8: Define features (X) and target (y)
X = df.drop(columns=['Percent_Bleaching', 'Bleaching_Level', 'Bleaching_Level_Encoded'])
y = df['Bleaching_Level_Encoded']

# ✅ STEP 9: Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ✅ STEP 10: Show preprocessing summary
print("✅ Preprocessing complete!\n")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print("\nTarget label mapping:")
for i, cls in enumerate(label_encoder.classes_):
    print(f"{i} -> {cls}")

print("\nSample features:\n", X_train.head())


KeyError: "['Ocean', 'Location_Continent'] not in index"

In [5]:
# ✅ STEP 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# ✅ STEP 2: Load dataset
df = pd.read_csv("data.csv")

# ✅ STEP 3: Convert numeric-like columns to float
numeric_cols = [
    'Latitude_Degrees', 'Longitude_Degrees', 'Turbidity', 'Cyclone_Frequency',
    'Temperature_Mean', 'Windspeed', 'SSTA', 'SSTA_DHW', 'TSA', 'TSA_DHWMean'
]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows with missing numeric values
df.dropna(subset=numeric_cols + ['Percent_Bleaching'], inplace=True)

# ✅ STEP 4: Encode categorical columns
cat_cols = ['Ocean_Name', 'Exposure']
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# ✅ STEP 5: Create severity target
def classify_bleaching(p):
    if p >= 60:
        return "Severe"
    elif p >= 15:
        return "Moderate"
    else:
        return "Mild"

df['Bleaching_Level'] = df['Percent_Bleaching'].apply(classify_bleaching)

# Encode target numerically
label_encoder = LabelEncoder()
df['Bleaching_Level_Encoded'] = label_encoder.fit_transform(df['Bleaching_Level'])

# ✅ STEP 6: Features and target
X = df.drop(columns=['Percent_Bleaching', 'Bleaching_Level', 'Bleaching_Level_Encoded'])
y = df['Bleaching_Level_Encoded']

# ✅ STEP 7: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("✅ Preprocessing complete!")
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print("Target label mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))


✅ Preprocessing complete!
X_train shape: (999, 24), X_test shape: (250, 24)
Target label mapping: {'Mild': np.int64(0), 'Moderate': np.int64(1), 'Severe': np.int64(2)}


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# ✅ STEP 1: Train Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# ✅ STEP 2: Predict on test set
y_pred = rf.predict(X_test)

# ✅ STEP 3: Classification report
print("\n🌊 Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# ✅ STEP 4: Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_, cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# ✅ STEP 5: Feature importance
importances = rf.feature_importances_
feature_names = X_train.columns
feat_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feat_df = feat_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x='Importance', y='Feature', data=feat_df)
plt.title("Feature Importances")
plt.show()

ModuleNotFoundError: No module named 'seaborn'

In [7]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# ✅ STEP 1: Train Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# ✅ STEP 2: Predict on test set
y_pred = rf.predict(X_test)

# ✅ STEP 3: Classification report
print("\n🌊 Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# ✅ STEP 4: Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_, cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# ✅ STEP 5: Feature importance
importances = rf.feature_importances_
feature_names = X_train.columns
feat_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feat_df = feat_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x='Importance', y='Feature', data=feat_df)
plt.title("Feature Importances")
plt.show()


ValueError: could not convert string to float: 'Belize and west Caribbean'

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# ✅ Load dataset
df = pd.read_csv("data.csv")

# ✅ Convert numeric columns
num_cols = ['Temperature_Mean', 'Windspeed', 'TSA']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# ✅ Drop rows with missing numeric data
df.dropna(subset=num_cols, inplace=True)

# ✅ Encode target variable
def classify_bleaching(p):
    if p >= 60:
        return "Severe"
    elif p >= 15:
        return "Moderate"
    else:
        return "Mild"

df['Bleaching_Level'] = df['Percent_Bleaching'].apply(classify_bleaching)

# Encode labels as numbers
label_encoder = LabelEncoder()
df['Bleaching_Level_enc'] = label_encoder.fit_transform(df['Bleaching_Level'])

# ✅ Select only relevant features
features = ['Ocean_Name', 'Exposure', 'Temperature_Mean', 'Windspeed', 'TSA']
X = df[features]
y = df['Bleaching_Level_enc']

# ✅ One-hot encode categorical features
X = pd.get_dummies(X, columns=['Ocean_Name', 'Exposure'], drop_first=True)

# ✅ Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ✅ Train Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# ✅ Test
y_pred = rf.predict(X_test)

# Decode predictions
pred_class = label_encoder.inverse_transform(y_pred)

# Quick summary
print("Sample predictions:", pred_class[:10])
print("Feature columns used:", X_train.columns.tolist())

Sample predictions: ['Moderate' 'Severe' 'Severe' 'Mild' 'Mild' 'Mild' 'Mild' 'Moderate'
 'Moderate' 'Severe']
Feature columns used: ['Temperature_Mean', 'Windspeed', 'TSA', 'Ocean_Name_Atlantic', 'Ocean_Name_Indian', 'Ocean_Name_Pacific', 'Ocean_Name_Red Sea', 'Exposure_Sheltered', 'Exposure_Sometimes']


In [17]:
# Example new coral site(s)
new_sites = pd.DataFrame([{
    'Temperature_Mean': 289.45,
    'Windspeed': 5,
    'TSA': 1.5,
    'Ocean_Name': 'Pacific',
    'Exposure': 'Sheltered'
}])

# One-hot encode categorical features (match training columns)
new_sites_encoded = pd.get_dummies(new_sites, columns=['Ocean_Name', 'Exposure'], drop_first=True)

# Add any missing columns (from training) with zeros
for col in X_train.columns:
    if col not in new_sites_encoded.columns:
        new_sites_encoded[col] = 0

# Ensure the column order matches training set
new_sites_encoded = new_sites_encoded[X_train.columns]

# Predict
pred_labels = rf.predict(new_sites_encoded)
pred_classes = label_encoder.inverse_transform(pred_labels)

print("Predicted Bleaching Levels:", pred_classes)


Predicted Bleaching Levels: ['Severe']


In [None]:
!pip install joblib