In [310]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [311]:
df = pd.read_csv('./forest_risks_dataset.csv')

X = df['Description']  
y = df['Priority']     

In [312]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
joblib.dump(label_encoder, 'models/label_encoder.pkl')

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=41, shuffle=True)

In [313]:
vectorizer = TfidfVectorizer(max_features=2000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

joblib.dump(vectorizer, 'models/vectorizer.pkl')

['models/vectorizer.pkl']

In [314]:
param_grid_logreg = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],  # These solvers support both L1 and L2 regularization
    'max_iter': [100, 200, 300, 500]
}

param_grid_svm = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [3, 4, 5],  # Relevant for polynomial kernel
    'coef0': [0, 0.1, 1]  # Relevant for polynomial and sigmoid kernels
}

param_grid_rf = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required at a leaf node
    'bootstrap': [True, False]  # Whether to use bootstrap samples
}

In [315]:
logistic_model = LogisticRegression()
gsLogisticModel = GridSearchCV(logistic_model, param_grid_logreg, cv=10, verbose=1, n_jobs=-1)
gsLogisticModel.fit(X_train_vec, y_train)

svm_model = SVC()
gsSvmModel = GridSearchCV(svm_model, param_grid_svm, cv=10, verbose=1, n_jobs=-1)
gsSvmModel.fit(X_train_vec, y_train)

rf_model = RandomForestClassifier()
gsRfModel = GridSearchCV(rf_model, param_grid_rf, cv=10, verbose=1, n_jobs=-1)
gsRfModel.fit(X_train_vec, y_train)

best_logistic_model = gsLogisticModel.best_estimator_
best_svm_model = gsSvmModel.best_estimator_
best_rf_model = gsRfModel.best_estimator_

Fitting 10 folds for each of 160 candidates, totalling 1600 fits


800 fits failed out of a total of 1600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\aayud\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\aayud\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\aayud\AppData\Roaming\Python\Python311\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.

Fitting 10 folds for each of 360 candidates, totalling 3600 fits
Fitting 10 folds for each of 216 candidates, totalling 2160 fits


In [316]:
y_pred_logistic = best_logistic_model.predict(X_test_vec)
y_pred_svm = best_svm_model.predict(X_test_vec)
y_pred_rf = best_rf_model.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred_logistic)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_logistic, target_names=label_encoder.classes_))

accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))

joblib.dump(best_logistic_model, 'models/logreg_model.pkl')
joblib.dump(best_svm_model, 'models/svm_model.pkl')
joblib.dump(best_rf_model, 'models/rf_model.pkl')

Accuracy: 94.59%

Classification Report:
              precision    recall  f1-score   support

        High       0.97      1.00      0.98        30
         Low       1.00      0.56      0.71         9
      Medium       0.92      1.00      0.96        35

    accuracy                           0.95        74
   macro avg       0.96      0.85      0.89        74
weighted avg       0.95      0.95      0.94        74

SVM Accuracy: 93.24%

Classification Report:
              precision    recall  f1-score   support

        High       0.97      1.00      0.98        30
         Low       1.00      0.44      0.62         9
      Medium       0.90      1.00      0.95        35

    accuracy                           0.93        74
   macro avg       0.96      0.81      0.85        74
weighted avg       0.94      0.93      0.92        74

Random Forest Accuracy: 91.89%

Classification Report:
              precision    recall  f1-score   support

        High       0.97      0.97      0.9

['models/rf_model.pkl']

In [317]:
y.value_counts()

Priority
Medium    174
High      148
Low        45
Name: count, dtype: int64

In [327]:
logreg_model = joblib.load('models/logreg_model.pkl')
svm_model = joblib.load('models/svm_model.pkl')
rf_model = joblib.load('models/rf_model.pkl')

vectorizer = joblib.load('models/vectorizer.pkl')

label_encoder = joblib.load('models/label_encoder.pkl')

def predict_priority(text):
    """Preprocess text and predict priority using all models."""
    
    text_vec = vectorizer.transform([text])

    logreg_pred = logreg_model.predict(text_vec)[0]
    svm_pred = svm_model.predict(text_vec)[0]
    rf_pred = rf_model.predict(text_vec)[0]

    logreg_priority = label_encoder.inverse_transform([logreg_pred])[0]
    svm_priority = label_encoder.inverse_transform([svm_pred])[0]
    rf_priority = label_encoder.inverse_transform([rf_pred])[0]

    return logreg_priority, svm_priority, rf_priority

new_texts = [
    "A massive wildfire is approaching a nearby village, causing panic.",
    "A group of armed poachers was seen near the protected tiger reserve.",
    "A rare medicinal plant species is being stolen from the forest.",
    "A tourist was injured while hiking and is fine.",
    "Poachers are spotted near a protected area, hunting endangered species in broad daylight",
    "An illegal logging operation that devastates large sections of the forest, destroying habitats and causing irreversible damage",
    "A camper found dead after lethal bear attack near campfire",
    "A minor puddle forms on a hiking path after light rain, causing a slight inconvenience but no hazard"
]

for text in new_texts:
    logreg_pred, svm_pred, rf_pred = predict_priority(text)
    
    print(f"\nText: {text}")
    print(f"🔹 Logistic Regression Prediction: {logreg_pred}")
    print(f"🔹 SVM Prediction: {svm_pred}")
    print(f"🔹 Random Forest Prediction: {rf_pred}")


Text: A massive wildfire is approaching a nearby village, causing panic.
🔹 Logistic Regression Prediction: High
🔹 SVM Prediction: High
🔹 Random Forest Prediction: High

Text: A group of armed poachers was seen near the protected tiger reserve.
🔹 Logistic Regression Prediction: Low
🔹 SVM Prediction: Low
🔹 Random Forest Prediction: Medium

Text: A rare medicinal plant species is being stolen from the forest.
🔹 Logistic Regression Prediction: High
🔹 SVM Prediction: High
🔹 Random Forest Prediction: Medium

Text: A tourist was injured while hiking and is fine.
🔹 Logistic Regression Prediction: Medium
🔹 SVM Prediction: Medium
🔹 Random Forest Prediction: Medium

Text: Poachers are spotted near a protected area, hunting endangered species in broad daylight
🔹 Logistic Regression Prediction: Medium
🔹 SVM Prediction: Medium
🔹 Random Forest Prediction: Medium

Text: An illegal logging operation that devastates large sections of the forest, destroying habitats and causing irreversible damage
🔹 Log

In [319]:
import pandas as pd
df = pd.read_csv('./trial.csv')
print(df['Priority'].value_counts(normalize=True) * 100)  # Check class distribution

Priority
Medium    40.625
High      30.000
Low       29.375
Name: proportion, dtype: float64


Class weights

In [320]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Load dataset
df = pd.read_csv('./trial.csv')
X = df['Description']  
y = df['Priority']  

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=41, shuffle=True)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=2000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Save vectorizer & label encoder
joblib.dump(vectorizer, 'models/vectorizer_with_weights.pkl')
joblib.dump(label_encoder, 'models/label_encoder_with_weights.pkl')

# Compute class weights
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

class_weights = compute_class_weight('balanced', classes=np.unique(y_encoded), y=y_encoded)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}
print("Computed Class Weights:", class_weight_dict)

# Logistic Regression with Class Weights
logistic_model = LogisticRegression(class_weight=class_weight_dict, max_iter=500, solver='liblinear')
logistic_model.fit(X_train_vec, y_train)
joblib.dump(logistic_model, 'models/logreg_model_with_weights.pkl')

# SVM with Class Weights
svm_model = SVC(class_weight=class_weight_dict, kernel='linear')
svm_model.fit(X_train_vec, y_train)
joblib.dump(svm_model, 'models/svm_model_with_weights.pkl')

# Random Forest with Class Weights
rf_model = RandomForestClassifier(class_weight='balanced', n_estimators=200, max_depth=20, random_state=41)
rf_model.fit(X_train_vec, y_train)
joblib.dump(rf_model, 'models/rf_model_with_weights.pkl')

# Predictions
y_pred_logistic = logistic_model.predict(X_test_vec)
y_pred_svm = svm_model.predict(X_test_vec)
y_pred_rf = rf_model.predict(X_test_vec)

# Accuracy and Reports
accuracy_log = accuracy_score(y_test, y_pred_logistic)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f"Logistic Regression Accuracy: {accuracy_log * 100:.2f}%")
print(classification_report(y_test, y_pred_logistic, target_names=label_encoder.classes_))

print(f"SVM Accuracy: {accuracy_svm * 100:.2f}%")
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))

print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))

# Inference Code
def predict_priority(description):
    vectorizer = joblib.load('models/vectorizer_with_weights.pkl')
    label_encoder = joblib.load('models/label_encoder_with_weights.pkl')
    logistic_model = joblib.load('models/logreg_model_with_weights.pkl')
    svm_model = joblib.load('models/svm_model_with_weights.pkl')
    rf_model = joblib.load('models/rf_model_with_weights.pkl')
    
    description_vec = vectorizer.transform([description])
    
    pred_log = logistic_model.predict(description_vec)[0]
    pred_svm = svm_model.predict(description_vec)[0]
    pred_rf = rf_model.predict(description_vec)[0]
    
    log_label = label_encoder.inverse_transform([pred_log])[0]
    svm_label = label_encoder.inverse_transform([pred_svm])[0]
    rf_label = label_encoder.inverse_transform([pred_rf])[0]
    
    return {
        "Logistic Regression Prediction": log_label,
        "SVM Prediction": svm_label,
        "Random Forest Prediction": rf_label
    }

# Example usage
low_risk = "A small fallen tree branch is blocking part of the trail."
medium_risk = "A group of hikers reported seeing a bear near the campsite."
high_risk = "A large wildfire spreading rapidly near the forest reserve."

print("Low Risk Prediction:", predict_priority(low_risk))
print("Medium Risk Prediction:", predict_priority(medium_risk))
print("High Risk Prediction:", predict_priority(high_risk))

Computed Class Weights: {0: 1.1111111111111112, 1: 1.1347517730496455, 2: 0.8205128205128205}
Logistic Regression Accuracy: 68.75%
              precision    recall  f1-score   support

        High       0.94      0.75      0.83        20
         Low       0.60      0.67      0.63        18
      Medium       0.61      0.65      0.63        26

    accuracy                           0.69        64
   macro avg       0.71      0.69      0.70        64
weighted avg       0.71      0.69      0.69        64

SVM Accuracy: 62.50%
              precision    recall  f1-score   support

        High       0.87      0.65      0.74        20
         Low       0.57      0.72      0.63        18
      Medium       0.54      0.54      0.54        26

    accuracy                           0.62        64
   macro avg       0.66      0.64      0.64        64
weighted avg       0.65      0.62      0.63        64

Random Forest Accuracy: 60.94%
              precision    recall  f1-score   support



Random Over Sampling