In [1]:
import pandas as pd

In [2]:
# Upload the EPA methane workbook through the Colab file picker. Outside Colab
# the google.colab package is unavailable, so skip the upload and rely on the
# workbook already being present in the working directory.
try:
    from google.colab import files
except ImportError:
    # Same type files.upload() returns (a dict), just empty.
    uploaded = {}
else:
    uploaded = files.upload()


Saving gmi-methane-data-epa(1).xlsx to gmi-methane-data-epa(1).xlsx


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report


In [5]:
# Load the data: read the "gmi-ch4-data" sheet of the uploaded workbook and keep
# only the columns used for modelling.
file_path = "gmi-methane-data-epa(1).xlsx"
# Open the workbook in a context manager so its file handle is closed as soon
# as the sheet has been read.
with pd.ExcelFile(file_path) as xls:
    df = pd.read_excel(xls, sheet_name="gmi-ch4-data")
# .copy() makes df_selected independent of df, so the column assignments in the
# following cells do not modify the raw frame.
df_selected = df[["year", "GMI Sector", "GMI Category", "value"]].copy()

In [6]:
# Convert the two categorical text columns into integer codes. One encoder per
# column is kept so each column has its own fitted class list.
label_encoder_sector = LabelEncoder()
label_encoder_category = LabelEncoder()
# Assign whole columns rather than using .loc[:, col] = ...: since pandas 2.0
# the .loc form writes into the existing (text/object) column in place, which
# leaves the integer codes stored with dtype=object instead of a numeric dtype.
df_selected["GMI Sector"] = label_encoder_sector.fit_transform(df_selected["GMI Sector"])
df_selected["GMI Category"] = label_encoder_category.fit_transform(df_selected["GMI Category"])

In [7]:
# Rescale the methane values to the [0, 1] range.
scaler = MinMaxScaler()
# fit_transform returns an (n_rows, 1) array; flatten it and replace the column
# outright so the result is stored as floats whatever the raw column's dtype
# was (.loc[:, col] = ... would first try to write into the old column in place).
df_selected["value"] = scaler.fit_transform(df_selected[["value"]]).ravel()

In [8]:
# Confirm the preprocessing cells ran, then preview the first five rows.
print("Preprocessing Completed")
df_selected.head(n=5)

Preprocessing Completed


Unnamed: 0,year,GMI Sector,GMI Category,value
0,1990,0,7,0.001731
1,1991,0,7,0.001848
2,1992,0,7,0.001966
3,1993,0,7,0.002083
4,1994,0,7,0.002201


In [9]:
# Define methane safety levels: Safe, Moderate, and Dangerous.
# The cut-offs are compared against the min-max scaled "value" column.

safe_threshold = 0.3
moderate_threshold = 0.7

def classify_methane(value):
    """Return the safety label for one scaled methane value.

    Values below ``safe_threshold`` are "Safe", values below
    ``moderate_threshold`` are "Moderate", and everything else is "Dangerous".
    A value that compares False against both cut-offs (for example NaN)
    therefore falls through to "Dangerous".
    """
    # Bands are checked in ascending order; the first upper bound that the
    # value is strictly below decides the label.
    bands = ((safe_threshold, "Safe"), (moderate_threshold, "Moderate"))
    for upper_bound, label in bands:
        if value < upper_bound:
            return label
    return "Dangerous"

In [10]:
# Label every row by passing its scaled methane value through classify_methane.
df_selected["Safety Level"] = df_selected["value"].map(classify_methane)

In [11]:
# Turn the text safety labels into integer class ids. The fitted encoder is
# kept because its classes_ attribute is used later to name the classes in the
# classification report.
# Note: this overwrites "Safety Level" in place, so re-running this cell alone
# would fit the encoder on the integer ids instead of the text labels.
target_encoder = LabelEncoder()
target_encoder.fit(df_selected["Safety Level"])
df_selected["Safety Level"] = target_encoder.transform(df_selected["Safety Level"])

In [12]:
# Train a Random Forest Classifier: imports for resampling, splitting, modelling, and evaluation

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [13]:
# Features and labels.
# "value" is dropped together with the target because "Safety Level" was
# derived directly from it; keeping it would hand the answer to the model.
X = df_selected.drop(["Safety Level", "value"], axis="columns")
# Target variable: the encoded safety level.
y = df_selected["Safety Level"]


In [14]:
# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [15]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [16]:
# Fit a 100-tree random forest on the training data; the fixed random_state
# makes the fitted forest reproducible across runs.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

In [17]:
# Score the held-out test set: predict its labels, then compute the fraction
# that match the true labels.
y_pred = model.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)


In [18]:
# Show overall accuracy as a percentage, followed by per-class precision,
# recall and F1; class names come from the fitted target encoder.
print(f"Model Accuracy: {accuracy * 100:.2f}")
print(
    "Classification Report:\n",
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_encoder.classes_,
    ),
)

Model Accuracy: 89.38
Classification Report:
               precision    recall  f1-score   support

   Dangerous       0.86      1.00      0.93     47780
    Moderate       0.89      0.96      0.92     47285
        Safe       0.95      0.72      0.82     47469

    accuracy                           0.89    142534
   macro avg       0.90      0.89      0.89    142534
weighted avg       0.90      0.89      0.89    142534

