In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

In [24]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Define the columns to use
water_important_cols = [
    'meteo_rain_height',
    'meteo_evapotranspiration_grid',
    'meteo_evapotranspiration_Monteith',
    'meteo_sunshine_%',
    'meteo_sunshine_duration',
    'meteo_temperature_avg',
    'meteo_temperature_min',
    'meteo_temperature_max',
    'meteo_amplitude_tn_tx',
    'meteo_humidity_avg',
    'meteo_humidity_min',
    'meteo_pressure_avg',
    'meteo_cloudiness',
    'meteo_frost_duration',
    'meteo_wetting_duration',
    'meteo_humidity_duration_below_40%',
    'meteo_humidity_duration_above_80%',
    'meteo_radiation_direct',
    'meteo_radiation',
    'meteo_radiation_IR',
    'meteo_if_snow',
    'meteo_snow_height',
    'meteo_snow_thickness_6h',
    'meteo_snow_thickness_max'
]

# Add the target column
target_column = 'piezo_groundwater_level_category'

# Load the data
# If your dataset is large, you can read it in chunks. For simplicity, we'll read it all at once here.
# Replace 'X_train_Hi5.csv' with your actual file path
file_path = 'X_train_Hi5.csv'

# Read the data
data = pd.read_csv(file_path, usecols=water_important_cols + [target_column])

# Drop rows with missing target values
data = data.dropna(subset=[target_column])

# Separate features and target variable
X = data[water_important_cols]
y = data[target_column]

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# For this dataset, let's check which columns are categorical
print("Categorical columns:", categorical_cols)

# If 'meteo_if_snow' is not numerical, treat it as categorical
# Let's assume 'meteo_if_snow' is categorical (e.g., 'Yes', 'No')
if 'meteo_if_snow' in X.columns and X['meteo_if_snow'].dtype == 'object':
    categorical_cols.append('meteo_if_snow')
    numerical_cols.remove('meteo_if_snow')

# Handle missing values and encoding
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='median')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create a pipeline that includes the preprocessor and the classifier
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Encode target variable
# Since it's categorical (e.g., 'very low', 'low', etc.), we need to encode it
y_encoded = y.astype('category').cat.codes

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Predict on test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))


Categorical columns: []

Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.30      0.30    129952
           1       0.29      0.29      0.29    116354
           2       0.30      0.28      0.29    125818
           3       0.37      0.33      0.35     86402
           4       0.38      0.43      0.40    107538

    accuracy                           0.32    566064
   macro avg       0.33      0.32      0.32    566064
weighted avg       0.32      0.32      0.32    566064


Confusion Matrix:
[[39534 28255 27885 14061 20217]
 [29816 33627 18371 19368 15172]
 [30960 20165 35176  9611 29906]
 [17057 21563 10333 28101  9348]
 [18527 12174 25597  5352 45888]]

Accuracy Score:
0.32209432149014955


### 1. Data Exploration

In [3]:
list(h.columns)

['row_index',
 'piezo_station_department_code',
 'piezo_station_update_date',
 'piezo_station_investigation_depth',
 'piezo_station_department_name',
 'piezo_station_commune_code_insee',
 'piezo_station_pe_label',
 'piezo_station_bdlisa_codes',
 'piezo_station_altitude',
 'piezo_station_bss_code',
 'piezo_station_commune_name',
 'piezo_station_longitude',
 'piezo_station_latitude',
 'piezo_station_bss_id',
 'piezo_bss_code',
 'piezo_measurement_date',
 'piezo_obtention_mode',
 'piezo_status',
 'piezo_qualification',
 'piezo_continuity_code',
 'piezo_continuity_name',
 'piezo_producer_code',
 'piezo_producer_name',
 'piezo_measure_nature_code',
 'piezo_measure_nature_name',
 'meteo_id',
 'meteo_name',
 'meteo_latitude',
 'meteo_longitude',
 'meteo_altitude',
 'meteo_date',
 'meteo_rain_height',
 'meteo_DRR',
 'meteo_temperature_min',
 'meteo_time_tn',
 'meteo_temperature_max',
 'meteo_time_tx',
 'meteo_temperature_avg',
 'meteo_temperature_avg_threshold',
 'meteo_temperature_min_ground'