In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from shapely import wkt
import geopandas as gpd

# Load datasets
admin_data = pd.read_csv('/content/sample_data/admin_merged.csv')
soil_data = pd.read_csv('/content/sample_data/soil_merged.csv')
spatial_data = pd.read_csv('/content/sample_data/spatial_merged.csv')

# Handle missing data
admin_data = admin_data.dropna(subset=['SURFACE', 'PERIMETRE'])
soil_data = soil_data.dropna(subset=['SURFACE', 'PERIMETRE'])
spatial_data = spatial_data.dropna(subset=['SURFACE', 'PERIMETRE'])

# Combine datasets
combined_data = pd.merge(admin_data, soil_data, on=['SURFACE', 'PERIMETRE'], how='outer')
combined_data = pd.merge(combined_data, spatial_data, on=['SURFACE', 'PERIMETRE'], how='outer')

# Remove columns with empty values or '?????' columns
combined_data = combined_data.loc[:, (combined_data != '?????').any(axis=0)]  # Remove columns with '?????'
combined_data = combined_data.dropna(axis=1, how='all')  # Remove completely empty columns

# Handle categorical columns by applying one-hot encoding
categorical_columns = ['ADM_GOV', 'CLASS_FIN', 'NOM', 'TEXTURE_x', 'COULEUR_x', 'ROCHE_ME_x']
combined_data = pd.get_dummies(combined_data, columns=categorical_columns, drop_first=True)

# Handle remaining object columns that are not encoded
for col in combined_data.select_dtypes(include=['object']).columns:
    print(f"Encoding column: {col}")
    le = LabelEncoder()
    combined_data[col] = le.fit_transform(combined_data[col])

# Handle missing values
combined_data = combined_data.fillna(0)

# Convert soil geometry to GeoDataFrame
if 'geometry' in soil_data.columns:
    soil_data['geometry'] = soil_data['geometry'].apply(wkt.loads)  # Convert WKT to geometry
    soil_gdf = gpd.GeoDataFrame(soil_data, geometry='geometry')  # Create GeoDataFrame
    soil_gdf['centroid_x'] = soil_gdf.geometry.centroid.x  # Extract centroid X coordinate
    soil_gdf['centroid_y'] = soil_gdf.geometry.centroid.y  # Extract centroid Y coordinate
    soil_gdf['area'] = soil_gdf.geometry.area  # Calculate area
else:
    print("Warning: 'geometry' column is missing in soil data. Skipping spatial feature extraction.")

# Convert spatial geometry to GeoDataFrame
if 'geometry' in spatial_data.columns:
    spatial_data['geometry'] = spatial_data['geometry'].apply(wkt.loads)  # Convert WKT to geometry
    spatial_gdf = gpd.GeoDataFrame(spatial_data, geometry='geometry')  # Create GeoDataFrame
    spatial_gdf['centroid_x'] = spatial_gdf.geometry.centroid.x  # Extract centroid X coordinate
    spatial_gdf['centroid_y'] = spatial_gdf.geometry.centroid.y  # Extract centroid Y coordinate
    spatial_gdf['area'] = spatial_gdf.geometry.area  # Calculate area
else:
    print("Warning: 'geometry' column is missing in spatial data. Skipping spatial feature extraction.")

# Combine spatial features with the rest of the data
combined_data = pd.merge(combined_data, soil_gdf[['SURFACE', 'PERIMETRE', 'centroid_x', 'centroid_y', 'area']], on=['SURFACE', 'PERIMETRE'], how='left')
combined_data = pd.merge(combined_data, spatial_gdf[['SURFACE', 'PERIMETRE', 'centroid_x', 'centroid_y', 'area']], on=['SURFACE', 'PERIMETRE'], how='left')

# Normalize numerical features
scaler = StandardScaler()

# Check if 'area' column exists before normalizing
numerical_features = ['SURFACE', 'PERIMETRE']
if 'area' in combined_data.columns:
    numerical_features.append('area')

# Normalize features
combined_data[numerical_features] = scaler.fit_transform(combined_data[numerical_features])

# Define the target variable (suitability for olive trees)
def is_suitable_for_olive_trees(row):
    # Criteria based on domain knowledge
    if row['SURFACE'] > 50 and row['CLASS_FIN'] == 'Agricultural':
        if row['TEXTURE_x'] in ['sandy-loam', 'loam', 'clay-loam']:
            if row['SALURE_x'] < 10 and row['PROFOND_x'] > 50 and row['CHARG_CA_x'] > 5:
                if 10 < row['ACT_EAU_x'] < 70:
                    return 1  # Suitable land for olive trees
    return 0  # Not suitable land for olive trees

# Apply the rule to the dataset to create the target variable
combined_data['suitable_for_olive_trees'] = combined_data.apply(is_suitable_for_olive_trees, axis=1)

# Features (X) and target (Y)
columns_to_drop = ['suitable_for_olive_trees']
if 'geometry' in combined_data.columns:
    columns_to_drop.append('geometry')

X = combined_data.drop(columns=columns_to_drop)
y = combined_data['suitable_for_olive_trees']

# Check class balance in the target variable
print("Class distribution of 'suitable_for_olive_trees':")
print(y.value_counts())

# Optionally, apply class balancing techniques (e.g., SMOTE, undersampling) if needed

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(random_state=42, max_depth=10, min_samples_split=10, min_samples_leaf=5)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

# Cross-validation scores
cv_scores = cross_val_score(model, X, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Optionally, print feature importance to understand which features matter most
importances = model.feature_importances_
features = X.columns

feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(feature_importance_df)


Encoding column: ADM_DEL
Encoding column: ?????
Encoding column: texture _C
Encoding column: texture _N
Encoding column: COULEUR_Cl
Encoding column: nom_couleu
Encoding column: geometry_x
Encoding column: geometry_y
Class distribution of 'suitable_for_olive_trees':
suitable_for_olive_trees
0    865072
Name: count, dtype: int64
Accuracy Score: 1.0
Cross-validation scores: [1. 1. 1. 1. 1.]
Mean cross-validation score: 1.0
Feature Importance:
              Feature  Importance
0             ADM_IDE         0.0
64    COULEUR_x_705.0         0.0
74   ROCHE_ME_x_107.0         0.0
73   ROCHE_ME_x_105.0         0.0
72   ROCHE_ME_x_103.0         0.0
..                ...         ...
31               TYPE         0.0
30                 ID         0.0
29        index_right         0.0
28         geometry_y         0.0
101            area_y         0.0

[102 rows x 2 columns]
