In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import class_weight
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the data
data = pd.read_csv('F:/Landcover Classification_North Greece/CSV/sample345.csv')

# Data Preprocessing
# Separate the dependent and independent variables
X = data.drop(columns=['Veg-Code','Centroid_x','Centroid_y','ID'])  # Independent variables
y = data['Veg-Code']  # Dependent variable

# Encode categorical variables like Aspect and Slope if necessary
X['Aspect'] = LabelEncoder().fit_transform(X['Aspect'])
X['Slope'] = LabelEncoder().fit_transform(X['Slope'])

# Adjust target variable if necessary (0-based index for class labels)
y = y - 1  # Assuming 'Veg-Code' starts from 1 to N classes

# Standardize continuous features (optional but recommended for certain models)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Stratified train-test split: 70% train, 30% test with stratified sampling on 'Veg-Code'
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)


In [3]:
# Initialize the classifiers with added algorithms
models = {
    'Logistic Regression': LogisticRegression(max_iter=500, solver='newton-cg', penalty='l2', class_weight='balanced'),
    'Random Forest': RandomForestClassifier(class_weight='balanced'),
    'Extra Trees': ExtraTreesClassifier(class_weight='balanced'),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(class_weight='balanced'),
    'SVM (Linear Kernel)': SVC(kernel='linear', max_iter=600000, class_weight='balanced', C=0.1),
    'SVM (RBF Kernel)': SVC(kernel='rbf', max_iter=600000, class_weight='balanced', C=0.1),
    'Gradient Boosting': GradientBoostingClassifier(),
    'HistGradient Boosting': HistGradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(algorithm='SAMME'),
    'XGBoost': xgb.XGBClassifier(eval_metric='mlogloss'),
    'LightGBM': lgb.LGBMClassifier(force_col_wise=True, class_weight='balanced'),
    'CatBoost': cb.CatBoostClassifier(verbose=0)
}


# Dictionary to store the results
results_dict = {}

In [4]:
# Train and evaluate the models
for name, model in models.items():
    print(f"Training {name}...")
    
    # Train the model on the same training data
    model.fit(X_train, y_train)
    
    # Predict on the test data
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {test_accuracy:.3f}")

    # Step 3: Calculate training accuracy 
    train_predictions = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_predictions)
    print(f"Training Accuracy: {train_accuracy:.3f}")
    
    # Calculate and store evaluation metrics
    confusion = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)  # Handle zero division issue
    accuracy = accuracy_score(y_test, y_pred)
    
    # Save results in the dictionary
    results_dict[name] = {
        'Confusion Matrix': pd.DataFrame(confusion),
        'Classification Report': pd.DataFrame(class_report).transpose(),
        'Accuracy Score': accuracy
    }
    
    print(f"{name} completed with Accuracy: {accuracy:.4f}")

Training Logistic Regression...
Test Accuracy: 0.703
Training Accuracy: 0.732
Logistic Regression completed with Accuracy: 0.7026
Training Random Forest...
Test Accuracy: 0.707
Training Accuracy: 1.000
Random Forest completed with Accuracy: 0.7070
Training Extra Trees...
Test Accuracy: 0.710
Training Accuracy: 1.000
Extra Trees completed with Accuracy: 0.7102
Training KNN...
Test Accuracy: 0.641
Training Accuracy: 0.724
KNN completed with Accuracy: 0.6405
Training Decision Tree...
Test Accuracy: 0.568
Training Accuracy: 1.000
Decision Tree completed with Accuracy: 0.5675
Training SVM (Linear Kernel)...
Test Accuracy: 0.706
Training Accuracy: 0.717
SVM (Linear Kernel) completed with Accuracy: 0.7059
Training SVM (RBF Kernel)...
Test Accuracy: 0.611
Training Accuracy: 0.609
SVM (RBF Kernel) completed with Accuracy: 0.6111
Training Gradient Boosting...
Test Accuracy: 0.704
Training Accuracy: 0.980
Gradient Boosting completed with Accuracy: 0.7037
Training HistGradient Boosting...
Test Acc

In [5]:
# Display or save the results as needed
# For example, to show accuracy for each model:
for model_name, metrics in results_dict.items():
    print(f"Accuracy for {model_name}: {metrics['Accuracy Score']:.4f}")

Accuracy for Logistic Regression: 0.7026
Accuracy for Random Forest: 0.7070
Accuracy for Extra Trees: 0.7102
Accuracy for KNN: 0.6405
Accuracy for Decision Tree: 0.5675
Accuracy for SVM (Linear Kernel): 0.7059
Accuracy for SVM (RBF Kernel): 0.6111
Accuracy for Gradient Boosting: 0.7037
Accuracy for HistGradient Boosting: 0.7266
Accuracy for AdaBoost: 0.5468
Accuracy for XGBoost: 0.7255
Accuracy for LightGBM: 0.7135
Accuracy for CatBoost: 0.7255


In [None]:
# Cross-validation for model evaluation with Stratified K-Fold
cv_results = {}  # Initialize cv_results dictionary
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Define the Stratified K-Folds

for name, model in models.items():
    print(f"Cross-validating {name}...")
    scores = cross_val_score(model, X_scaled, y, cv=kf)  # Perform cross-validation
    cv_results[name] = f"{scores.mean():.3f} ± {scores.std():.3f}"
    print(f"Cross-validation scores for {name}: {scores.mean()} ± {scores.std()}")

# Optional: Output cross-validation results
for model_name, score in cv_results.items():
    print(f"Cross-Validation Score for {model_name}: {score}")

In [None]:
# Assuming importances dictionary is filled correctly in the previous code
importances = {}
top_n_features = 10  # Number of top features to save

# Calculate feature importance for each model
for name, model in models.items():
    print(f"\nProcessing feature importances for {name}...")

    # Skip SVM models for feature importance
    if "SVM" in name:
        print(f"Skipping feature importance for {name} as it is not natively supported.")
        continue

    if hasattr(model, 'feature_importances_'):
        feature_importances = model.feature_importances_
        importances[name] = {
            'features': X.columns,
            'importances': feature_importances
        }
    elif name in ['Logistic Regression', 'KNN']:
        perm_importance = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
        importances[name] = {
            'features': X.columns,
            'importances': perm_importance.importances_mean
        }

In [None]:
# Save the top features to Excel
with pd.ExcelWriter('top_10_features_for_each_model.xlsx', engine='openpyxl') as writer:
    for name, data in importances.items():
        df = pd.DataFrame({
            'Feature': data['features'],
            'Importance': data['importances']
        })
        df.to_excel(writer, sheet_name=name[:30], index=False)

print("\nTop 10 features for each model have been saved to 'top_10_features_for_each_model.xlsx'")

In [None]:
# Assuming `cv_results` and `results_dict` are defined earlier in your code
# Save the evaluation results to Excel
try:
    with pd.ExcelWriter('classification_results.xlsx', engine='openpyxl') as writer:
        # Save cross-validation results
        cv_df = pd.DataFrame.from_dict(cv_results, orient='index', columns=['CV Score'])
        cv_df.to_excel(writer, sheet_name='Cross_Validation')

        # Save results for each model
        for model_name, metrics in results_dict.items():
            # Ensure the confusion matrix and classification report are DataFrames
            confusion_matrix_df = metrics['Confusion Matrix']
            classification_report_df = metrics['Classification Report']

            # Write confusion matrix to Excel
            confusion_matrix_df.to_excel(writer, sheet_name=f'{model_name[:28]}_Confusion', index=False)

            # Write classification report to Excel
            classification_report_df.to_excel(writer, sheet_name=f'{model_name[:28]}_Report', index=False)

    print("Classification results have been saved to 'classification_results.xlsx'")

except Exception as e:
    print(f"An error occurred while saving the results: {e}")