In [18]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# List of parameter dictionaries to test
args = {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}


# Load the dataset
df = pd.read_csv('train.csv')

# Select only the soil_type columns
soil_columns = [col for col in df.columns if col.startswith('Soil_Type')]

# Count the number of 0's and 1's in each soil_type column
zero_one_counts = pd.DataFrame({
    'Zeros': (df[soil_columns] == 0).sum(),
    'Ones': (df[soil_columns] == 1).sum()
})


# Check for NaNs and fill if needed
df = df.fillna(0)  # Replace NaNs with 0, or use another method as appropriate

# Step 1: Apply log transformation to skewed columns, handling negative values by adding a constant
skewed_columns = [
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
    'Hillshade_3pm',
    "Hillshade_9am",
    "Hillshade_Noon"
]

sparse_columns = zero_one_counts[zero_one_counts['Ones'] < 55].index
df = df.drop(columns=sparse_columns)

# Apply the log transformation and handle negative values by shifting
for col in skewed_columns:
    # Shift values if necessary to make them positive before applying log
    min_value = df[col].min()
    shift = 1 - min_value if min_value <= 0 else 0
    df[f'log_{col}'] = np.log(df[col] + shift + 1)

# Step 2: Drop original skewed columns
df = df.drop(columns=skewed_columns)


# Separating the target column 'Cover_Type' from the features
X = df.drop(columns=['Cover_Type'])  # Drop target and non-feature columns
y = df['Cover_Type']  # Target column

# Normalize the numerical features
scaler = StandardScaler()
X_scaled = X.copy()

# List of numerical columns to be scaled
numerical_columns = [
    'Elevation', 'Aspect', 'Slope', 'log_Horizontal_Distance_To_Hydrology',
    'log_Vertical_Distance_To_Hydrology', 'log_Horizontal_Distance_To_Roadways',
    'log_Hillshade_9am', 'log_Hillshade_Noon', 'log_Hillshade_3pm', 
    'log_Horizontal_Distance_To_Fire_Points'
]

# Apply scaling only to the numerical columns
X_scaled[numerical_columns] = scaler.fit_transform(X[numerical_columns])

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42)


# Initialize the LGBMClassifier with the current parameters
rf_model = RandomForestClassifier(**args)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)
y_pred_train = rf_model.predict(X_train)
# Evaluate the model's performance
test_accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, y_pred_train)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Train Accuracy: 1.0000
Test Accuracy: 0.8948


In [20]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    # Define the search space
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_categorical('max_depth', [None] + list(np.arange(10, 50, 10)))
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # Initialize the model with these parameters
    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        random_state=42
    )
    
    # Perform cross-validation and calculate mean accuracy
    accuracy = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy').mean()

    # Stop early if accuracy meets or exceeds the threshold
    if accuracy >= 0.92:
        raise optuna.exceptions.TrialPruned()  # Stop early if threshold is reached
    
    return accuracy

# Optimize with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=-1)

# Get the best hyperparameters and evaluate
best_params = study.best_params
best_rf_model = RandomForestClassifier(**best_params)
best_rf_model.fit(X_train, y_train)

y_pred = best_rf_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Final Test Accuracy with Optuna: {test_accuracy:.4f}")
print("Best parameters found by Optuna:", best_params)


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-11-07 17:18:35,422] A new study created in memory with name: no-name-f9d7e658-7565-4a9e-945b-95efe06bb59e
[I 2024-11-07 17:18:50,775] Trial 0 finished with value: 0.8435477891091712 and parameters: {'n_estimators': 112, 'max_depth': np.int64(30), 'max_features': 'sqrt', 'min_samples_split': 5, 'min_samples_leaf': 6, 'bootstrap': True}. Best is trial 0 with value: 0.8435477891091712.
[I 2024-11-07 17:19:29,108] Trial 3 finished with value: 0.8418577202216089 and parameters: {'n_estimators': 449, 'max_depth': np.int64(40), 'max_features': 'log2', 'min_samples_split': 6, 'min_samples_leaf': 6, 'bootstrap': True}. Best is trial 0 with value: 0.8435477891091712.
[I 2024-11-07 17:19:33,481] Trial 6 finished with value: 0.8661817453225936 and parameters: {'n_estimators': 388, 'max_depth': np.int64(30), 'max_features': 'sqrt', 'min_samples_split': 8, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 6 with value: 0.86618174532259

In [5]:
# Initialize the LGBMClassifier with the current parameters
RF_model = RandomForestClassifier(**args)

# Train the model
RF_model.fit(X_scaled, y)

In [4]:
df_subs = pd.read_csv('test-full.csv')
df_subs = df_subs.drop(columns=sparse_columns)

# Apply the log transformation and handle negative values by shifting
for col in skewed_columns:
    # Shift values if necessary to make them positive before applying log
    min_value = df_subs[col].min()
    shift = 1 - min_value if min_value <= 0 else 0
    df_subs[f'log_{col}'] = np.log(df_subs[col] + shift + 1)

# Step 2: Drop original skewed columns
df_subs = df_subs.drop(columns=skewed_columns)
df_subs_scaled = df_subs.copy()
df_subs_scaled[numerical_columns] = scaler.transform(df_subs[numerical_columns])

In [7]:
submission_preds = pd.DataFrame(rf_model.predict(df_subs_scaled), columns=['Cover_Type'])

In [8]:
df.to_csv('filename.csv', index=False)

In [10]:
submission_preds.to_csv('submission07-11-24.csv', index=True)

In [12]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('submission07-11-24.csv')

# Increment the 'Id' column by 1
df['Id'] = df['Id'] + 1

# Save the modified DataFrame back to CSV
df.to_csv('submission07-11-24_8.csv', index=False)
