In [3]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# List of parameter dictionaries to test
args = {'iterations': 1200, 'depth': 8, 'learning_rate': 0.05820566937770431, 'l2_leaf_reg': 0.009648489438947926, 'border_count': 173, 'random_strength': 4}

# Load the dataset
df = pd.read_csv('train.csv')

# Select only the soil_type columns
soil_columns = [col for col in df.columns if col.startswith('Soil_Type')]

# Count the number of 0's and 1's in each soil_type column
zero_one_counts = pd.DataFrame({
    'Zeros': (df[soil_columns] == 0).sum(),
    'Ones': (df[soil_columns] == 1).sum()
})


# Check for NaNs and fill if needed
df = df.fillna(0)  # Replace NaNs with 0, or use another method as appropriate

# Step 1: Apply log transformation to skewed columns, handling negative values by adding a constant
skewed_columns = [
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
    'Hillshade_3pm',
    "Hillshade_9am",
    "Hillshade_Noon"
]

sparse_columns = zero_one_counts[zero_one_counts['Ones'] < 55].index
df = df.drop(columns=sparse_columns)

# Apply the log transformation and handle negative values by shifting
for col in skewed_columns:
    # Shift values if necessary to make them positive before applying log
    min_value = df[col].min()
    shift = 1 - min_value if min_value <= 0 else 0
    df[f'log_{col}'] = np.log(df[col] + shift + 1)

# Step 2: Drop original skewed columns
df = df.drop(columns=skewed_columns)


# Separating the target column 'Cover_Type' from the features
X = df.drop(columns=['Cover_Type'])  # Drop target and non-feature columns
y = df['Cover_Type']  # Target column

# Normalize the numerical features
scaler = StandardScaler()
X_scaled = X.copy()

# List of numerical columns to be scaled
numerical_columns = [
    'Elevation', 'Aspect', 'Slope', 'log_Horizontal_Distance_To_Hydrology',
    'log_Vertical_Distance_To_Hydrology', 'log_Horizontal_Distance_To_Roadways',
    'log_Hillshade_9am', 'log_Hillshade_Noon', 'log_Hillshade_3pm', 
    'log_Horizontal_Distance_To_Fire_Points'
]

# Apply scaling only to the numerical columns
X_scaled[numerical_columns] = scaler.fit_transform(X[numerical_columns])

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


# Initialize the LGBMClassifier with the current parameters
catboost_model = CatBoostClassifier(**args)

# Train the model
catboost_model.fit(X_train, y_train)

# Make predictions
y_pred = catboost_model.predict(X_test)
y_pred_train = catboost_model.predict(X_train)
# Evaluate the model's performance
test_accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, y_pred_train)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")



0:	learn: 1.8146785	total: 64.7ms	remaining: 1m 17s
1:	learn: 1.7054649	total: 161ms	remaining: 1m 36s
2:	learn: 1.6296098	total: 279ms	remaining: 1m 51s
3:	learn: 1.5456796	total: 337ms	remaining: 1m 40s
4:	learn: 1.4792048	total: 443ms	remaining: 1m 45s
5:	learn: 1.4152149	total: 523ms	remaining: 1m 44s
6:	learn: 1.3656090	total: 633ms	remaining: 1m 47s
7:	learn: 1.3193756	total: 692ms	remaining: 1m 43s
8:	learn: 1.2783690	total: 762ms	remaining: 1m 40s
9:	learn: 1.2375822	total: 812ms	remaining: 1m 36s
10:	learn: 1.2009711	total: 878ms	remaining: 1m 34s
11:	learn: 1.1679770	total: 986ms	remaining: 1m 37s
12:	learn: 1.1365395	total: 1.1s	remaining: 1m 40s
13:	learn: 1.1024455	total: 1.2s	remaining: 1m 41s
14:	learn: 1.0777145	total: 1.34s	remaining: 1m 45s
15:	learn: 1.0515761	total: 1.45s	remaining: 1m 47s
16:	learn: 1.0254159	total: 1.61s	remaining: 1m 52s
17:	learn: 0.9991096	total: 1.72s	remaining: 1m 52s
18:	learn: 0.9765293	total: 1.84s	remaining: 1m 54s
19:	learn: 0.9559495	to

In [4]:
# Initialize the LGBMClassifier with the current parameters
catboost_model = CatBoostClassifier(**args)

# Train the model
catboost_model.fit(X_scaled, y)

0:	learn: 1.8131758	total: 130ms	remaining: 2m 36s
1:	learn: 1.6999917	total: 262ms	remaining: 2m 36s
2:	learn: 1.6139492	total: 385ms	remaining: 2m 33s
3:	learn: 1.5313332	total: 489ms	remaining: 2m 26s
4:	learn: 1.4670314	total: 572ms	remaining: 2m 16s
5:	learn: 1.4062242	total: 728ms	remaining: 2m 24s
6:	learn: 1.3556914	total: 841ms	remaining: 2m 23s
7:	learn: 1.3085402	total: 997ms	remaining: 2m 28s
8:	learn: 1.2655519	total: 1.1s	remaining: 2m 26s
9:	learn: 1.2267718	total: 1.21s	remaining: 2m 24s
10:	learn: 1.1898284	total: 1.33s	remaining: 2m 23s
11:	learn: 1.1550781	total: 1.42s	remaining: 2m 20s
12:	learn: 1.1278516	total: 1.52s	remaining: 2m 18s
13:	learn: 1.0938937	total: 1.58s	remaining: 2m 14s
14:	learn: 1.0685314	total: 1.66s	remaining: 2m 11s
15:	learn: 1.0397696	total: 1.8s	remaining: 2m 13s
16:	learn: 1.0143223	total: 1.91s	remaining: 2m 12s
17:	learn: 0.9917264	total: 2.02s	remaining: 2m 12s
18:	learn: 0.9694480	total: 2.14s	remaining: 2m 12s
19:	learn: 0.9514011	tot

<catboost.core.CatBoostClassifier at 0x7f91501d6820>

In [5]:
df_subs = pd.read_csv('test-full.csv')
df_subs = df_subs.drop(columns=sparse_columns)

# Apply the log transformation and handle negative values by shifting
for col in skewed_columns:
    # Shift values if necessary to make them positive before applying log
    min_value = df_subs[col].min()
    shift = 1 - min_value if min_value <= 0 else 0
    df_subs[f'log_{col}'] = np.log(df_subs[col] + shift + 1)

# Step 2: Drop original skewed columns
df_subs = df_subs.drop(columns=skewed_columns)
df_subs_scaled = df_subs.copy()
df_subs_scaled[numerical_columns] = scaler.transform(df_subs[numerical_columns])

In [6]:
submission_preds = pd.DataFrame(catboost_model.predict(df_subs_scaled), columns=['Cover_Type'])

In [7]:
df.to_csv('filename.csv', index=False)

In [8]:
submission_preds.to_csv('submission07-11-24.csv', index=True)

In [10]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('submission07-11-24.csv')

# Increment the 'Id' column by 1
df['Id'] = df['Id'] + 1

# Save the modified DataFrame back to CSV
df.to_csv('submission07-11-24_9.csv', index=False)
