In [51]:
# libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [12]:
import pandas as pd

data = pd.read_csv('modeling_data.csv')

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Define features and target
X = data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'cluster']]
y = data['BestTreeSpecies_encoded']

# Convert to numeric for modeling purposes (if necessary)
X = X.apply(pd.to_numeric, errors='coerce').to_numpy()
y = y.to_numpy()

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initialize StratifiedKFold with desired number of splits
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize lists to store accuracies for each fold
accuracy_gini_scores = []
accuracy_ent_scores = []
accuracy_nn1_scores = []

# Perform Stratified K-Fold Cross-Validation
for train_index, val_index in skf.split(X_train_val, y_train_val):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Initialize and train the decision tree (gini)
    clf_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
    clf_gini.fit(X_train, y_train)
    y_pred_gini = clf_gini.predict(X_val)
    accuracy_gini = accuracy_score(y_val, y_pred_gini)
    accuracy_gini_scores.append(accuracy_gini)

    # Initialize and train the decision tree (entropy)
    clf_ent = DecisionTreeClassifier(criterion='entropy', random_state=42)
    clf_ent.fit(X_train, y_train)
    y_pred_ent = clf_ent.predict(X_val)
    accuracy_ent = accuracy_score(y_val, y_pred_ent)
    accuracy_ent_scores.append(accuracy_ent)

    nn1 = Pipeline([
        ('scaler', StandardScaler()),  # Step 1: Standardize features
        ('mlp', MLPClassifier(hidden_layer_sizes=(128, 64, 32, 16, 8), activation='relu', solver='adam', max_iter=500, random_state=42))  # Step 2: MLP Classifier
    ])
    nn1.fit(X_train, y_train)
    y_pred_nn1 = nn1.predict(X_val)
    accuracy_nn1 = accuracy_score(y_val, y_pred_nn1)
    accuracy_nn1_scores.append(accuracy_nn1)

# Calculate and print average accuracy for each criterion
print(f"Average Decision Tree (gini) Accuracy: {np.mean(accuracy_gini_scores) * 100:.2f}%")
print(f"Average Decision Tree (entropy) Accuracy: {np.mean(accuracy_ent_scores) * 100:.2f}%")
print(f"Average NN1 Accuracy: {np.mean(accuracy_nn1_scores) * 100:.2f}%")



Average Decision Tree (gini) Accuracy: 81.42%
Average Decision Tree (entropy) Accuracy: 83.03%
Average NN1 Accuracy: 73.93%


# NN

In [63]:
data['Year.Built'] = data['Year.Built'].replace('.', np.nan)
data['Year.Built'] = data['Year.Built'].replace('1995-2001', np.nan)
data = data.dropna(subset=['Year.Built'])
data['Year.Built'] = data['Year.Built'].astype(int)

features = data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'cluster']]
target = data['BestTreeSpecies_encoded']

In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Creating the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Standardize features
    ('mlp', MLPClassifier(hidden_layer_sizes=(128, 64, 32, 16, 8), activation='relu', solver='adam', max_iter=500, random_state=42))  # Step 2: MLP Classifier
])

# Performing cross-validation
cv_scores = cross_val_score(pipeline, features, target, cv=5, scoring='accuracy')  # Use 5-fold cross-validation

# Output the cross-validation scores
print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Mean Cross-Validation Accuracy: {cv_scores.mean()}')
print(f'Standard Deviation of Cross-Validation Accuracy: {cv_scores.std()}')



Cross-Validation Accuracy Scores: [0.66937669 0.65582656 0.6875     0.7173913  0.68478261]
Mean Cross-Validation Accuracy: 0.6829754330151998
Standard Deviation of Cross-Validation Accuracy: 0.02064062450235686
