In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split

cols_to_drop = [
    'system:index', 
    'BUFF_DIST',
    'Carbonate',
    'Cliff',
    'ComID',
    '.geo',
    'Connection',
    'FCode',
    'FDate',
    'FType',
    'GNIS_ID',
    'GNIS_Name',
    'In_NWI',
    'Join_Count',
    'OBJECTID',
    'ORIG_FID',
    'Permanent_',
    'ReachCode',
    'Resolution',
    'TARGET_FID',
    'Shape_Area',
#     'Hectares',
    'AreaSqKm',
    'Alkaline intrusive',
    'Silicic residual',
    'Saline lake sediment',
    'Non-carbonate',
    'Glacial till coarse',
    'Extrusive volcanic',
    'Eolian sediment fine',
    'Mountain/divide',
    'Coastal sediment coarse',
    'lagoslakei',
    'Peak/ridge (cool)',
    'Peak/ridge (warm)',
    'Peak/ridge',
    'Alluvium and coastal sediment fine',
    'Eolian sediment coarse',
    'Upper slope (cool)',
    'Lower slope (cool)',
    'Water',
    'Hydric',
    'Elevation',
    'Glacial till clay',
    'Colluvial sediment',
    'Glacial outwash coarse',
#     'physiography',
]

data = pd.read_csv('lakes_training_mode.csv').drop(cols_to_drop, 1)
labels = data.iloc[:,-1]
data = data.drop('res', 1)

train_feas, test_feas, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.3, shuffle=True, random_state=0)

print(train_feas.shape)
print(test_feas.shape)

(942, 16)
(404, 16)


In [134]:
## normalize features
# from sklearn.preprocessing import StandardScaler
# scale = StandardScaler()
# scale.fit(train_feas)

# train_scaled_feas = train_feas.copy()
# train_scaled_feas[train_scaled_feas.columns] = scale.transform(train_feas)

# test_scaled_feas = test_feas.copy()
# test_scaled_feas[test_scaled_feas.columns] = scale.transform(test_feas)

# train_feas = train_scaled_feas
# test_feas = test_scaled_feas

In [2]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Build a forest and compute the feature importances
forest = RandomForestClassifier(n_estimators=600, max_features=0.1)

forest.fit(train_feas, train_labels)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(train_feas.shape[1]):
    print("%d | feature %d | %s | %f" % (f + 1, indices[f], list(train_feas.columns.values)[indices[f]], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure(figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(train_feas.shape[1]), importances[indices],
       color="deepskyblue", yerr=std[indices], align="center")
plt.xticks(range(train_feas.shape[1]), indices)
plt.xlim([-1, train_feas.shape[1]])
plt.show()


  from numpy.core.umath_tests import inner1d


Feature ranking:
1 | feature 6 | Shape_Leng | 0.166233
2 | feature 2 | Hectares | 0.103353
3 | feature 10 | Valley | 0.078437
4 | feature 11 | Valley (narrow) | 0.068435
5 | feature 14 | ned | 0.068125
6 | feature 13 | mtpi | 0.068059
7 | feature 4 | Lower slope (flat) | 0.062604
8 | feature 12 | chili | 0.061543
9 | feature 7 | Upper slope | 0.059050
10 | feature 3 | Lower slope | 0.058670
11 | feature 15 | physiography | 0.043803
12 | feature 5 | Lower slope (warm) | 0.041688
13 | feature 8 | Upper slope (flat) | 0.036670
14 | feature 9 | Upper slope (warm) | 0.035355
15 | feature 1 | Glacial till loam | 0.024403
16 | feature 0 | Glacial lake sediment fine | 0.023573


<Figure size 800x600 with 1 Axes>

In [3]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=30)
cv = cross_val_score(model, train_feas, train_labels, cv=10, scoring='accuracy') 
predictions = model.fit(train_feas, train_labels).predict(test_feas)
acc = accuracy_score(predictions, test_labels)
auc = roc_auc_score(predictions, test_labels)
ap = average_precision_score(predictions, test_labels)
f1 = f1_score(predictions, test_labels)
print('KNN')
print("Accuracy = %.3f, AUC = %.3f, AP = %.3f, F1 = %.3f, 10-Fold CV: %0.3f (+/- %0.3f)" % (acc, auc, ap, f1, cv.mean(), cv.std() * 2))


KNN
Accuracy = 0.755, AUC = 0.758, AP = 0.650, F1 = 0.744, 10-Fold CV: 0.741 (+/- 0.074)


In [4]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier()
cv = cross_val_score(model, train_feas, train_labels, cv=10, scoring='accuracy') 
predictions = model.fit(train_feas, train_labels).predict(test_feas)
acc = accuracy_score(predictions, test_labels)
auc = roc_auc_score(predictions, test_labels)
ap = average_precision_score(predictions, test_labels)
f1 = f1_score(predictions, test_labels)
print('Neural Network')
print("Accuracy = %.3f, AUC = %.3f, AP = %.3f, F1 = %.3f, 10-Fold CV: %0.3f (+/- %0.3f)" % (acc, auc, ap, f1, cv.mean(), cv.std() * 2))


Neural Network
Accuracy = 0.587, AUC = 0.681, AP = 0.916, F1 = 0.700, 10-Fold CV: 0.645 (+/- 0.284)


In [5]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=0.0001)
cv = cross_val_score(model, train_feas, train_labels, cv=10, scoring='accuracy') 
predictions = model.fit(train_feas, train_labels).predict(test_feas)
acc = accuracy_score(predictions, test_labels)
auc = roc_auc_score(predictions, test_labels)
ap = average_precision_score(predictions, test_labels)
f1 = f1_score(predictions, test_labels)
print('Logistic Regression')
print("Accuracy = %.3f, AUC = %.3f, AP = %.3f, F1 = %.3f, 10-Fold CV: %0.3f (+/- %0.3f)" % (acc, auc, ap, f1, cv.mean(), cv.std() * 2))


Logistic Regression
Accuracy = 0.770, AUC = 0.780, AP = 0.635, F1 = 0.749, 10-Fold CV: 0.769 (+/- 0.064)


In [11]:
from sklearn.svm import SVC

model = SVC(kernel='rbf', C=1, gamma=0.0001)
cv = cross_val_score(model, train_feas, train_labels, cv=10, scoring='accuracy') 
predictions = model.fit(train_feas, train_labels).predict(test_feas)
acc = accuracy_score(predictions, test_labels)
# auc = roc_auc_score(predictions, test_labels)
ap = average_precision_score(predictions, test_labels)
f1 = f1_score(predictions, test_labels)
print('SVC with rbf')
print("Accuracy = %.3f, AP = %.3f, F1 = %.3f, 10-Fold CV: %0.3f (+/- %0.3f)" % (acc, ap, f1, cv.mean(), cv.std() * 2))


SVC with rbf
Accuracy = 0.693, AP = 0.700, F1 = 0.716, 10-Fold CV: 0.687 (+/- 0.089)


In [None]:
from sklearn.svm import SVC

model = SVC(kernel='linear')
cv = cross_val_score(model, train_feas, train_labels, cv=10, scoring='accuracy') 
predictions = model.fit(train_feas, train_labels).predict(test_feas)
acc = accuracy_score(predictions, test_labels)
auc = roc_auc_score(predictions, test_labels)
ap = average_precision_score(predictions, test_labels)
f1 = f1_score(predictions, test_labels)
print('SVC with linear')
print("Accuracy = %.3f, AUC = %.3f, AP = %.3f, F1 = %.3f, 10-Fold CV: %0.3f (+/- %0.3f)" % (acc, auc, ap, f1, cv.mean(), cv.std() * 2))


In [None]:
from sklearn.svm import SVC

model = SVC(kernel='poly')
cv = cross_val_score(model, train_feas, train_labels, cv=10, scoring='accuracy') 
predictions = model.fit(train_feas, train_labels).predict(test_feas)
acc = accuracy_score(predictions, test_labels)
auc = roc_auc_score(predictions, test_labels)
ap = average_precision_score(predictions, test_labels)
f1 = f1_score(predictions, test_labels)
print('SVC with poly')
print("Accuracy = %.3f, AUC = %.3f, AP = %.3f, F1 = %.3f, 10-Fold CV: %0.3f (+/- %0.3f)" % (acc, auc, ap, f1, cv.mean(), cv.std() * 2))


In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(criterion='entropy', max_features=None)
cv = cross_val_score(model, train_feas, train_labels, cv=10, scoring='accuracy') 
predictions = model.fit(train_feas, train_labels).predict(test_feas)
acc = accuracy_score(predictions, test_labels)
auc = roc_auc_score(predictions, test_labels)
ap = average_precision_score(predictions, test_labels)
f1 = f1_score(predictions, test_labels)
print('Decision Tree')
print("Accuracy = %.3f, AUC = %.3f, AP = %.3f, F1 = %.3f, 10-Fold CV: %0.3f (+/- %0.3f)" % (acc, auc, ap, f1, cv.mean(), cv.std() * 2))


Decision Tree
Accuracy = 0.797, AUC = 0.797, AP = 0.746, F1 = 0.801, 10-Fold CV: 0.803 (+/- 0.088)


In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

model = RandomForestClassifier(n_estimators=200)
cv = cross_val_score(model, train_feas, train_labels, cv=10, scoring='accuracy') 
predictions = model.fit(train_feas, train_labels).predict(test_feas)
acc = accuracy_score(predictions, test_labels)
auc = roc_auc_score(predictions, test_labels)
ap = average_precision_score(predictions, test_labels)
f1 = f1_score(predictions, test_labels)
print('Random Forest')
print("Accuracy = %.3f, AUC = %.3f, AP = %.3f, F1 = %.3f, 10-Fold CV: %0.3f (+/- %0.3f)" % (acc, auc, ap, f1, cv.mean(), cv.std() * 2))


Random Forest
Accuracy = 0.844, AUC = 0.844, AP = 0.785, F1 = 0.844, 10-Fold CV: 0.819 (+/- 0.074)


In [8]:
model = GradientBoostingClassifier(n_estimators=100)
cv = cross_val_score(model, train_feas, train_labels, cv=10, scoring='accuracy') 
predictions = model.fit(train_feas, train_labels).predict(test_feas)
acc = accuracy_score(predictions, test_labels)
auc = roc_auc_score(predictions, test_labels)
ap = average_precision_score(predictions, test_labels)
f1 = f1_score(predictions, test_labels)
print('Gradient Boosting')
print("Accuracy = %.3f, AUC = %.3f, AP = %.3f, F1 = %.3f, 10-Fold CV: %0.3f (+/- %0.3f)" % (acc, auc, ap, f1, cv.mean(), cv.std() * 2))


Gradient Boosting
Accuracy = 0.847, AUC = 0.847, AP = 0.782, F1 = 0.846, 10-Fold CV: 0.862 (+/- 0.060)


In [9]:
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(n_estimators=200)
cv = cross_val_score(model, train_feas, train_labels, cv=10, scoring='accuracy') 
predictions = model.fit(train_feas, train_labels).predict(test_feas)
acc = accuracy_score(predictions, test_labels)
auc = roc_auc_score(predictions, test_labels)
ap = average_precision_score(predictions, test_labels)
f1 = f1_score(predictions, test_labels)
print('Gradient Boosting')
print("Accuracy = %.3f, AUC = %.3f, AP = %.3f, F1 = %.3f, 10-Fold CV: %0.3f (+/- %0.3f)" % (acc, auc, ap, f1, cv.mean(), cv.std() * 2))


Gradient Boosting
Accuracy = 0.837, AUC = 0.837, AP = 0.782, F1 = 0.838, 10-Fold CV: 0.843 (+/- 0.078)


In [12]:
from sklearn.ensemble import VotingClassifier

knn = KNeighborsClassifier(n_neighbors=30)
mlp = MLPClassifier()
log = LogisticRegression(C=0.0001)
rbf = SVC(kernel='rbf', C=1, gamma=0.0001, probability=True)
tree = DecisionTreeClassifier(criterion='entropy', max_features=None)
rf = RandomForestClassifier(n_estimators=200)
gb = GradientBoostingClassifier(n_estimators=100)
ada = AdaBoostClassifier(n_estimators=200)

model = VotingClassifier(estimators=[('knn', knn), ('mlp', mlp), ('tree', tree), ('rf', rf),
                                    ('log', log), ('rbf', rbf), ('gb', gb), ('ada', ada)],
                        voting='soft',
                        weights=[2, 2, 3, 4, 2, 1, 5, 4])
# cv = cross_val_score(model, train_feas, train_labels, cv=10, scoring='accuracy')
model.fit(train_feas, train_labels)
predictions = model.predict(test_feas)
acc = accuracy_score(predictions, test_labels)
auc = roc_auc_score(predictions, test_labels)
ap = average_precision_score(predictions, test_labels)
f1 = f1_score(predictions, test_labels)
print('Accuracy = %.3f, AUC = %.3f, AP = %.3f, F1 = %.3f'%(acc, auc, ap, f1))
# print('Accuracy = %.3f, AUC = %.3f, AP = %.3f, F1 = %.3f, 10-Fold CV: %0.3f (+/- %0.3f)'%(acc, auc, ap, f1, cv.mean(), cv.std() * 2))


Accuracy = 0.842, AUC = 0.843, AP = 0.769, F1 = 0.839


  if diff:
