# Decision tree, random forest, and extra-trees classifiers, with logistic regression as a baseline

Import the packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, root_mean_squared_error
from sklearn.model_selection import train_test_split, StratifiedKFold

Import the data and set up the features, cross-validation, and metrics

In [None]:
## Load the training data from disk.
df_train = pd.read_csv("train.csv")

## Columns used as model inputs.
features = [
    "user_edit_count",
    "user_warns",
    "num_recent_reversions",
    "num_edits_5d_before",
    "is_person",
]

## Cross-validation layout: stratified, shuffled folds with a fixed seed so
## that the splits are reproducible between runs.
num_splits = 5
num_models = 4
kfold = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=216)

## One RMSE per (model, split) pair. Swap in another metric as needed.
rmses = np.zeros(shape=(num_models, num_splits))

## Accuracy accumulator with one slot per model.
accs = np.zeros(shape=num_models)

Fit the models and record the metrics

In [None]:
## Start the accuracy accumulator from zero so that re-running this cell does
## not add onto the (already averaged) totals left over from a previous run.
accs = np.zeros(num_models)

## loop through the kfold here; enumerate supplies the split index `i` that is
## used to record the per-split metrics
for i, (train_index, test_index) in enumerate(
    kfold.split(df_train[features], df_train.isvandalism)
):
    ## cv training set
    df_tt = df_train.iloc[train_index]

    ## cv holdout set
    df_ho = df_train.iloc[test_index]

    ## Model 0: logistic regression without a penalty term (the baseline)
    log_reg = LogisticRegression(penalty=None, max_iter=500) # Note that sufficient iteration is needed
    log_reg.fit(df_tt[features], df_tt.isvandalism)
    log_pred = log_reg.predict(df_ho[features])

    rmses[0, i] = root_mean_squared_error(df_ho.isvandalism, log_pred)


    ## Model 1: single decision tree
    tree = DecisionTreeClassifier(
        #max_depth = 10, 
        min_samples_leaf = 5, # minimum number of samples in each leaf, to prevent overfitting
        random_state= 216
        )
    tree.fit(df_tt[features], df_tt.isvandalism)
    tree_pred = tree.predict(df_ho[features])

    rmses[1, i] = root_mean_squared_error(df_ho.isvandalism, tree_pred)


    ## Model 2: random forest
    rf = RandomForestClassifier(
        n_estimators = 500, # number of trees in ensemble
        #max_depth = 10, # max_depth of each tree
        min_samples_leaf = 5, 
        #max_features = 2, # default is round(sqrt(num_features))
        bootstrap= True, # sampling with replacement
        max_samples = 500, # number of training samples selected with replacement to build tree
        random_state = 216 # for consistency
        )
    
    rf.fit(df_tt[features], df_tt.isvandalism)
    rf_pred = rf.predict(df_ho[features])

    rmses[2, i] = root_mean_squared_error(df_ho.isvandalism, rf_pred)

    ## Model 3: extra trees, configured with the same settings as the forest
    et = ExtraTreesClassifier(
        n_estimators = 500, 
        #max_depth = 10, 
        min_samples_leaf = 5, 
        #max_features = 2, 
        bootstrap= True, 
        max_samples = 500, 
        random_state = 216 
        )
    
    et.fit(df_tt[features], df_tt.isvandalism)
    et_pred = et.predict(df_ho[features])

    rmses[3, i] = root_mean_squared_error(df_ho.isvandalism, et_pred)

    ## Holdout accuracy of each model on this split, in the same model order
    ## as the rows of `rmses`; summed here and averaged after the loop.
    acc = np.array([
        accuracy_score(df_ho.isvandalism, log_pred),
        accuracy_score(df_ho.isvandalism, tree_pred),
        accuracy_score(df_ho.isvandalism, rf_pred),
        accuracy_score(df_ho.isvandalism, et_pred),
    ])
    accs = accs + acc

    ## Random-forest feature importances for this split.
    score_df = pd.DataFrame({'feature':df_tt[features].columns,
                            'importance_score': rf.feature_importances_})

    ## sort_values returns a new DataFrame, so the result has to be assigned
    ## back for the printed table to actually be sorted.
    score_df = score_df.sort_values('importance_score',ascending=False)
    print(score_df)

# Calculate the average accuracy scores over the splits
accs = accs / num_splits

Print out (average) accuracy scores

In [None]:
## Tabulate the per-model average accuracy, one labelled row per model.
model_labels = ['log', 'tree', 'rf', 'et']
acc_table = pd.DataFrame(accs, index=model_labels, columns=['avg_accuracy'])
print(acc_table)

Print out the RMSEs (change to other metrics if needed)

In [None]:
## Report the mean and standard deviation of the per-split RMSEs for each model.
## The values come from root_mean_squared_error, so every line is labelled RMSE
## (three of them previously said MSE). Rows of `rmses` are indexed by model,
## in the order the names are listed here.
model_names = ["Logistic Regression", "Decision Tree", "Random Forest", "Extra Tree"]
for model_name, split_rmses in zip(model_names, rmses):
    print(f"{model_name} Avg. CV RMSE: {np.mean(split_rmses)} and STD: {np.std(split_rmses)}")