In [2]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ShuffleSplit
from deep_forest import MGCForest
import pandas as pd
import numpy as np
from time import time

def preprocess_features(X):
	"""
	Convert a raw feature table into an all-numeric/dummy-encoded table.

	Each column of ``X`` is handled independently, in its original order:

	* Text columns (``object`` or string dtype) have the literal values
	  ``'yes'`` / ``'no'`` mapped to ``1`` / ``0``; any other value is kept.
	* Columns that are still text after that step are expanded with
	  ``pd.get_dummies`` using the column name as prefix
	  (example: ``'school'`` => ``'school_GP'`` and ``'school_MS'``).
	* Every other column is passed through untouched.

	Parameters
	----------
	X : pandas.DataFrame
		Feature columns to encode.

	Returns
	-------
	pandas.DataFrame
		New frame sharing ``X``'s index. ``X`` itself is not modified.
	"""
	yes_no = {'yes': 1, 'no': 0}

	def is_text(series):
		# ``object`` is the classic container for strings; newer pandas may
		# infer a dedicated string dtype instead, so accept both.
		return series.dtype == object or pd.api.types.is_string_dtype(series)

	def encode(value):
		# Only strings are looked up, so NaN and unhashable objects pass through.
		return yes_no.get(value, value) if isinstance(value, str) else value

	pieces = []
	# ``DataFrame.iteritems`` was removed in pandas 2.0; ``items`` is the
	# long-standing equivalent.
	for col, col_data in X.items():
		if is_text(col_data):
			# ``map`` re-infers the dtype from the results, so a pure yes/no
			# column becomes numeric without relying on ``replace``'s
			# deprecated silent downcasting.
			col_data = col_data.map(encode)
		if is_text(col_data):
			# Still textual => genuinely categorical: one-hot encode it.
			col_data = pd.get_dummies(col_data, prefix=col)
		pieces.append(col_data)

	if not pieces:
		# ``pd.concat`` rejects an empty list; keep returning an empty frame
		# on the caller's index, as the join-based version did.
		return pd.DataFrame(index=X.index)
	# One concat instead of a join per column: every piece already shares X's index.
	return pd.concat(pieces, axis=1)

if __name__ == '__main__':

	# Load the raw table. The last column is the label; all preceding
	# columns are features.
	student_data = pd.read_csv("student-data.csv")
	target_col = student_data.columns[-1]
	feature_cols = list(student_data.columns[:-1])
	y_all = student_data[target_col]
	X_all = preprocess_features(student_data[feature_cols])

	# A single seeded 80/20 shuffle split; n_splits=1 means the splitter
	# yields exactly one (train, test) index pair.
	rs = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
	train_index, test_index = next(rs.split(X_all))

	# The forest is fed plain numpy arrays rather than pandas objects.
	X_train = np.array(X_all.iloc[train_index])
	X_test = np.array(X_all.iloc[test_index])
	y_train = np.array(y_all.iloc[train_index])
	y_test = np.array(y_all.iloc[test_index])

	forest_classes = (ExtraTreesClassifier, RandomForestClassifier)

	# 'mgs' stage: one small forest of each kind, identical hyper-parameters.
	mgs_estimators = [
		{
			'estimator_class': forest_class,
			'estimator_params': {
				'n_estimators': 3,
				'min_samples_split': 11,
				'n_jobs': -1,
			},
		}
		for forest_class in forest_classes
	]

	# 'cascade' stage: each forest kind appears twice, once per max_features
	# setting, in the order ET/1, ET/'sqrt', RF/1, RF/'sqrt'.
	cascade_estimators = [
		{
			'estimator_class': forest_class,
			'estimator_params': {
				'n_estimators': 100,
				'min_samples_split': 6,
				'max_features': max_features,
				'n_jobs': -1,
			},
		}
		for forest_class in forest_classes
		for max_features in (1, 'sqrt')
	]

	mgc_forest = MGCForest(
		estimators_config={
			'mgs': mgs_estimators,
			'cascade': cascade_estimators,
		},
		stride_ratios=[1.0/4, 1.0/8],
	)

	# Wall-clock duration of the fit call only.
	start = time()
	mgc_forest.fit(X_train, y_train)
	end = time()
	print('fit time is: ', end - start)

ImportError: No module named 'deep_forest'

In [3]:
# Score the fitted forest on the held-out split.
y_pred = mgc_forest.predict(X_test)
print('Prediction shape:', y_pred.shape)

# Compute both metrics first (accuracy, then weighted F1) and report them
# together on a single output line.
accuracy = accuracy_score(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print('Accuracy:', accuracy, 'F1 score:', weighted_f1)

NameError: name 'mgc_forest' is not defined