In [22]:
# Importing Basic Libraries
import pandas as pd 
import numpy as np 

class NodeRegression:
    """
    One node of a regression decision tree that can grow its own subtree.

    A node stores the observations that reached it and predicts the mean of
    their targets (``ymean``). After ``grow_tree`` is called it may also hold
    a split (``best_feature`` / ``best_value``) and ``left`` / ``right``
    children. Splits are chosen greedily: the candidate whose two children
    have the lowest pooled mean squared error wins, provided it beats the
    MSE of the node itself.
    """
    def __init__(
        self, 
        Y: list,
        X: pd.DataFrame,
        min_samples_split=None,
        max_depth=None,
        depth=None,
        node_type=None,
        rule=None
    ):
        """
        Parameters
        ----------
        Y : target values of the observations in this node (list-like).
        X : feature matrix of the same observations; its columns are the
            candidate split features.
        min_samples_split : minimum number of observations a node needs to be
            considered for splitting (default 20).
        max_depth : depth at which growing stops (default 5).
        depth : depth of this node (default 0).
        node_type : label of the node; anything other than 'root' makes
            ``print_info`` print the split rule (default 'root').
        rule : human-readable rule that led to this node (default "").
        """
        # Store the target variable (Y) and features (X)
        self.Y = Y
        self.X = X

        # Hyperparameters. The truthiness test means that any falsy value
        # (None, but also 0) falls back to the default.
        self.min_samples_split = min_samples_split if min_samples_split else 20
        self.max_depth = max_depth if max_depth else 5

        # Current depth of the node
        self.depth = depth if depth else 0

        # Candidate split features
        self.features = list(self.X.columns)

        # Node label and the rule that produced it
        self.node_type = node_type if node_type else 'root'
        self.rule = rule if rule else ""

        # Prediction of the node: the mean of its targets
        self.ymean = np.mean(Y)

        # Residuals (Y - mean(Y))
        self.residuals = self.Y - self.ymean

        # Mean squared error of predicting ymean for every observation
        self.mse = self.get_mse(Y, self.ymean)

        # Number of observations in the node
        self.n = len(Y)

        # Children are created by grow_tree when a split is found
        self.left = None
        self.right = None

        # Split chosen for this node (None while the node is a leaf)
        self.best_feature = None
        self.best_value = None

    def get_mse(self, y_true, y_pred) -> float:
        """
        Return the mean squared error between y_true and y_pred.
        """
        return np.mean((y_true - y_pred) ** 2)

    def ma(self, x: np.array, window: int) -> np.array:
        """
        Return the moving average of x over `window` consecutive elements.

        An input shorter than the window has no complete window, so an empty
        array is returned. Without this guard np.convolve swaps its operands
        when x is the shorter one and returns meaningless values (or raises
        on empty input).
        """
        x = np.asarray(x)
        if x.size < window:
            return np.array([], dtype=float)
        return np.convolve(x, np.ones(window), 'valid') / window

    def best_split(self) -> tuple:
        """
        Find the (feature, threshold) pair that minimises the pooled MSE of
        the two resulting children.

        Candidate thresholds are the midpoints between consecutive unique
        values of each feature. Rows containing missing values are ignored
        during the search. Observations with ``feature <= threshold`` go
        left and the rest go right, the same rule grow_tree uses to
        partition the data.

        Returns
        -------
        (best_feature, best_value), or (None, None) when no candidate split
        has a lower MSE than the node itself.
        """
        # Dataset used for the search; dropping missing rows does not depend
        # on the feature, so it is done once up front.
        df = self.X.copy()
        df['Y'] = self.Y
        df = df.dropna()

        # A split must beat the MSE of the unsplit node
        mse_base = self.mse

        best_feature = None
        best_value = None

        for feature in self.features:
            # Sort so that unique() yields the feature values in order
            Xdf = df.sort_values(feature)

            # Midpoints between consecutive unique values (empty when the
            # feature has fewer than two distinct values)
            xmeans = self.ma(Xdf[feature].unique(), 2)

            for value in xmeans:
                goes_left = Xdf[feature] <= value
                left_y = Xdf.loc[goes_left, 'Y'].values
                right_y = Xdf.loc[~goes_left, 'Y'].values

                # A threshold that leaves one side empty is not a split
                if left_y.size == 0 or right_y.size == 0:
                    continue

                # Residuals of each side around its own mean
                res_left = left_y - np.mean(left_y)
                res_right = right_y - np.mean(right_y)
                r = np.concatenate((res_left, res_right), axis=None)

                # Pooled MSE of the two children
                mse_split = np.sum(r ** 2) / len(r)

                # Keep the candidate if it is the best so far
                if mse_split < mse_base:
                    best_feature = feature
                    best_value = value
                    mse_base = mse_split

        return (best_feature, best_value)

    def grow_tree(self):
        """
        Recursively grow the subtree below this node.

        The node is split only while its depth is below max_depth, it holds
        at least min_samples_split observations, and best_split finds a
        split that lowers the MSE. The left child is created and fully
        grown before the right one.
        """
        # Stopping criteria
        if self.depth >= self.max_depth or self.n < self.min_samples_split:
            return

        best_feature, best_value = self.best_split()
        if best_feature is None:
            return

        # Save the best split to the current node
        self.best_feature = best_feature
        self.best_value = best_value

        # Partition the observations of this node
        df = self.X.copy()
        df['Y'] = self.Y
        left_df = df[df[best_feature] <= best_value].copy()
        right_df = df[df[best_feature] > best_value].copy()

        # Left child
        self.left = NodeRegression(
            left_df['Y'].values.tolist(),
            left_df[self.features],
            depth=self.depth + 1,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            node_type='left_node',
            rule=f"{best_feature} <= {round(best_value, 3)}"
        )
        self.left.grow_tree()

        # Right child
        self.right = NodeRegression(
            right_df['Y'].values.tolist(),
            right_df[self.features],
            depth=self.depth + 1,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            node_type='right_node',
            rule=f"{best_feature} > {round(best_value, 3)}"
        )
        self.right.grow_tree()

    def print_info(self, width=4):
        """
        Print the rule, MSE, observation count and prediction of this node,
        indented according to its depth.
        """
        # Indentation grows with the depth of the node
        const = int(self.depth * width ** 1.5)
        spaces = "-" * const
        
        if self.node_type == 'root':
            print("Root")
        else:
            print(f"|{spaces} Split rule: {self.rule}")
        print(f"{' ' * const}   | MSE of the node: {round(self.mse, 2)}")
        print(f"{' ' * const}   | Count of observations in node: {self.n}")
        print(f"{' ' * const}   | Prediction of node: {round(self.ymean, 3)}")   

    def print_tree(self):
        """
        Print this node and then, depth-first, its left and right subtrees.
        """
        self.print_info() 
        
        if self.left is not None: 
            self.left.print_tree()
        
        if self.right is not None:
            self.right.print_tree()


In [23]:
# Randomly shuffle the samples in X and y
def shuffle_data(X, y, seed=None):
    """
    Randomly shuffle the samples of X and y with the same permutation.

    Parameters
    ----------
    X : array-like or pandas DataFrame/Series; samples along the first axis.
    y : array-like or pandas DataFrame/Series with one entry per sample.
    seed : optional seed passed to ``np.random.seed`` for reproducibility.
        Any value other than None is used, including 0.

    Returns
    -------
    (X_shuffled, y_shuffled) with rows reordered identically. pandas objects
    are reordered positionally and keep their original index labels.
    """
    # `is not None` rather than truthiness, so that seed=0 is honoured
    if seed is not None:
        np.random.seed(seed)

    # Random permutation of the row positions
    idx = np.arange(len(X))
    np.random.shuffle(idx)

    def _take(obj):
        # pandas objects must be indexed positionally: obj[idx] would look
        # the integers up as column/index labels instead of row positions.
        if hasattr(obj, "iloc"):
            return obj.iloc[idx]
        return np.asarray(obj)[idx]

    # X and y are handled independently, so mixed numpy/pandas inputs work
    return _take(X), _take(y)

# Split the data into training and testing sets
def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None):
    """
    Split X and y into train and test sets.

    Parameters
    ----------
    X, y : array-like or pandas objects with one row/entry per sample.
    test_size : fraction of the samples (between 0 and 1) placed in the test
        set; the number of test samples is rounded down.
    shuffle : when True the samples are shuffled with ``shuffle_data`` first.
    seed : seed forwarded to ``shuffle_data``.

    Returns
    -------
    X_train, X_test, y_train, y_test — the test set is taken from the end of
    the (optionally shuffled) data.

    Raises
    ------
    ValueError if test_size is outside [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    if shuffle:
        # Shuffle the data using the shuffle_data function
        X, y = shuffle_data(X, y, seed)

    # Number of test samples = floor(n * test_size). Rounding to 8 decimals
    # first removes floating-point noise (e.g. 0.57 * 100 == 56.99999...)
    # before truncating, and avoids dividing by test_size (which fails for 0).
    n_samples = len(y)
    n_test = int(round(n_samples * test_size, 8))
    split_i = n_samples - n_test

    def _cut(obj):
        # Slice pandas objects positionally via .iloc; arrays are sliced directly
        rows = obj.iloc if hasattr(obj, "iloc") else obj
        return rows[:split_i], rows[split_i:]

    X_train, X_test = _cut(X)
    y_train, y_test = _cut(y)

    return X_train, X_test, y_train, y_test


In [24]:
# Specify the path
data_path = "./EPL_Soccer_MLR_LR.csv"

# Read the CSV file and remove rows in which every value is missing (NaN).
# The result is assigned rather than mutated in place, and `thresh` is not
# passed: newer pandas versions reject `how` and `thresh` given together.
df = pd.read_csv(data_path).dropna(axis=0, how='all')

# Select only the numeric columns (dropping categorical columns)
new_df = df.select_dtypes(['number'])


In [25]:
# Keep only the numeric (integer or float) columns of df
new_df = df.select_dtypes(include=['number'])


In [26]:
# Independent features: every numeric column except the last one.
# Dependent variable: the last numeric column (Score).
X, y = new_df.iloc[:, :-1], new_df.iloc[:, -1]


In [27]:
# Hold out 20% of the rows as the test set; the seed makes the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    seed=42,
)

In [28]:
# Pairwise correlations between the feature columns
correlation_matrix = X.corr()

# Empty set that will collect the names of highly correlated columns
correlated_features = set()

In [29]:
# Finding correlated columns

# Walk the lower triangle of the correlation matrix (every pair j < i once)
# and collect column i whenever its absolute correlation with an earlier
# column j is greater than 0.8.
corr_columns = correlation_matrix.columns
correlated_features = {
    corr_columns[i]
    for i in range(len(corr_columns))
    for j in range(i)
    if abs(correlation_matrix.iloc[i, j]) > 0.8
}

# Display Correlated features
correlated_features

{'Height', 'MinutestoGoalRatio', 'ShotsPerGame', 'Weight'}

In [None]:
# Dropping correlated features

# The frames are reassigned instead of being mutated with inplace=True:
# X, X_train and X_test are slices of other frames, and in-place changes on
# slices can trigger pandas' SettingWithCopyWarning. errors="ignore" lets the
# cell be re-run after the columns are already gone.
columns_to_drop = sorted(correlated_features)

# training data (X_train)
X_train = X_train.drop(columns=columns_to_drop, errors="ignore")

# test data (X_test)
X_test = X_test.drop(columns=columns_to_drop, errors="ignore")

# Original feature matrix (X)
X = X.drop(columns=columns_to_drop, errors="ignore")


In [31]:
# Root node holding the whole training set; the tree may grow to depth 2 and
# a node needs at least 3 observations to be split
root = NodeRegression(
    Y=y_train,
    X=X_train,
    min_samples_split=3,
    max_depth=2,
)

In [32]:
# Display the root node. NodeRegression defines no __repr__, so this shows
# Python's default object representation.
root

<__main__.NodeRegression at 0x1ea0c2d9ee0>

In [33]:
# Grow the tree recursively, starting from the root. Splitting stops when a
# node reaches max_depth, has fewer than min_samples_split observations, or
# no split lowers its MSE. The nodes are created in place; nothing is returned.
root.grow_tree()

In [34]:
# Print the tree depth-first (node, then left subtree, then right subtree):
# split rule, MSE, observation count and prediction of every node
root.print_tree()

Root
   | MSE of the node: 35.87
   | Count of observations in node: 162
   | Prediction of node: 13.587
|-------- Split rule: Cost <= 68.05
           | MSE of the node: 5.33
           | Count of observations in node: 93
           | Prediction of node: 9.187
|---------------- Split rule: Cost <= 44.65
                   | MSE of the node: 1.8
                   | Count of observations in node: 44
                   | Prediction of node: 7.431
|---------------- Split rule: Cost > 44.65
                   | MSE of the node: 3.24
                   | Count of observations in node: 49
                   | Prediction of node: 10.764
|-------- Split rule: Cost > 68.05
           | MSE of the node: 15.78
           | Count of observations in node: 69
           | Prediction of node: 19.516
|---------------- Split rule: Cost <= 109.3
                   | MSE of the node: 6.13
                   | Count of observations in node: 50
                   | Prediction of node: 17.8
|--------------

<hr>

In [35]:
# Load the dataset from the specified path and drop rows in which every
# value is null
data_path = "./EPL_Soccer_MLR_LR.csv"
df = pd.read_csv(data_path).dropna(axis=0, how='all')

# Select only numerical columns as the categorical ones are not needed
new_df = df.select_dtypes(['number'])

# Split the dataset into independent variables (X) and the dependent variable (y)
X = new_df.iloc[:, :-1]
y = new_df.iloc[:, -1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, seed=42)

# Find highly correlated features: for every pair of columns (j < i) with an
# absolute correlation greater than 0.8, mark column i for removal
correlation_matrix = X.corr()
corr_columns = correlation_matrix.columns
correlated_features = {
    corr_columns[i]
    for i in range(len(corr_columns))
    for j in range(i)
    if abs(correlation_matrix.iloc[i, j]) > 0.8
}

# Remove correlated features from the training, testing and full data.
# The frames are reassigned rather than mutated with inplace=True because
# they are slices of other frames, and in-place changes on slices can
# trigger pandas' SettingWithCopyWarning.
columns_to_drop = sorted(correlated_features)
X_train = X_train.drop(columns=columns_to_drop)
X_test = X_test.drop(columns=columns_to_drop)
X = X.drop(columns=columns_to_drop)


In [36]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Baseline: scikit-learn's regression tree with a fixed random_state
regressor = DecisionTreeRegressor(random_state=0)

# 5-fold cross-validation on the full feature matrix; the array of per-fold
# scores is the cell's output
cross_val_score(estimator=regressor, X=X, y=y, cv=5)


array([0.46196735, 0.74246175, 0.87827764, 0.73471287, 0.82410885])

In [16]:
# Compute the per-fold scores instead of pasting them from an earlier output,
# so the reported mean always matches the current data and model.
# For a regressor, cross_val_score uses the estimator's default score, which
# is R^2 (coefficient of determination), not a classification accuracy.
acc = cross_val_score(regressor, X, y, cv=5)

print("Mean cross-validated R^2 of model : ", np.mean(acc))

Accuracy of model :  0.728305692


<hr>