In [None]:
import psycopg2
from dotenv import load_dotenv
import numpy as np
import os

# Data Handling

In [None]:
def sql_connection():
    """
    Opens a connection to the PostgreSQL server configured in the .env file.

    The connection settings are read from the environment variables
    DB_NAME, DB_USER, DB_PASS, DB_HOST and DB_PORT after load_dotenv() runs.

    Returns a (connection, cursor) tuple on success. If anything goes wrong
    the error is printed and None is returned instead of a tuple.
    """
    load_dotenv()

    # psycopg2.connect keyword -> value taken from the environment
    settings = {
        'dbname': os.getenv('DB_NAME'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASS'),
        'host': os.getenv('DB_HOST'),
        'port': os.getenv('DB_PORT'),
    }

    try:
        connection = psycopg2.connect(**settings)
        print("Connected to the database.")
        return (connection, connection.cursor())
    except Exception as err:
        print("Unable to connect to the database.")
        print(err)
        return None

In [None]:
def process_stat_list(stat_list):
    """
    Builds the column list of a SQL SELECT clause by joining the given
    stat names with ", ". An empty list gives an empty string.
    """
    separator = ', '
    return separator.join(stat_list)
    

def load_data(cursor, stats, start_year, end_year):
    """
    Loads the requested stat columns for every player whose "DraftYear" lies
    between start_year and end_year (both inclusive), ordered by "PlayerID".

    cursor: open database cursor (e.g. the one returned by sql_connection)
    stats: list containing stats that the model should train on. The names
           are inserted into the query text verbatim, so they must come from
           trusted code and already be quoted where the database needs it.
    start_year, end_year: inclusive draft-year bounds; coerced with int() and
                          sent to the driver as query parameters.

    Returns a numpy array built from the fetched records (one row per record,
    one column per requested stat).
    """
    # Once parameters are passed to execute(), '%' marks a placeholder, so any
    # literal percent sign inside a column name has to be doubled.
    columns = process_stat_list(stats).replace('%', '%%')

    # JOIN ... ON requires a boolean condition and a bare "PlayerID" is not
    # one. USING joins on the column shared by both tables (assumes both
    # "PlayerStats" and "Players" have "PlayerID") and keeps the unqualified
    # "PlayerID" in ORDER BY unambiguous.
    query = (
        f'SELECT {columns} FROM "PlayerStats" JOIN "Players" USING ("PlayerID") '
        'WHERE "DraftYear" >= %s AND "DraftYear" <= %s ORDER BY "PlayerID";'
    )
    # int() also turns numpy integers into plain ints the driver can adapt.
    cursor.execute(query, (int(start_year), int(end_year)))
    return np.array(cursor.fetchall())

def load_ratings(cursor, start_year, end_year):
    """
    Returns the "Rating" of every player whose "DraftYear" lies between
    start_year and end_year (both inclusive), ordered by "PlayerID".

    cursor: open database cursor (e.g. the one returned by sql_connection)
    start_year, end_year: inclusive draft-year bounds; coerced with int() and
                          sent to the driver as query parameters instead of
                          being formatted into the SQL text.

    Returns a flattened (1-D) numpy array with one entry per fetched record.
    """
    query = (
        'SELECT "Rating" FROM "Players" '
        'WHERE "DraftYear" >= %s AND "DraftYear" <= %s ORDER BY "PlayerID";'
    )
    # int() also turns numpy integers into plain ints the driver can adapt.
    cursor.execute(query, (int(start_year), int(end_year)))
    return np.array(cursor.fetchall()).flatten()

In [15]:
# Preview the ratings of the players whose "DraftYear" is 2013 through 2019.
# Needs `cursor` to exist already (it is created by the sql_connection() cell).
load_ratings(cursor, 2013, 2019)

array([0, 4, 1, 1, 1, 1, 2, 1, 4, 1, 3, 2, 0, 0, 1, 1, 2, 1, 2, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 5, 3, 3, 4,
       1, 1, 1, 2, 4, 0, 0, 0, 2, 0, 0, 2, 1, 0, 2, 0, 0, 0, 3, 3, 0, 0,
       1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 1, 1, 1, 0, 3, 2, 5, 2, 3, 0,
       0, 0, 2, 0, 2, 1, 3, 0, 0, 2, 2, 0, 0, 1, 0, 2, 0, 0, 3, 0, 0, 0,
       0, 4, 4, 4, 1, 3, 4, 0, 3, 4, 2, 0, 0, 0, 2, 2, 0, 0, 4, 3, 1, 0,
       0, 0, 3, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 5, 2, 4,
       2, 2, 2, 5, 0, 2, 1, 3, 3, 0, 0, 3, 0, 4, 3, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 3, 5, 3, 1, 4, 5, 3, 0, 2,
       2, 1, 2, 0, 1, 1, 1, 0, 0, 2, 5, 0, 0, 2, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 5, 3, 4, 0, 3, 1, 2, 1, 3, 4, 0, 0, 1, 2, 2, 2, 1, 1, 0, 0,
       3, 3, 0, 1, 1, 2, 0, 0, 1, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 2, 2, 2, 1, 2, 2, 0, 0, 0, 3, 2, 0, 2, 0, 0, 0, 5, 4, 0, 0,
       0, 1, 1, 2, 0, 2, 3, 2, 5, 2, 2, 3])

In [None]:
# Print the id, name and rating of every row in "Players", ordered by
# "PlayerID", one record tuple per line.
cursor.execute('SELECT "PlayerID", "PlayerName", "Rating" FROM "Players" ORDER BY "PlayerID";')
records = cursor.fetchall()
for record in records:
    print(record)

# Decision Trees

In [None]:
class TreeNode(object):
    """Node of a binary decision tree.

    Fields (all set by the constructor, stored exactly as given):
        left, right: child nodes
        parent:      parent node
        cutoff_id:   index of the feature this node splits on
        cutoff_val:  threshold used for the split
        prediction:  label predicted at this node
    """

    def __init__(self, left, right, parent, cutoff_id, cutoff_val, prediction):
        # Links to the neighbouring nodes
        self.left, self.right, self.parent = left, right, parent
        # Split rule
        self.cutoff_id, self.cutoff_val = cutoff_id, cutoff_val
        # Label stored at this node
        self.prediction = prediction

In [None]:
def gini_impurity_weighted(class_counts, total_weight):
    """
    Calculates the weighted Gini impurity given the weighted counts of each class.

    The impurity is 1 - sum_k (class_counts[k] / total_weight) ** 2.

    Args:
        class_counts: Array-like (list, tuple or ndarray) containing the sum
            of the weights for each class.
        total_weight: The sum of the weights of all instances. A value of 0
            is not guarded against and gives a non-finite result.
    Returns:
        float: The weighted Gini impurity.
    """
    # Convert first: a plain list or tuple cannot be divided by a scalar.
    counts = np.asarray(class_counts, dtype=float)

    # Share of the total weight that falls into each class
    class_probs = counts / total_weight

    return 1 - np.sum(class_probs ** 2)

def sqsplit(xTr, yTr, classes, weights=None):
    """Finds the feature and cut value whose split has the largest Gini gain.

    Every feature is scanned in sorted order. A cut is considered between each
    pair of consecutive sorted values that differ by more than 100 machine
    epsilons, and is placed at the midpoint of that pair.

    Input:
        xTr:     n x d matrix of data points
        yTr:     n-dimensional vector of non-negative integer class labels
        classes: number of classes (minimum length of the class histograms)
        weights: n-dimensional weight vector for data points; None or an
                 empty sequence means uniform weights

    Output:
        feature:  index of the best cut's feature (None if no cut exists)
        cut:      cut-value of the best cut (None if no cut exists)
        bestgain: Gini gain of the best cut, i.e. the parent impurity minus
                  the weight-averaged impurity of the two children
                  (-inf if no cut exists)
    """
    N, D = xTr.shape
    assert D > 0  # must have at least one dimension
    assert N > 1  # must have at least two samples

    # Test for "no weights" explicitly instead of `weights == []`: with an
    # ndarray that comparison is elementwise and is not a usable truth value.
    if weights is None or len(weights) == 0:
        weights = np.ones(N)
    weights = np.asarray(weights, dtype=float)
    weights = weights / np.sum(weights)  # normalize so the weights sum to one
    yTr = np.asarray(yTr)

    bestgain = -np.inf
    feature = None
    cut = None

    # After normalization the total weight of the parent node is 1.
    counts = np.bincount(yTr, weights=weights, minlength=classes)
    parent_impurity = gini_impurity_weighted(counts, 1.0)

    # Consecutive values closer than this are treated as equal (no cut between them).
    min_gap = np.finfo(float).eps * 100

    for d in range(D):
        ii = xTr[:, d].argsort()  # sort data along the dth dimension
        xs = xTr[ii, d]           # sorted feature values
        ws = weights[ii]          # weights in the same order
        ys = yTr[ii]              # labels in the same order

        # Index j means "cut between sorted positions j and j + 1"; np.diff has
        # N - 1 entries, so j + 1 is always a valid position.
        idif = np.where(np.abs(np.diff(xs, axis=0)) > min_gap)[0]

        for j in idif:
            left_ws = ws[:j + 1]
            right_ws = ws[j + 1:]

            # Weighted class histograms on each side of the cut
            left_counts = np.bincount(ys[:j + 1], weights=left_ws, minlength=classes)
            right_counts = np.bincount(ys[j + 1:], weights=right_ws, minlength=classes)

            left_weight = np.sum(left_ws)
            right_weight = np.sum(right_ws)

            left_impurity = gini_impurity_weighted(left_counts, left_weight)
            right_impurity = gini_impurity_weighted(right_counts, right_weight)

            weighted_impurity = left_weight * left_impurity + right_weight * right_impurity
            gain = parent_impurity - weighted_impurity

            # Strict comparison: the first cut reaching a given gain is kept.
            if gain > bestgain:
                cut = (xs[j] + xs[j + 1]) / 2.0
                feature = d
                bestgain = gain

    if feature is None or cut is None:
        return None, None, -np.inf

    return feature, cut, bestgain

In [None]:
def cart(xTr, yTr, classes, depth=np.inf, weights=None):
    """Builds a CART classification tree.

    Every node is split with the feature/cut returned by sqsplit. Recursion
    stops and a leaf is created when the depth budget is used up, when at
    most one sample is left, when sqsplit finds no cut, or when the chosen
    cut would leave one side without samples.

    Args:
        xTr:      n x d matrix of data
        yTr:      n-dimensional vector of non-negative integer class labels
        classes:  number of classes
        depth:    number of split levels allowed; depth=0 returns a single
                  leaf and depth=1 allows one split. Defaults to no limit.
        weights:  n-dimensional weight vector for data points; None means
                  uniform weights 1/n

    Returns:
        tree: root of decision tree (TreeNode). Every node, internal or
              leaf, stores the weighted majority class of its samples as
              `prediction`; leaves have `cutoff_id` set to None.
    """
    n, _ = xTr.shape
    if weights is None:
        w = np.ones(n) / float(n)
    else:
        # Boolean-mask indexing below needs an ndarray, not a list.
        w = np.asarray(weights, dtype=float)
    yTr = np.asarray(yTr)

    def majority_class(labels, node_weights):
        # Class with the largest total weight among the given samples
        class_counts = np.bincount(labels, weights=node_weights, minlength=classes)
        return np.argmax(class_counts)

    def make_leaf(labels, node_weights):
        # Leaf node: no children, no split rule, only a prediction
        return TreeNode(None, None, None, None, None, majority_class(labels, node_weights))

    def treeRecursion(xTr, yTr, depth, weights):
        # `<=` rather than `==` so a negative or fractional depth still terminates.
        if depth <= 0 or len(yTr) <= 1:
            return make_leaf(yTr, weights)

        feature, cut, _ = sqsplit(xTr, yTr, classes, weights)
        if feature is None:  # no valid split exists for this node
            return make_leaf(yTr, weights)

        left_mask = xTr[:, feature] <= cut
        right_mask = xTr[:, feature] > cut

        # The midpoint cut can round onto one of its neighbours, which may
        # leave one side empty; do not split in that case.
        if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
            return make_leaf(yTr, weights)

        left_child = treeRecursion(xTr[left_mask], yTr[left_mask], depth - 1, weights[left_mask])
        right_child = treeRecursion(xTr[right_mask], yTr[right_mask], depth - 1, weights[right_mask])

        node = TreeNode(left_child, right_child, None, feature, cut, majority_class(yTr, weights))

        # Children are built before their parent, so link them back here.
        left_child.parent = node
        right_child.parent = node
        return node

    return treeRecursion(xTr, yTr, depth, w)

In [None]:
#<GRADED>
def evaltreehelper(root, xTe, idx=None):
    """Evaluates xTe using decision tree root.

    Each row is routed from the root downwards: it goes left when its value
    for the node's `cutoff_id` feature is <= `cutoff_val`, right otherwise.
    The walk ends at a leaf (a node whose `cutoff_id` is None) or at an
    internal node whose selected child is missing; that node's `prediction`
    is the result for the row.

    Input:
        root: TreeNode decision tree
        xTe:  n x d matrix of data points
        idx:  unused; accepted only so existing callers that pass it keep working

    Output:
        pred: n-dimensional vector of predictions
    """
    assert root is not None
    n = xTe.shape[0]
    pred = np.zeros(n)

    for i in range(n):
        data_point = xTe[i]
        node = root
        while node.cutoff_id is not None:
            if data_point[node.cutoff_id] <= node.cutoff_val:
                child = node.left
            else:
                child = node.right
            if child is None:
                # No child on that side: stay on the current node and use its
                # prediction instead of dereferencing None.
                break
            node = child
        pred[i] = node.prediction

    return pred

def evaltree(root, xTe):
    """Predicts a label for every row of xTe with the tree rooted at root.

    Thin public wrapper: all the work is done by evaltreehelper.

    Input:
        root: TreeNode decision tree
        xTe:  n x d matrix of data points

    Output:
        pred: n-dimensional vector of predictions
    """
    predictions = evaltreehelper(root, xTe)
    return predictions

In [None]:
# Open the database connection and the cursor used by the data-loading cells.
# NOTE(review): sql_connection() returns None when connecting fails, in which
# case this tuple unpacking raises TypeError.
conn, cursor = sql_connection()

In [None]:
# Stat columns used as model features. The trailing empty entry was removed
# because it left a dangling comma in the SELECT list built by load_data.
# NOTE(review): the names are pasted into the query verbatim; identifiers that
# start with a digit ("3PP", "3P", "3PA") or rely on mixed case presumably
# need double quotes in PostgreSQL - confirm against the table schema.
stats = ["Games", "MPG", "FGP", "3PP", "3P", "3PA", "FG", "FGA", "FT", "FTA", "FTP", "RPG", "APG", "SPG", "BPG"]
# load_data needs the stat list and both draft-year bounds. The window below
# is the one used by the load_ratings(cursor, 2013, 2019) call in this
# notebook - TODO confirm it is the intended training range.
xTr = load_data(cursor, stats, 2013, 2019)