    Ben Christensen
    Math 347
    3/10/18

Use the PageRank algorithm to predict the top 6 NCAA teams based on games from 2013.

In [1]:
import numpy as np
from scipy.sparse import dok_matrix
from scipy import linalg as la
import pdb

In [2]:
# Problem 1
def to_matrix(filename, n):
    """Return the n x n adjacency matrix described by a data file.

    Every usable line of the file holds two whitespace-separated integers,
    '<from node> <to node>'. Each such line adds 1 to entry [from, to], so
    repeated edges accumulate. Lines that cannot be read that way (headers,
    comments, blank lines, node indices outside the matrix) are reported on
    stdout and skipped.

    Parameters:
        filename (str): The name of a .txt file describing a directed graph.
        n (int): The number of nodes in the graph described by the file.

    Returns:
        A SciPy sparse dok_matrix of shape (n, n).
    """
    A = dok_matrix((n, n))
    with open(filename, 'r') as myfile:
        for line in myfile:
            fields = line.strip().split()
            try:
                A[int(fields[0]), int(fields[1])] += 1
            except (ValueError, IndexError):
                # ValueError: a field is not an integer (header/comment text).
                # IndexError: fewer than two fields, or a node index that does
                # not fit in the n x n matrix.
                print("Line skipped:", fields)

    return A


# Problem 2
def calculateK(A,N):
    """Compute the column-stochastic matrix K used by PageRank.

    Rows of the adjacency matrix that contain only zeros (sinks) are replaced
    by rows of ones, every row is divided by its sum, and the result is
    transposed. The computation is done on a copy, so the caller's array is
    left untouched.

    Parameters:
        A (ndarray): square adjacency matrix of a directed graph.
        N (int): the number of nodes. Not used in the computation; kept so
            existing callers keep working.

    Returns:
        K (ndarray): float array with the same shape as A.
    """
    # np.array copies by default, so the sink replacement below cannot leak
    # back into the caller's matrix.
    A = np.array(A, dtype=float)
    # A sink is a row with no nonzero entry; give it an edge to every node.
    sinks = ~A.any(axis=1)
    A[sinks] = 1.0
    # Row sums are the diagonal entries of the D matrix.
    D = A.sum(axis=1)

    return (A / D.reshape((-1, 1))).T


# Problem 3
def iter_solve(adj, N=None, d=.85, tol=1E-5):
    """Return the page ranks of the network described by 'adj'.
    Iterate through the PageRank algorithm until the error is less than 'tol'.

    Parameters:
        adj (ndarray): The adjacency matrix of a directed graph.
        N (int): Restrict the computation to the first 'N' nodes of the graph.
            If N is None (default), use the entire matrix.
        d (float): The damping factor, a float between 0 and 1.
        tol (float): Stop iterating when the change in approximations to the
            solution is less than 'tol'.

    Returns:
        The approximation to the steady state (ndarray of length N).
    """
    if N is None:
        N = adj.shape[0]
    # Keep only the first N nodes, and work on a float copy so the caller's
    # matrix is never modified.
    sub = np.array(adj[:N, :N], dtype=float)
    p = np.random.random(N)
    K = calculateK(sub, N)
    # Scale the random initial guess to sum to 1 so it is a probability
    # vector, as every later iterate is.
    p = p / p.sum()
    # These two terms do not change between iterations.
    damped = d * K
    teleport = ((1 - d) / N) * np.ones(N)
    p1 = damped @ p + teleport
    while la.norm(p1 - p) > tol:
        p = p1
        p1 = damped @ p + teleport
    return p1



# Problem 4
def eig_solve(adj, N=None, d=.85):
    """Return the page ranks of the network described by 'adj'. Use SciPy's
    eigenvalue solver to calculate the steady state of the PageRank algorithm

    Parameters:
        adj (ndarray): The adjacency matrix of a directed graph.
        N (int): Restrict the computation to the first 'N' nodes of the graph.
            If N is None (default), use the entire matrix.
        d (float): The damping factor, a float between 0 and 1.

    Returns:
        The approximation to the steady state (real ndarray of length N,
        scaled so its entries sum to 1).
    """
    if N is None:
        N = adj.shape[0]
    # Keep only the first N nodes, and work on a float copy so the caller's
    # matrix is never modified.
    K = calculateK(np.array(adj[:N, :N], dtype=float), N)
    B = d*K + ((1-d)/N)*np.ones((N,N))

    # Decompose once. The solver does not promise any ordering of the
    # eigenvalues, so pick the eigenvector of the one with the largest real
    # part (the eigenvalue 1 of the column-stochastic matrix B) explicitly.
    vals, vecs = la.eig(B)
    steady = vecs[:, np.argmax(vals.real)]
    # The eigenvector of a real eigenvalue of a real matrix is real; drop the
    # zero imaginary part that appears when other eigenvalues are complex.
    steady = np.real(steady)

    return steady / np.sum(steady)


# Problem 5
def team_rank(filename='/Users/benchristensen/Desktop/ACME Python Labs/Volume1-Student-Materials/PageRank/ncaa2013.csv'):
    """Use iter_solve() to predict the rankings of the teams in the given
    dataset of games. The dataset should have two columns, representing
    winning and losing teams. Each row represents a game, with the winner on
    the left, loser on the right. Parse this data to create the adjacency
    matrix, and feed this into the solver to predict the team ranks.

    The first line of the file is treated as a header and ignored, and blank
    lines are skipped. Each game becomes an edge from the loser to the winner;
    repeated games between the same pair in the same direction count once.

    Parameters:
        filename (str): The name of the data file.
    Returns:
        ranks (ndarray): The ranks of the teams from best to worst.
        teams (list): The names of the teams, also from best to worst.
    """
    matches = []
    # Read in the matches, one [winner, loser, ...] list per game
    with open(filename, 'r') as ncaafile:
        ncaafile.readline()     # skip the header row
        for line in ncaafile:
            line = line.strip()
            if not line:
                # e.g. a trailing empty line at the end of the file
                continue
            matches.append(line.split(','))

    # Sort the names so the team -> index assignment is the same on every
    # run (iteration order of a set of strings is not).
    teams = sorted({name for match in matches for name in match[:2]})
    n = len(teams)
    index_of = {name: i for i, name in enumerate(teams)}

    # Edge from the loser (column 1) to the winner (column 0)
    A = dok_matrix((n,n))
    for match in matches:
        A[index_of[match[1]], index_of[match[0]]] = 1

    # Solve for the ranking of the teams
    ranks = iter_solve(A.toarray(), d=.7)
    # Indices of the teams from highest rank to lowest
    mask = np.argsort(ranks)[::-1]
    return ranks[mask], list(np.array(teams)[mask])



In [3]:
# Build the 8-node adjacency matrix from the sample edge list.
A = to_matrix("/Users/benchristensen/Desktop/ACME Python Labs/Volume1-Student-Materials/PageRank/matrix.txt", 8)
# K is computed here but not used by the statements below in this cell.
K = calculateK(A.toarray(), 8)
# Each solver receives its own dense copy from toarray(); print both results
# so the iterative and eigenvector answers can be compared.
print(iter_solve(A.toarray()))
print(eig_solve(A.toarray()))
# Rank the teams using team_rank's default data file and show the top six.
ranks, teams = team_rank()
print(teams[0:6])

Line skipped: ['From', 'Node', 'To', 'Node']
[ 0.43870689  0.02171029  0.02786154  0.02171029  0.02171029  0.02786154
  0.04585394  0.39460775]
[ 0.43869288  0.02171029  0.02786154  0.02171029  0.02171029  0.02786154
  0.04585394  0.39459924]
['Duke', 'Butler', 'Louisville', 'Illinois', 'Indiana', 'Miami FL']


In [4]:
pwd

'/Users/benchristensen/Desktop/ACME Python Labs/Volume1-Student-Materials/ComplexFunctions'