# Attempt at making an LSTM

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# pytorch for lstm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

## Loading the data

#### Game Data

In [3]:
game_data_path = "data/final_game_data/"
files = os.listdir(game_data_path)
season = pd.read_csv(game_data_path + files[0])

#### Odds Data

In [4]:
odds_data_path = "data/odds_data_processed/"
odds_files = os.listdir(odds_data_path)

In [5]:
odds = pd.read_csv(odds_data_path + odds_files[1])

In [6]:
odds.shape

(1288, 14)

In [7]:
odds.sample(5)

Unnamed: 0.1,Unnamed: 0,Date,Home,Away,OU,Spread,OU_2H,Spread_2H,ML_home,ML_away,Points,Win Margin,2H Points,2H Win Margin
151,151,2007-11-21,Miami,Atlanta,190.0,7.5,93.5,3.0,-430,360,161,-3,81,-9
1056,1056,2008-03-28,Seattle,Charlotte,208.0,4.0,102.0,1.0,120,-140,189,-3,81,-1
488,488,2008-01-08,Chicago,New York,190.0,6.5,96.5,5.0,-290,240,205,-5,100,-4
822,822,2008-02-27,Atlanta,Sacramento,203.0,3.5,103.0,3.0,-240,200,240,6,123,5
118,118,2007-11-16,Atlanta,Seattle,206.0,7.5,100.5,5.5,-275,235,249,-3,111,13


#### Reconciling names

In [8]:
season_names = {'Golden State Warriors':'GSW',
                'Los Angeles Lakers': 'LAL',
                'San Antonio Spurs': 'SAS',
                'Cleveland Cavaliers': 'CLE',
                'Denver Nuggets': 'DEN',
                'Indiana Pacers': 'IND',
                'Memphis Grizzlies': 'MEM',
                'New Jersey Nets': 'BRK',
                'Brooklyn Nets': 'BRK',
                'New Orleans Hornets': 'NOP',
                'New Orleans Pelicans': 'NOP',
                'Orlando Magic': 'ORL',
                'Toronto Raptors': 'TOR',
                'Miami Heat': 'MIA',
                'Seattle SuperSonics': 'SEA',
                'Utah Jazz': 'UTA',
                'Atlanta Hawks': 'ATL',
                'Boston Celtics': 'BOS',
                'Charlotte Bobcats': 'CHA',
                'Charlotte Hornets': 'CHA',
                'Chicago Bulls': 'CHI',
                'Los Angeles Clippers': 'LAC',
                'Minnesota Timberwolves': 'MIN',
                'Phoenix Suns': 'PHO',
                'Dallas Mavericks': 'DAL',
                'Houston Rockets': 'HOU',
                'Milwaukee Bucks': 'MIL',
                'Philadelphia 76ers': 'PHI',
                'Washington Wizards': 'WAS',
                'Detroit Pistons': 'DET',
                'New York Knicks': 'NYK',
                'Sacramento Kings': 'SAC',
                'Portland Trail Blazers': 'POR',
                'Oklahoma City Thunder': 'OKC'
        }

In [9]:
odds_names = {}
for name in list(pd.unique(odds.Home)):
    found = False
    for s_name in season_names:
        if name in s_name:
            found = True
            odds_names[name] = season_names[s_name]
    if not found: print(name)

LA Lakers
LA Clippers


In [10]:
odds_names["LA Lakers"] = "LAL"
odds_names["LA Clippers"] = "LAC"
odds_names["Okla City"] = "OKC"

In [11]:
odds["Home"] = odds["Home"].apply(lambda x: odds_names[x])

In [12]:
odds["Away"] = odds["Away"].apply(lambda x: odds_names[x])

In [13]:
season["team"] = season["team"].apply(lambda x: season_names[x])
season["opponent"] = season["opponent"].apply(lambda x: season_names[x])

### Merging the two tables

In [14]:
def make_index(row, col1, col2, col3):
    return str(row[col1]) + str(row[col2]) + str(row[col3])

In [15]:
season["date"] = season["date"].apply(lambda x: str(x)[:-1])

In [16]:
season["Index"] = season.apply(lambda x: make_index(x, "date", "team", "opponent"), axis=1)

In [17]:
odds["Date"] = odds["Date"].apply(lambda x: "".join(x.split("-")))

In [18]:
odds["Index"] = odds.apply(lambda x: make_index(x, "Date", "Home", "Away"), axis=1)

In [19]:
merged = pd.merge(odds, season, on='Index')

In [20]:
merged = merged.drop(["Unnamed: 0_x", "Unnamed: 0_y", "date", "Home", "Away", "index"], axis = 1)

In [21]:
merged.sample(1)

Unnamed: 0,Date,OU,Spread,OU_2H,Spread_2H,ML_home,ML_away,Points,Win Margin,2H Points,...,opponent_HOB,opponent_STL,opponent_TRB,opponent_FTA,opponent_BLK,opponent_FTr,opponent_TS%,opponent_FT/FGA,opponent_3P%,home
275,20071208,187.5,5.0,93.0,5.5,-200,170,182,-28,96,...,1.604651,14.0,47.0,22.0,5.0,0.272,0.578959,0.148148,0.538462,1


In [22]:
odds_cols = ["OU", "Spread", "OU_2H", "Spread_2H", "ML_home", "ML_away"]
labels = ["Points", "Win Margin", "2H Points", "2H Win Margin"]
label_index = [0, 1, 2, 3]
non_numeric = ['Date','Home', 'Away']

## Format Data from LSTM (using season-data)

In [23]:
# drop non numeric features

In [24]:
labels = ["Points", "Win Margin", "2H Points", "2H Win Margin"]
label_index = [0, 1, 2, 3]
non_numeric = ['index']

In [25]:
data = season.set_index("Index")
data = data.drop(non_numeric, axis = 1)

In [26]:
data.shape

(2632, 97)

In [27]:
for team_name in list(season_names.values()):
    m = data[data.index.str.contains(team_name)]
    if m.shape[0] > 0: break

In [28]:
m = m.sort_index()
label_col = 0
start_data_cols = 4
rows = m.shape[0]
N_PREV = 3

In [29]:
X = []
y = []

current_data = []

for r in range(rows):
    if len(current_data) == N_PREV:
        X.append(current_data)
        y.append(m.iloc[r].values[label_col])
        
    row = m.iloc[r].values[start_data_cols:]
    current_data.append(row)
    if len(current_data) > N_PREV:
        current_data.pop(0)

X = np.array(X, dtype=float)
y = np.array(y, dtype=float)

In [30]:
X.shape

(161, 3, 93)

In [31]:
y.shape

(161,)

In [32]:
m.shape

(164, 97)

## LSTM