In [None]:
%reload_ext nb_black

In [None]:
import pickle

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders import LeaveOneOutEncoder

# Replace with whatever model import(s) you're using

from xgboost import XGBClassifier

In [None]:
# Location of the NBA player-season CSV used throughout this notebook.
data_url = "https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Clustering/nba_player_seasons.csv"

# Read the CSV and keep only rows that have no missing values.
nba = (
    pd.read_csv(data_url)
    .dropna()
)

# Preview the first three rows.
nba.head(3)

In [None]:
# Remove the "Player" column so it is not part of the modelling frame.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally (`drop("Player", 1)`) is rejected by pandas >= 2.0.
nba = nba.drop(columns="Player")

In [None]:
# Unique positions in order of first appearance, re-indexed 0..k-1 so the
# index can serve as the integer class code.
labels = nba["Pos"].drop_duplicates().reset_index(drop=True)

# Position string -> integer code. `Series.items()` replaces `iteritems()`,
# which was removed in pandas 2.0.
label_map = {pos: i for i, pos in labels.items()}

In [None]:
# Features: every column except the target. `columns=` replaces the positional
# axis argument, which pandas >= 2.0 no longer accepts.
X = nba.drop(columns=["Pos"])

# Target: position encoded as its integer code from `label_map`.
# `map` is used rather than `replace`: every position is a key of `label_map`,
# so the values are the same, but `map` does not depend on `replace`'s
# implicit object -> int downcasting (deprecated in recent pandas).
y = nba["Pos"].map(label_map)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# List the object-dtype columns, i.e. the candidates for categorical encoding.
nba.select_dtypes(include="object").columns

In [None]:
# Column groups used by the preprocessing step. Fill these out to match the
# datatypes of your own dataset.

# Numeric columns (these are standardized by the scaler).
num_cols = [
    "Seas", "Age", "G", "GS", "MP",
    "FG", "FGA", "FG%",
    "3P", "3PA", "3P%",
    "2P", "2PA", "2P%",
    "eFG%",
    "FT", "FTA", "FT%",
    "ORB", "DRB", "TRB",
    "AST", "STL", "BLK", "TOV", "PF", "PTS",
]

# Binary columns: none in this dataset.
bin_cols = []

# Categorical columns (these are passed to the categorical encoder).
cat_cols = ["Tm"]

# Categories to drop when one-hot encoding; only relevant if the
# OneHotEncoder option is used instead of the leave-one-out encoder.
drop_cats = []
# Column-wise preprocessing.
#
# Categorical columns: use exactly one encoder. The leave-one-out encoder is
# active here; the alternative is
#     ("OneHotEncoder", OneHotEncoder(drop=drop_cats), cat_cols)
# NOTE(review): the target passed to this encoder is an integer class code
# for a multi-class label - confirm leave-one-out encoding is intended here.
encode_categoricals = ("leaveoneoutencoder", LeaveOneOutEncoder(), cat_cols)

# Numeric columns: standardize (not needed for all models but can't hurt).
scale_numerics = ("scaler", StandardScaler(), num_cols)

# Any column not listed above (e.g. bin_cols) is passed through untouched.
preprocessing = ColumnTransformer(
    transformers=[encode_categoricals, scale_numerics],
    remainder="passthrough",
)
# Full estimator: preprocessing -> PCA -> classifier.
# The step names are referenced by the "<step>__<param>" keys of the grid.
pipeline_steps = [
    ("preprocessing", preprocessing),
    ("pca", PCA()),
    # Swap in your own model here if you are not using XGBoost.
    ("model", XGBClassifier()),
]
pipeline = Pipeline(steps=pipeline_steps)
# Hyperparameter search space, keyed as "<pipeline step>__<parameter>".

# PCA: number of components to keep.
pca_grid = {"pca__n_components": [2, 4, 6]}

# XGBClassifier settings.
model_grid = {
    "model__n_estimators": [100],
    "model__subsample": [0.5, 0.75],
    "model__colsample_bytree": [0.5, 0.75],
    "model__max_depth": [3, 5],
}

# Merge into the single mapping handed to the grid search (PCA keys first).
grid = {**pca_grid, **model_grid}
# Exhaustive search over `grid` with 2-fold cross-validation, using all cores.
pipeline_cv = GridSearchCV(
    estimator=pipeline,
    param_grid=grid,
    cv=2,
    n_jobs=-1,
    verbose=1,
)
pipeline_cv.fit(X_train, y_train)

# Print the score on the training split first, then on the held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(pipeline_cv.score(features, target))

In [None]:
# The pipeline refit with the best parameter combination found by the search.
best_model = pipeline_cv.best_estimator_

# Persist the whole fitted pipeline (preprocessing + PCA + model) to disk.
with open("saved_model.pkl", mode="wb") as model_file:
    pickle.dump(best_model, model_file)

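Reset the notebook here (restart the kernel). The cells below run in a fresh session and load the pickled model from disk.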

In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pickle
import pandas as pd

<IPython.core.display.Javascript object>

In [3]:
# Reload the dataset in the fresh session, dropping rows with missing values
# the same way as at training time.
data_url = "https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Clustering/nba_player_seasons.csv"
nba = pd.read_csv(data_url).dropna()

# Remove the "Player" column. `columns=` replaces the positional axis
# argument (`drop("Player", 1)`), which pandas >= 2.0 no longer accepts.
nba = nba.drop(columns="Player")

<IPython.core.display.Javascript object>

In [8]:
# Draw three rows to stand in for "new" data for the reloaded model.
# random_state pins the draw so re-running the cell gives the same rows.
# NOTE: the rows come from the full dataset, so some of them may have been
# part of the data the model was trained on.
sampled = nba.sample(3, random_state=42)

# Show the true positions so they can be compared with the predictions.
print(sampled["Pos"])

# The model expects the feature columns only.
new_data = sampled.drop(columns="Pos")

1063    SG
1269    PF
906     SF
Name: Pos, dtype: object


<IPython.core.display.Javascript object>

In [5]:
# Restore the fitted pipeline written by the training half of the notebook.
# Unpickling can execute arbitrary code, so only load files you produced
# yourself or otherwise trust.
with open("saved_model.pkl", mode="rb") as model_file:
    loaded_model = pickle.load(model_file)

<IPython.core.display.Javascript object>

In [9]:
# Run the reloaded model on the sampled rows. The predictions are the integer
# class codes used for the target at training time, not position strings.
preds = loaded_model.predict(new_data)

<IPython.core.display.Javascript object>

In [10]:
# Rebuild the position <-> integer-code mapping the same way it was built for
# training (unique positions in order of first appearance). This only matches
# the trained model because the data is loaded and cleaned identically here.
labels = nba["Pos"].drop_duplicates().reset_index(drop=True)

# `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
label_map = {pos: i for i, pos in labels.items()}

# Invert the mapping so predicted codes can be turned back into positions.
reverse_map = {i: pos for pos, i in label_map.items()}

[reverse_map[pred] for pred in preds]


['SG', 'PF', 'SF']

<IPython.core.display.Javascript object>