<a href="https://colab.research.google.com/github/AmeliaProbst/Coursera_Capstone/blob/master/Mets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Load in libraries/packages
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import io
from google.colab import files

In [48]:
#Load in data
uploaded = files.upload()
df2 = pd.read_csv(io.BytesIO(uploaded['train.csv']))
test = pd.read_csv(io.BytesIO(uploaded['Q2_pitches_test.csv']))

Saving train.csv to train (3).csv
Saving Q2_pitches_test.csv to Q2_pitches_test (7).csv


In [None]:
#Check for NA values
df2.isnull().sum()

test.isnull().sum()

inning                   0
is_bottom                0
balls                    0
strikes                  0
outs_before              0
is_lhp                   0
is_lhb                   0
bat_score_before         0
field_score              0
basecode_before          0
batterid                 0
pitcherid                0
cid                      0
FF                  160306
FT                  160306
CB                  160306
SL                  160306
CH                  160306
dtype: int64

In [49]:
#Categorize variables
df2["is_bottom"] = df2["is_bottom"].astype("category")
df2["pitch_type"] = df2["pitch_type"].astype("category")
df2["is_lhp"] = df2["is_lhp"].astype("category")
df2["is_lhb"] = df2["is_lhb"].astype("category")
df2["basecode_before"] = df2["basecode_before"].astype("category")
df2["inning"] = df2["inning"].astype("category")
df2["batterid"] = df2["batterid"].astype("category")
df2["pitcherid"] = df2["pitcherid"].astype("category")



In [52]:
test["is_bottom"] = test["is_bottom"].astype("category")
test["is_lhp"] = test["is_lhp"].astype("category")
test["is_lhb"] = test["is_lhb"].astype("category")
test["basecode_before"] = test["basecode_before"].astype("category")
test["inning"] = test["inning"].astype("category")
test["batterid"] = test["batterid"].astype("category")
test["pitcherid"] = test["pitcherid"].astype("category")

In [None]:
#Make test and train sets
X = df2[["inning", "is_bottom", "balls", "strikes", "outs_before","is_lhp", "is_lhb", "bat_score_before", "field_score", "basecode_before", "batterid", "pitcherid", "cid"]]
y = df2.loc[:,"pitch_type"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) #20% in test data 

clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=26, max_features=4, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=2, oob_score=False, random_state=10,
            verbose=0, warm_start=False)

#Train the model
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) #47% accuracy


Accuracy: 0.4683364004033977


In [None]:
#Looking at important features
features = pd.get_dummies(df2)
feature_list = list(features.columns)

# Get numerical feature importances
importances = list(clf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

#most important features: pitcherid, field_score, score_diff, strikes, count

Variable: pitcherid_1000       Importance: 0.2
Variable: pitcherid_1001       Importance: 0.2
Variable: pitcherid_1002       Importance: 0.12
Variable: field_score          Importance: 0.08
Variable: cid                  Importance: 0.08
Variable: Unnamed: 0           Importance: 0.07
Variable: freq                 Importance: 0.06
Variable: score_diff           Importance: 0.06
Variable: strikes              Importance: 0.05
Variable: count                Importance: 0.03
Variable: balls                Importance: 0.02
Variable: outs_before          Importance: 0.01
Variable: bat_score_before     Importance: 0.01


[None, None, None, None, None, None, None, None, None, None, None, None, None]

In [None]:
# List of features sorted from most to least important
sorted_importances = [importance[1] for importance in feature_importances]
sorted_features = [importance[0] for importance in feature_importances]
# Cumulative importances
cumulative_importances = np.cumsum(sorted_importances)

print('Number of features for 95% importance:', np.where(cumulative_importances > 0.95)[0][0] + 1)

# Extract the names of the most important features
important_feature_names = [feature[0] for feature in feature_importances[0:9]]
# Find the columns of the most important features
important_indices = [feature_list.index(feature) for feature in important_feature_names]
# Create training and testing sets with only the important features
important_train_features = X_train.iloc[:, important_indices]
important_test_features = X_test.iloc[:, important_indices]

#Fit model on important features
clf.fit(important_train_features, y_train)
y_pred=clf.predict(important_test_features)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) #44% accuracy


Number of features for 95% importance: 10
Accuracy: 0.4598525726999574


In [53]:
#Predicting the final results
test = test[["batterid", "pitcherid", "cid", "bat_score_before", "field_score", "inning", "balls", "basecode_before", "outs_before"]]
y_pred = clf.predict_proba(test)#probability of each pitch

#writing to a csv file
results = pd.DataFrame(y_pred, columns=["CB", "CH", "FF", "FT", "SL"])
results.to_csv('prob.csv', index=False)

160306

In [None]:
""