# EDA - FB/RB Analysis

---
## Imports

In [None]:
%matplotlib inline 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Notebook-wide RandomState seeded with 8 (not referenced by the cells visible here).
rs = np.random.RandomState(seed=8)
sns.set(color_codes=True)
# Turn off pandas' chained-assignment warning; the option's default is 'warn'.
pd.set_option('mode.chained_assignment', None)

## Introduction
Exploratory analysis of the FB/RB positions, looking for relationships between college and combine data and the number of Pro Bowls.

## Loading Data
All data was pulled from the NFL_Success GitHub project (`final_df.csv`).

In [None]:
# NOTE(review): absolute path on the original author's machine -- point this at
# your own copy of final_df.csv before running.
DATA_PATH = "C:/Users/matth/Documents/GitHub/NFL_Success/final_df.csv"

# Drop the positions this notebook does not analyse, plus the 'Rnd' column.
positions = ['TE', 'QB']
df = pd.read_csv(DATA_PATH)
df = df[~df.Pos.isin(positions)].drop('Rnd', axis=1)

# Persist the filtered table and read it back so full_df gets a fresh 0..n-1
# index. index=False keeps the old row index out of the file; without it the
# reload picks it up as a spurious 'Unnamed: 0' column.
df.to_csv('my_analysis.csv', index=False)
full_df = pd.read_csv('my_analysis.csv')

# Explicit copies: later cells add helper columns to fr_df, which should not be
# done on a filtered view of full_df.
wr_df = full_df[full_df['Pos'] == 'WR'].copy()
fr_df = full_df[full_df['Pos'] != 'WR'].copy()

In [None]:
# Preview the first five WR rows.
wr_df.head(5)

In [None]:
# Preview the first five non-WR rows (the FB/RB frame used below).
fr_df.head(5)

## Analysis
---

### Heat map for college stats

In [None]:
# Correlate PB with the college production columns and draw the matrix as an
# annotated heat map.
college_cols = [
    'PB', 'C Games', 'C Rec', 'C RecYds', 'C  RecAvg', 'C RecTD',
    'C ScrimPLays', 'C ScrimYds', 'C ScrimAvg', 'C ScrimTD',
]
hm = fr_df[college_cols]
college_corr = hm.corr()
plt.figure(figsize=(10, 7))
cmap = sns.diverging_palette(h_neg=255, h_pos=12, as_cmap=True)
g = sns.heatmap(college_corr, annot=True, cmap=cmap)
g.set_title('Heat map for College Stats')

### Heat map for combine stats

In [None]:
# Correlate PB with draft position and combine measurements and draw the matrix
# as an annotated heat map.
combine_cols = [
    'PB', 'Round', 'Wt', '40yd', 'Vertical',
    'Broad Jump', '3Cone', 'Shuttle', 'Pick', 'Year',
]
hm = fr_df[combine_cols]
combine_corr = hm.corr()
plt.figure(figsize=(10, 7))
cmap = sns.diverging_palette(h_neg=255, h_pos=12, as_cmap=True)
g = sns.heatmap(combine_corr, annot=True, cmap=cmap)
g.set_title('Heat map for Combine Stats')

### Number of Pro Bowls based on round drafted

In [None]:
# One bar per draft round; bar height is seaborn's default aggregate of PB.
g = sns.barplot(data=fr_df, x='Round', y='PB', palette='bright')
g.set_title('PB per round')

### Number of Pro Bowls based on bench reps at combine and round drafted

In [None]:
# Flag players whose bench reps beat the group mean (mean() skips NaN).
fr_df['Bench Reps above average'] = fr_df['Bench'] > fr_df['Bench'].mean()

# `NaN > mean` evaluates to False, so a player with no recorded bench result
# would be drawn in the "not above average" bars. Plot measured players only.
bench_measured = fr_df[fr_df['Bench'].notna()]
g = sns.barplot(x='Round', y='PB', hue='Bench Reps above average', data=bench_measured, palette='bright')
g.set_title('PB per round by bench')

### Number of Pro Bowls based on 40yd time at combine and round drafted

In [None]:
# Flag players whose 40yd time is below (quicker than) the group mean.
fr_df['40yd time quicker than average'] = fr_df['40yd'] < fr_df['40yd'].mean()

# `NaN < mean` evaluates to False, so a player with no recorded 40yd time would
# be drawn in the "not quicker" bars. Plot measured players only.
forty_measured = fr_df[fr_df['40yd'].notna()]
g = sns.barplot(x='Round', y='PB', hue='40yd time quicker than average', data=forty_measured, palette='bright')
g.set_title('PB per round by average 40yd time')

### Number of Pro Bowls based on vertical height at combine and round drafted

In [None]:
# Flag players whose vertical jump beats the group mean (mean() skips NaN).
fr_df['Vertical height above average'] = fr_df['Vertical'] > fr_df['Vertical'].mean()

# `NaN > mean` evaluates to False, so a player with no recorded vertical would
# be drawn in the "not above average" bars. Plot measured players only.
vertical_measured = fr_df[fr_df['Vertical'].notna()]
g = sns.barplot(x='Round', y='PB', hue='Vertical height above average', data=vertical_measured, palette='bright')
g.set_title('PB per round by average vertical')

### Number of Pro Bowls based on amount of college scrimmage yards

In [None]:
# Joint distribution of college scrimmage yards (x) against PB (y).
g = sns.jointplot(data=fr_df, x='C ScrimYds', y='PB', color="#4CB391")

### Number of Pro Bowls based on number of college scrimmage plays

In [None]:
# Joint distribution of college scrimmage plays (x) against PB (y).
g = sns.jointplot(data=fr_df, x='C ScrimPLays', y='PB', color="#4CB391")

### Number of Pro Bowls based on number of college rushing touchdowns

In [None]:
# PB aggregated per distinct college rushing-touchdown count.
g = sns.barplot(x='C RushTD', y='PB', data=fr_df, palette='bright')
# Hide every other x tick label so the crowded axis stays readable. One pass is
# enough; the original repeated this identical loop twice.
for label in g.get_xticklabels()[::2]:
    label.set_visible(False)
# The x axis is rushing touchdowns; the original title said "Scrim TD", copied
# from the scrimmage-touchdown plot.
g.set_title('PB by College Rush TD')

### Number of Pro Bowls based on number of college scrimmage touchdowns

In [None]:
# PB aggregated per distinct college scrimmage-touchdown count.
g = sns.barplot(x='C ScrimTD', y='PB', data=fr_df, palette='bright')
# Hide every other x tick label so the crowded axis stays readable. One pass is
# enough; the original repeated this identical loop twice.
for label in g.get_xticklabels()[::2]:
    label.set_visible(False)
g.set_title('PB by College Scrim TD')
plt.show()

### Number of Pro Bowls based on number of college rushing touchdowns, split by above- or below-average college scrimmage touchdowns

In [None]:
# Boolean hue: does the player's college scrimmage-TD total exceed the group mean?
scrim_td = fr_df['C ScrimTD']
fr_df['College Scrimmage Touchdowns above average'] = scrim_td.gt(scrim_td.mean())

# PB against college rushing TDs, one line per hue value.
g = sns.lineplot(
    data=fr_df,
    x='C RushTD',
    y='PB',
    hue='College Scrimmage Touchdowns above average',
    palette='bright',
)
g.set_title('PB by College Rush TD and average Scrim TD')
plt.show()

## Model
---

### FB/RB
Naive Bayes Model

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Binary target: True when the player has at least one Pro Bowl.
y = fr_df['PB'] > 0
features = ['Round', '40yd', 'Wt', 'Vertical', 'C ScrimTD', 'C RecTD']

# Physical measurements cannot be 0, so filling their gaps with 0 would feed
# the Gaussian model impossible values (a 0-second 40yd dash, a 0 lb player).
# Use each column's median for those; 0 remains the fill for everything else.
measured = ['40yd', 'Wt', 'Vertical']
X = fr_df[features].copy()
X[measured] = X[measured].fillna(X[measured].median())
X = X.fillna(0)

In [None]:
# Fixed seed so the 80/20 split, and every metric computed from it, is the same
# on each run; without random_state the split is reshuffled every execution.
SPLIT_SEED = 8
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=SPLIT_SEED
)

# Fit Gaussian Naive Bayes on the training rows and predict the held-out rows.
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, y_pred))
# score() on a classifier is the mean accuracy on the given data, not a
# confidence, so label it accordingly.
print("Accuracy:", nb.score(X_test, y_test))
# Per-class precision/recall/F1; accuracy alone says little when one class
# dominates the target.
print(classification_report(y_test, y_pred))
print("Number of mislabeled points out of a total %d points : %d"
     % (X_test.shape[0], (y_test != y_pred).sum()))