## Packages

In [3]:
# Cleaning
import pandas as pd, numpy as np, seaborn as sns
import glob
import datetime
import matplotlib.pyplot as plt
import math
from scipy import stats

#Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

#Feature Selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import mutual_info_classif

#Models
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

#Metrics
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix



## Read Data

In [4]:
# Load the weighted-average table from CSV into a DataFrame.
final_df_t1 = pd.read_csv(
    filepath_or_buffer="weighted_average_table/average_table_08_18.csv"
)

In [None]:
# Remove every column whose name contains the substring "Team".
team_columns = [name for name in final_df_t1.columns if "Team" in name]
clean_df_t1 = final_df_t1.drop(columns=team_columns)

In [None]:
# List the remaining column names, then preview the first five rows.
print(clean_df_t1.columns)
clean_df_t1.head(n=5)

## Data Analysis

### Missing Data

In [None]:
# Total number of missing cells: per-column null counts, then summed.
clean_df_t1.isnull().sum(axis=0).sum()

### Balanced or Unbalanced Target Variable

In [None]:
# Share of rows where HomeWin equals 1, rounded to three decimals.
home_win_flags = clean_df_t1["HomeWin"] == 1
home_win_rate = round(int(home_win_flags.sum()) / len(home_win_flags), 3)
print(f'Home Team Win Rate: {home_win_rate}')

### Descriptive Statistics: Team Level

#### Home Team

In [None]:
# Keep the columns whose name contains "_h" and summarise them.
is_home_column = ["_h" in column for column in clean_df_t1.columns]
home_colname = clean_df_t1.columns[is_home_column]
home_df = clean_df_t1[home_colname]
home_df.describe()

#### Away Team

In [None]:
# Keep the columns whose name contains "_a" and summarise them.
is_away_column = ["_a" in column for column in clean_df_t1.columns]
away_colname = clean_df_t1.columns[is_away_column]
away_df = clean_df_t1[away_colname]
away_df.describe()

#### Box Plot: Home and Away

In [None]:
def boxplot_features(table, title = "Team Features Box Plot"):
    """Draw one box plot per column of ``table`` on a new 20x10 figure.

    Parameters
    ----------
    table : pandas.DataFrame
        Wide table; it is melted to long format so that every column
        becomes one box along the x axis.
    title : str, optional
        Title placed above the plot. Defaults to the original fixed title.
    """
    plt.figure(figsize=(20,10))
    # pd.melt without arguments produces the "variable" / "value" columns.
    sns.boxplot(x="variable", y="value", data = pd.melt(table))
    # The tick labels already come from the "variable" column, so only
    # their orientation and size are adjusted here.
    plt.xticks(rotation='vertical', fontsize = 18)
    plt.yticks(fontsize = 18)
    plt.xlabel('variables', fontsize = 18)
    plt.ylabel('value', fontsize = 18)
    plt.title(title, fontsize = 20)

In [None]:
# Box plot of every home-team feature.
boxplot_features(table=home_df)

In [None]:
# Box plot of every away-team feature.
boxplot_features(table=away_df)

### Distribution

In [None]:
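# TODO: define home_basic_df = <DataFrame with at least 19 columns> here;
# the histogram cell below reads its first 19 columns and fails with a
# NameError until this is assigned.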

In [None]:
# Histograms of the first 19 columns of home_basic_df on a 4x5 grid,
# filled column by column (top to bottom, then left to right).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot once the installed seaborn version is confirmed.
f, axes = plt.subplots(nrows=4, ncols=5, figsize=(15, 14), sharex=False)
for i in range(19):
    grid_col, grid_row = divmod(i, 4)
    sns.distplot(home_basic_df.iloc[:, i], kde=False, ax=axes[grid_row, grid_col])

In [None]:
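# TODO: define home_advanced_df = <DataFrame with at least 14 columns> here;
# the histogram cell below reads its first 14 columns and fails with a
# NameError until this is assigned.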

In [None]:
# Histograms of the first 14 columns of home_advanced_df on a 4x4 grid,
# filled column by column (top to bottom, then left to right).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot once the installed seaborn version is confirmed.
f, axes = plt.subplots(nrows=4, ncols=4, figsize=(12, 15), sharex=False)
for i in range(14):
    grid_col, grid_row = divmod(i, 4)
    sns.distplot(home_advanced_df.iloc[:, i], kde=False, ax=axes[grid_row, grid_col])

### Heatmap Correlation

In [None]:
def corrheatmap(correlation):
    """Plot a lower-triangle heatmap of a square correlation matrix.

    Parameters
    ----------
    correlation : pandas.DataFrame or 2-D array
        Square matrix of correlations; the colour scale is fixed to
        [-1, 1] and centred on 0.
    """
    # Mask the upper triangle (diagonal included) so that each pair is
    # shown once. The builtin ``bool`` is used because the ``np.bool``
    # alias was removed in NumPy 1.24.
    heatmap_mask = np.zeros_like(correlation, dtype=bool)
    heatmap_mask[np.triu_indices_from(heatmap_mask)] = True

    # adjust the figure size
    f, ax = plt.subplots(figsize=(11, 9))

    # diverging palette used for the cells and the colour bar
    color_map = sns.diverging_palette(220, 10, as_cmap=True)
    # the main heatmap plot, drawn on the axes created above
    sns.heatmap(correlation, mask = heatmap_mask, cmap = color_map,
                square = True, linewidths = 3., center = 0,
                vmax = 1, vmin = -1, cbar_kws={"shrink": .7}, ax = ax)

In [None]:
# Pairwise Pearson correlations of the home-team columns, shown as a heatmap.
home_corr = home_df.corr(method="pearson")
corrheatmap(correlation=home_corr)

### Box Plot: Features Distribution vs. Target Variable Categories

In [None]:
# Compare the distribution of the home-team features between HomeWin classes.
f, axes = plt.subplots(2, 1, figsize = (12, 15), sharex = False)
# home_df is built from the "_h" columns only, so the target column is
# attached here explicitly; without it pd.melt(id_vars="HomeWin") raises a
# KeyError. assign() aligns on the row index shared with clean_df_t1.
target = clean_df_t1["HomeWin"]
# Top panel: every home_df column except its last two.
data_left = pd.melt(home_df.iloc[:, :-2].assign(HomeWin = target),
                    id_vars = "HomeWin", var_name="variables", value_name='value')
# Bottom panel: ORtg_h and DRtg_h on their own axis.
data_right = pd.melt(home_df.loc[:, ["ORtg_h", "DRtg_h"]].assign(HomeWin = target),
                     id_vars = "HomeWin", var_name="variables", value_name='value')
for ax, panel_data in zip(axes, (data_left, data_right)):
    sns.boxplot(x = "variables", y = "value", hue = "HomeWin", data = panel_data, ax = ax)
    # plt.xticks only acts on the current axes (the last subplot created),
    # so the rotation is applied to each axes explicitly.
    ax.tick_params(axis = "x", labelrotation = 90)