## IA 651 Project Code (Linear Regression and SVR with fewer features)

Using features not associated with a team's goal_differential to fit a linear model and assess it compared to other models.

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from pca import pca
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics as metrics
from sklearn.tree import plot_tree

from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

Loading in the same data:

In [2]:
# Read the team statistics CSV (path is relative to the notebook's working
# directory) and print (n_rows, n_columns) as a quick sanity check on the load.
df = pd.read_csv("nwsl-team-stats.csv")
print(df.shape)

(59, 13)


In [3]:
# Preview the first five rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,team_name,season,games_played,goal_differential,goals,goals_conceded,cross_accuracy,goal_conversion_pct,pass_pct,pass_pct_opposition_half,possession_pct,shot_accuracy,tackle_success_pct
0,Boston Breakers,2016,20,-33,14,47,25.57,8.97,67.38,57.86,47,42.95,77.42
1,Boston Breakers,2017,24,-11,24,35,23.7,12.37,72.53,61.42,48,42.78,73.49
2,Chicago Red Stars,2016,21,3,25,22,21.19,11.79,67.35,57.74,46,48.58,84.32
3,Chicago Red Stars,2017,25,2,33,31,21.08,13.1,69.23,61.52,47,49.6,71.29
4,Chicago Red Stars,2018,25,8,38,30,25.96,13.67,71.63,64.55,51,45.68,67.97


Selecting the new features for the linear regression model:

In [4]:
# Predictor columns kept for the reduced-feature models, one per line so the
# selection is easy to scan and edit.
features = [
    'games_played',
    'cross_accuracy',
    'pass_pct',
    'pass_pct_opposition_half',
    'possession_pct',
    'tackle_success_pct',
]

In [5]:
# Regression target: the `goal_differential` column of the loaded frame.
y = df['goal_differential']

In [6]:
# Design matrix restricted to the selected predictor columns.
X = df[features]
# Independent copy of X, previewed below.
data = X.copy()
data.head()

Unnamed: 0,games_played,cross_accuracy,pass_pct,pass_pct_opposition_half,possession_pct,tackle_success_pct
0,20,25.57,67.38,57.86,47,77.42
1,24,23.7,72.53,61.42,48,73.49
2,21,21.19,67.35,57.74,46,84.32
3,25,21.08,69.23,61.52,47,71.29
4,25,25.96,71.63,64.55,51,67.97


Creating a train test split:

In [7]:
# Hold out 20% of the rows as a test set; random_state is fixed so the same
# split is reproduced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Scaling the data:

In [8]:
# Learn the standardization statistics from the training split only, then
# apply that same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Linear Regression

Fitting the linear regression model:

In [9]:
# Ordinary least squares on the standardized training features
# (`fit` returns the estimator itself, so construction and fitting are chained).
model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions on the held-out split and on the training split.
y_test_pred = model.predict(X_test_scaled)
y_train_pred = model.predict(X_train_scaled)


Obtaining MSE and R Squared values:

In [10]:
# Score the linear model on each split: mean squared error and R^2.
mse_train, r2_train = (
    mean_squared_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)
mse_test, r2_test = (
    mean_squared_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)

# Report training first, then test, rounded to two decimals.
print(f"Training set - MSE: {mse_train:.2f}, R2: {r2_train:.2f}")
print(f"Test set - MSE: {mse_test:.2f}, R2: {r2_test:.2f}")


Training set - MSE: 121.88, R2: 0.36
Test set - MSE: 192.14, R2: 0.46


Obtaining the coefficients for the X variables:

In [11]:
# Fitted linear-regression coefficients, one per column of the scaled design
# matrix (same order as `features`). The inputs were standardized, so each
# value is the change in predicted goal_differential per one training-set
# standard deviation of that feature.
# NOTE: `model` is rebound to an SVR further down; run this cell before the
# SVR section so it still refers to the LinearRegression.
model.coef_

array([ 2.30852634,  2.82975781, -5.22457637,  2.46081466,  8.27211604,
       -1.99293062])

Obtaining the intercept value:

In [12]:
# Intercept of the fitted linear model. The training features were
# standardized (zero mean on the training split), so for least squares this
# is the mean of y_train.
model.intercept_

0.34042553191487984

### SVR

Performing SVR with the reduced features:

In [13]:
# Re-standardize the splits, this time wrapping the results in DataFrames so
# the original row index and column labels are kept.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
# The test split is transformed with the statistics learned on the training split.
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_train.columns,
)

In [14]:
# Baseline RBF-kernel SVR with hand-picked hyperparameters (C=1, gamma=1),
# fitted on the standardized training split.
model = SVR(C=1, gamma=1, kernel='rbf').fit(X_train_scaled, y_train)

# Predictions for the training split and the held-out split.
y_train_pred_SVR = model.predict(X_train_scaled)
y_test_pred_SVR = model.predict(X_test_scaled)

In [15]:
# Mean squared error of the SVR predictions on each split.
mse_train_SVR, mse_test_SVR = (
    mean_squared_error(y_train, y_train_pred_SVR),
    mean_squared_error(y_test, y_test_pred_SVR),
)

print(f'Training MSE SVR: {mse_train_SVR}')
print(f'Test MSE SVR: {mse_test_SVR}')

Training MSE SVR: 163.21296763665762
Test MSE SVR: 349.7363627189634


In [16]:
# R^2 of the SVR predictions, computed for (train, test) in that order.
r2_train_SVR, r2_test_SVR = (
    r2_score(y_true, y_pred)
    for y_true, y_pred in ((y_train, y_train_pred_SVR), (y_test, y_test_pred_SVR))
)

print(f'Training R Squared SVR: {r2_train_SVR}')
print(f'Test R Squared SVR: {r2_test_SVR}')

Training R Squared SVR: 0.13717298421136292
Test R Squared SVR: 0.012257315952483028


Performing a grid search to find optimal gamma and C:

In [17]:
# Exhaustive search over C and gamma for a default-kernel SVR, scored by
# negative MSE under 5-fold shuffled cross-validation with a fixed seed.
# NOTE(review): X_train_scaled was standardized on the whole training split
# before this search, so each validation fold contributed to the scaling
# statistics; putting the scaler and SVR in a Pipeline would avoid that.
folds = KFold(n_splits=5, shuffle=True, random_state=7)
model = SVR()

params = {
    'C': [0.1, 1, 10, 100, 1000, 10000, 20000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}

c_opt = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=folds,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)
c_opt.fit(X_train_scaled, y_train)

# Keep the full per-candidate CV table for later inspection.
c_results = pd.DataFrame(c_opt.cv_results_)

print(f'Negative MSE: {c_opt.best_score_}')
print(f'Best Parameters: {c_opt.best_params_}')

Fitting 5 folds for each of 35 candidates, totalling 175 fits
Negative MSE: -108.45965244112892
Best Parameters: {'C': 1000, 'gamma': 0.01}


Fitting SVR again but with optimal gamma and C:

In [18]:
# Refit an RBF-kernel SVR on the scaled training split using the
# hyperparameters chosen by the grid search. They are read from
# `c_opt.best_params_` instead of being re-typed by hand, so this cell stays
# in sync with the search if the data, the grid or the CV seed changes.
best_params = c_opt.best_params_
model = SVR(kernel='rbf', C=best_params['C'], gamma=best_params['gamma'])
model.fit(X_train_scaled, y_train)

# Predictions from the tuned model (these overwrite the baseline SVR predictions).
y_test_pred_SVR = model.predict(X_test_scaled)
y_train_pred_SVR = model.predict(X_train_scaled)

Obtaining the MSE values:

In [19]:
# MSE for the SVR predictions currently held in y_*_pred_SVR
# (the tuned model's predictions when the cells are run in order).
mse_train_SVR = mean_squared_error(y_train, y_train_pred_SVR)
mse_test_SVR = mean_squared_error(y_test, y_test_pred_SVR)

print(f'Training MSE SVR: {mse_train_SVR}')
print(f'Test MSE SVR: {mse_test_SVR}')

Training MSE SVR: 68.0004078469631
Test MSE SVR: 438.1960227751694


Obtaining the R Squared values:

In [20]:
# R^2 for the SVR predictions currently held in y_*_pred_SVR
# (the tuned model's predictions when the cells are run in order).
r2_train_SVR, r2_test_SVR = (
    r2_score(y_train, y_train_pred_SVR),
    r2_score(y_test, y_test_pred_SVR),
)

print(f'Training R Squared SVR: {r2_train_SVR}')
print(f'Test R Squared SVR: {r2_test_SVR}')

Training R Squared SVR: 0.6405151513106393
Test R Squared SVR: -0.23757481867190489
