# Models

Created various models using these machine learning techniques to evaluate which ones to use moving forward.

### Classification (Classify a Winner, Loser, or if both teams Drew)

- Logistic Regression
- Decision Tree
- Random Forest
- K Nearest Neighbors
- Stochastic Gradient Descent
- Naive Bayes
- Neural Network (Multi Layer Perceptron)
- Ensemble Method

### Regression (Predicts an outcome of 0 (Home Win), 1 (Away Win), or 2 (Draw))

- Linear Regression
- Stochastic Gradient Descent
- Support Vector Machines

# Imports

In [25]:
# Data Processing
import pandas as pd

# ML Algorithms
## Classification Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier # Ensemble Classifier

## Regression Algorithms
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn import svm

# ML Tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# ML Evaluation/Metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score

# Loads Data

In [26]:
df = pd.read_csv(r"League_Result_Data/Encoded_PremierLeague_Stats_From_2014to2021.csv")

time_df = pd.read_csv(r"League_Result_Data/TimeEncoded_PremierLeague_Stats_From_2014to2021.csv")

In [27]:
# Had this column get created upon loading, just dropped it
df = df.drop(columns=["Unnamed: 0"]) 
df

Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,HomeTeam,HomeTeam Encoding,AwayTeam,AwayTeam Encoding,FTHG,FTAG,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,Fall,0,12/09/2020,2020/21,Fulham,8,Arsenal,0,0.0,3.0,...,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0
1,Fall,0,12/09/2020,2020/21,Crystal Palace,6,Southampton,21,1.0,0.0,...,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0
2,Fall,0,12/09/2020,2020/21,Liverpool,13,Leeds,11,4.0,3.0,...,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0
3,Fall,0,12/09/2020,2020/21,West Ham,28,Newcastle,17,0.0,2.0,...,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0
4,Fall,0,13/09/2020,2020/21,West Brom,27,Leicester,12,0.0,3.0,...,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,Spring,2,24/05/15,2014/15,Everton,7,Tottenham,25,0.0,1.0,...,1.0,3.0,12.0,8.0,3.0,5.0,1.0,2.0,0.0,0.0
2565,Spring,2,24/05/15,2014/15,Hull,10,Man United,15,0.0,0.0,...,6.0,1.0,12.0,15.0,8.0,1.0,2.0,2.0,0.0,1.0
2566,Spring,2,24/05/15,2014/15,Leicester,12,QPR,19,5.0,1.0,...,7.0,2.0,7.0,6.0,5.0,6.0,0.0,0.0,0.0,0.0
2567,Spring,2,24/05/15,2014/15,Man City,14,Southampton,21,2.0,0.0,...,6.0,4.0,13.0,8.0,8.0,4.0,1.0,1.0,0.0,0.0


In [28]:
time_df

# Had this column get created upon loading, just dropped it
time_df = time_df.drop(columns=["Unnamed: 0"]) 
time_df

Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,Time,Time Encoding,HomeTeam,HomeTeam Encoding,AwayTeam,AwayTeam Encoding,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,Fall,0,12/09/2020,2020/21,Afternoon,0,Fulham,8,Arsenal,0,...,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0
1,Fall,0,12/09/2020,2020/21,Late-Day,2,Crystal Palace,6,Southampton,21,...,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0
2,Fall,0,12/09/2020,2020/21,Late-Day,2,Liverpool,13,Leeds,11,...,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0
3,Fall,0,12/09/2020,2020/21,Late-Day,2,West Ham,28,Newcastle,17,...,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0
4,Fall,0,13/09/2020,2020/21,Mid-Day,1,West Brom,27,Leicester,12,...,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,Summer,3,26/07/2020,2019/20,Mid-Day,1,Leicester,12,Man United,15,...,3.0,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0
666,Summer,3,26/07/2020,2019/20,Mid-Day,1,Man City,14,Norwich,18,...,10.0,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0
667,Summer,3,26/07/2020,2019/20,Mid-Day,1,Newcastle,17,Liverpool,13,...,2.0,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0
668,Summer,3,26/07/2020,2019/20,Mid-Day,1,Southampton,21,Sheffield United,20,...,4.0,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0


# Normal Dataset (Time Isn't Included)

### Split the Data

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2569 entries, 0 to 2568
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Season                 2569 non-null   object 
 1   Season Encoding        2569 non-null   int64  
 2   Date                   2569 non-null   object 
 3   YearOfSeason           2414 non-null   object 
 4   HomeTeam               2569 non-null   object 
 5   HomeTeam Encoding      2569 non-null   int64  
 6   AwayTeam               2569 non-null   object 
 7   AwayTeam Encoding      2569 non-null   int64  
 8   FTHG                   2569 non-null   float64
 9   FTAG                   2569 non-null   float64
 10  FTR                    2569 non-null   object 
 11  FTR Encoding           2569 non-null   int64  
 12  HTHG                   2569 non-null   float64
 13  HTAG                   2569 non-null   float64
 14  HTR                    2569 non-null   object 
 15  HTR 

In [30]:
# Features -- Drops FTR and any categorical value 
X = df.drop(columns=["Season", "YearOfSeason", "Date", "HomeTeam", "AwayTeam", "Referee", "FTR", "FTR Encoding", "HTR", "Referee"])
# Labels
y = df["FTR Encoding"]

In [31]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2569 entries, 0 to 2568
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Season Encoding        2569 non-null   int64  
 1   HomeTeam Encoding      2569 non-null   int64  
 2   AwayTeam Encoding      2569 non-null   int64  
 3   FTHG                   2569 non-null   float64
 4   FTAG                   2569 non-null   float64
 5   HTHG                   2569 non-null   float64
 6   HTAG                   2569 non-null   float64
 7   HTR Encoding           2569 non-null   int64  
 8   Referee Encoding       2569 non-null   int64  
 9   Fouls Called Per Game  2569 non-null   float64
 10  HS                     2569 non-null   float64
 11  AS                     2569 non-null   float64
 12  HST                    2569 non-null   float64
 13  AST                    2569 non-null   float64
 14  HF                     2569 non-null   float64
 15  AF  

In [32]:
X

Unnamed: 0,Season Encoding,HomeTeam Encoding,AwayTeam Encoding,FTHG,FTAG,HTHG,HTAG,HTR Encoding,Referee Encoding,Fouls Called Per Game,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,0,8,0,0.0,3.0,0.0,1.0,1,7,14.0,...,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0
1,0,6,21,1.0,0.0,1.0,0.0,0,27,14.0,...,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0
2,0,13,11,4.0,3.0,3.0,2.0,0,28,14.0,...,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0
3,0,28,17,0.0,2.0,0.0,0.0,2,11,13.0,...,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0
4,0,27,12,0.0,3.0,0.0,0.0,2,10,15.0,...,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,2,7,25,0.0,1.0,0.0,1.0,1,27,14.0,...,1.0,3.0,12.0,8.0,3.0,5.0,1.0,2.0,0.0,0.0
2565,2,10,15,0.0,0.0,0.0,0.0,2,15,13.0,...,6.0,1.0,12.0,15.0,8.0,1.0,2.0,2.0,0.0,1.0
2566,2,12,19,5.0,1.0,2.0,0.0,0,28,14.0,...,7.0,2.0,7.0,6.0,5.0,6.0,0.0,0.0,0.0,0.0
2567,2,14,21,2.0,0.0,1.0,0.0,0,2,14.0,...,6.0,4.0,13.0,8.0,8.0,4.0,1.0,1.0,0.0,0.0


In [33]:
y

0       1
1       0
2       0
3       1
4       1
       ..
2564    1
2565    2
2566    0
2567    0
2568    0
Name: FTR Encoding, Length: 2569, dtype: int64

In [34]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

### Scale Features

In [35]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Classification

### Logistic Regression

Helpful links:

- https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

- https://medium.com/analytics-vidhya/l1-vs-l2-regularization-which-is-better-d01068e6658c

In [36]:
log_clf = LogisticRegression(random_state=0, penalty='l2', solver='sag', multi_class='ovr')
log_clf = log_clf.fit(X_train, y_train)

score = log_clf.score(X_test, y_test)

print(score)

0.9863813229571985


Compares the real result to the predicted result

In [37]:
print(y_test.values[1])
print(log_clf.predict([X_test[1]]))

2
[2]


Takes a look at all of the probabilities for the three classes

In [38]:
print(log_clf.predict_proba([X_test[1]]))
print(log_clf.classes_)

[[0.07866251 0.03578909 0.8855484 ]]
[0 1 2]


###### Logistic Regression Evaluation 

In [39]:
# Confusion Matrix

cv_score = cross_val_score(log_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(log_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.98394161 0.97810219 0.98832117] 

[[925   0   0]
 [  0 645   0]
 [ 14  20 451]]


34 mislabeled instances in total.

    14 instances that were labeled as a Draw when they were a win.

    20 instances were labeled as a draw when they were a loss.

In [40]:
# Precision, Recall, F1

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       925
           1       0.97      1.00      0.98       645
           2       1.00      0.93      0.96       485

    accuracy                           0.98      2055
   macro avg       0.99      0.98      0.98      2055
weighted avg       0.98      0.98      0.98      2055



### Decision Tree

Helpful Links:
- https://scikit-learn.org/stable/modules/tree.html

- https://scikit-learn.org/stable/modules/tree.html#classification

In [41]:
tree_clf = DecisionTreeClassifier(max_depth=5)
tree_clf.fit(X_train,y_train)

score = tree_clf.score(X_test, y_test)

print(score)

0.9980544747081712


In [42]:
print(y_test.values[1])
print(tree_clf.predict([X_test[1]]))

2
[2]


In [43]:
print(tree_clf.predict_proba([X_test[1]]))
print(tree_clf.classes_)

[[0. 0. 1.]]
[0 1 2]


###### Decision Tree Evaluation 

In [44]:
# Confusion Matrix

cv_score = cross_val_score(tree_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(tree_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.99854015 0.99416058 0.99854015] 

[[925   0   0]
 [  1 639   5]
 [  0   0 485]]


6 mislabeled instances in total.

    1 instance was labeled as a win when it was a win.

    20 instances were labeled as a loss when they were a draw.

In [45]:
# Precision, Recall, F1

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       925
           1       1.00      0.99      1.00       645
           2       0.99      1.00      0.99       485

    accuracy                           1.00      2055
   macro avg       1.00      1.00      1.00      2055
weighted avg       1.00      1.00      1.00      2055



### Random Forest

Helpful Links:
- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier

- https://scikit-learn.org/stable/modules/ensemble.html

In [46]:
forest_clf = RandomForestClassifier(n_estimators=10, random_state=42, max_depth=5)

forest_clf = forest_clf.fit(X_train,y_train)

score = forest_clf.score(X_test, y_test)

print(score)

0.8715953307392996


In [47]:
print(y_test.values[1])
print(forest_clf.predict([X_test[1]]))

2
[2]


In [48]:
print(forest_clf.predict_proba([X_test[1]]))
print(forest_clf.classes_)

[[0.30699527 0.14320245 0.54980228]]
[0 1 2]


###### Random Forest Evaluation 

In [49]:
# Confusion Matrix

cv_score = cross_val_score(forest_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(forest_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.87153285 0.82627737 0.90218978] 

[[907  12   6]
 [ 20 599  26]
 [158  52 275]]


274 mislabeled instances in total.

    12 instance was labeled as a win when it was a loss.

    6 instance was labeled as a win when it was a draw.
    
    20 instances were labeled as a loss when they were a win.
    
    26 instances were labeled as a loss when they were a draw.
    
    158 instances were labeled as a draw when they were a win.
    
    52 instances were labeled as a draw when they were a loss.

In [50]:
# Precision, Recall, F1

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.84      0.98      0.90       925
           1       0.90      0.93      0.92       645
           2       0.90      0.57      0.69       485

    accuracy                           0.87      2055
   macro avg       0.88      0.83      0.84      2055
weighted avg       0.87      0.87      0.86      2055



### K Nearest Neighbors

Helpful Links:
- https://scikit-learn.org/stable/modules/neighbors.html

- https://scikit-learn.org/stable/modules/neighbors.html#nearest-neighbors-classification

In [51]:
knn_clf = KNeighborsClassifier(n_neighbors=50)

knn_clf.fit(X_train,y_train)

score = knn_clf.score(X_test, y_test)

print(score)

0.745136186770428


In [52]:
print(y_test.values[1])
print(knn_clf.predict([X_test[1]]))

2
[2]


In [53]:
print(knn_clf.predict_proba([X_test[1]]))
print(knn_clf.classes_)

[[0.32 0.18 0.5 ]]
[0 1 2]


###### K Nearest Neighbors Evaluation 

In [54]:
# Confusion Matrix

cv_score = cross_val_score(knn_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(knn_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.74160584 0.7459854  0.76058394] 

[[837  22  66]
 [ 39 551  55]
 [185 148 152]]


515 mislabeled instances in total.

    22 instance was labeled as a win when it was a loss.

    66 instance was labeled as a win when it was a draw.
    
    39 instances were labeled as a loss when they were a win.
    
    55 instances were labeled as a loss when they were a draw.
    
    185 instances were labeled as a draw when they were a win.
    
    148 instances were labeled as a draw when they were a loss.

In [55]:
# Precision, Recall, F1

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.79      0.90      0.84       925
           1       0.76      0.85      0.81       645
           2       0.56      0.31      0.40       485

    accuracy                           0.75      2055
   macro avg       0.70      0.69      0.68      2055
weighted avg       0.73      0.75      0.73      2055



### Stochastic Gradient Descent

Helpful Links:
- https://scikit-learn.org/stable/modules/sgd.html

- https://scikit-learn.org/stable/modules/sgd.html#classification

In [56]:
sgd_clf = SGDClassifier(random_state=42)

sgd_clf.fit(X_train, y_train)

score = sgd_clf.score(X_test, y_test)

print(score)

0.9922178988326849


In [57]:
print(y_test.values[1])
print(sgd_clf.predict([X_test[1]]))

2
[2]


In [58]:
print(sgd_clf.predict([X_test[1]]))
print(sgd_clf.classes_)

[2]
[0 1 2]


###### Stochastic Gradient Descent Evaluation 

In [59]:
# Confusion Matrix

cv_score = cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.98394161 0.98248175 0.99708029] 

[[922   0   3]
 [  0 642   3]
 [ 19   0 466]]


25 mislabeled instances in total.

    3 instance was labeled as a win when it was a draw.
    
    3 instances were labeled as a loss when they were a draw.
    
    19 instances were labeled as a draw when they were a win.

In [60]:
# Precision, Recall, F1

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       925
           1       1.00      1.00      1.00       645
           2       0.99      0.96      0.97       485

    accuracy                           0.99      2055
   macro avg       0.99      0.98      0.99      2055
weighted avg       0.99      0.99      0.99      2055



### Naive Bayes

Helpful Links:
- https://scikit-learn.org/stable/modules/naive_bayes.html

- https://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes

In [61]:
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

score = nb_clf.score(X_test, y_test)

print(score)

0.745136186770428


In [62]:
print(y_test.values[1])
print(nb_clf.predict([X_test[1]]))

2
[2]


In [63]:
print(nb_clf.predict([X_test[1]]))
print(nb_clf.classes_)

[2]
[0 1 2]


##### Naive Bayes Evaluation

In [64]:
# Confusion Matrix

cv_score = cross_val_score(nb_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(nb_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.71824818 0.72116788 0.75620438] 

[[720  22 183]
 [ 10 475 160]
 [ 95  81 309]]


551 mislabeled instances in total.

    22 instance was labeled as a win when they were a loss.
    
    183 instance was labeled as a win when they were a loss.
    
    10 instances were labeled as a loss when they were a win.
    
    160 instances were labeled as a loss when they were a draw.
    
    95 instances were labeled as a draw when they were a win.
    
    81 instances were labeled as a draw when they were a loss.

### Neural Network

Helpful Links:
- https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier

- https://scikit-learn.org/stable/modules/neural_networks_supervised.html#classification

In [65]:
nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(5, 2), random_state=1)

nn_clf.fit(X_train, y_train)

score = nn_clf.score(X_test, y_test)

print(score)

0.9922178988326849


In [66]:
print(y_test.values[1])
print(nn_clf.predict([X_test[1]]))

2
[2]


In [67]:
print(nn_clf.predict([X_test[1]]))
print(nn_clf.classes_)

[2]
[0 1 2]


##### Neural Network Evaluation

In [68]:
# Confusion Matrix

cv_score = cross_val_score(nn_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(nn_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.99854015 0.99124088 0.99562044] 

[[925   0   0]
 [  0 645   0]
 [  0  10 475]]


10 mislabeled instances in total.
    
    10 instances were labeled as a draw when they were a loss.

### Ensemble Method

Constructed Using:
- Logistics Regression 

- Decision Tree 

- Stochastic Gradient Descent 

- Neural Network

(Use the models made above, did NOT make new models for this)

Helpful Links:
- https://scikit-learn.org/stable/modules/ensemble.html#voting-classifier

- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html#sklearn.ensemble.VotingClassifier

In [69]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('dt', tree_clf), ('sgd', sgd_clf), ('nn', nn_clf)],
    voting='hard')

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(multi_class='ovr',
                                                 random_state=0,
                                                 solver='sag')),
                             ('dt', DecisionTreeClassifier(max_depth=5)),
                             ('sgd', SGDClassifier(random_state=42)),
                             ('nn',
                              MLPClassifier(alpha=1e-05,
                                            hidden_layer_sizes=(5, 2),
                                            random_state=1, solver='lbfgs'))])

In [70]:
for clf in (log_clf, tree_clf, sgd_clf, nn_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.9863813229571985
DecisionTreeClassifier 0.9980544747081712
SGDClassifier 0.9922178988326849
MLPClassifier 0.9922178988326849
VotingClassifier 0.9941634241245136


##### Ensemble Model Evaluation

In [71]:
# Confusion Matrix

cv_score = cross_val_score(voting_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(voting_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[1. 1. 1.] 

[[925   0   0]
 [  0 645   0]
 [  0   0 485]]


0 mislabeled instances

# Regression

### Linear Regression

Helpful Links:
- https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression

- https://scikit-learn.org/stable/modules/linear_model.html#ordinary-least-squares

In [72]:
lin_reg = LinearRegression()
lin_reg = lin_reg.fit(X_train, y_train)

In [73]:
# rounds to the nearest whole number by casting it to an integer value

#for index, values in enumerate(X_test):
#    for i, value in enumerate(values):
#        X_test[index][i] = int(value)
#        if value < 0:
#            X_test[index][i] = 0
            
#X_test

In [74]:
score = lin_reg.score(X_test, y_test)

print(score)

0.3641513803720835


In [75]:
print(y_test.values[1])
print(int(lin_reg.predict([X_test[1]])))

2
1


###### Linear Regression Evaluation 

In [76]:
print(type(X_test[1]))
print(X_test[1])

<class 'numpy.ndarray'>
[-0.2026974  -1.56221251 -0.8613392  -1.13830713 -1.02000491 -0.78555632
 -0.69789433  1.05009218  0.0838971  -1.50954332  0.02546527 -1.07906974
 -0.65560733 -1.75526388  0.1220527   0.25237461 -0.20310757 -0.98521273
 -0.41157592 -0.59517863 -0.22610782 -0.27031258]


In [77]:
results = lin_reg.predict(X_test)

for index, value in enumerate(results):
    results[index] = int(value)
print(results)

[1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1.
 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1.
 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0.
 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0.
 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1.
 1. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0.
 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0.
 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 1.
 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1.
 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.

In [78]:
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, results))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.5f'
      % r2_score(y_test, results))

Mean squared error: 0.67
Coefficient of determination: -0.02551


In [79]:
for value in results:
    value = int(value)

results_df = pd.DataFrame({'Actual': y_test, 'Predicted': results})
results_df

Unnamed: 0,Actual,Predicted
672,1,1.0
2380,2,1.0
1183,2,0.0
1602,0,0.0
1055,2,1.0
...,...,...
361,0,0.0
2364,1,1.0
692,2,1.0
1523,1,1.0


### Stochastic Gradient Descent

Helpful Link
- https://scikit-learn.org/stable/modules/sgd.html#regression

In [80]:
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3)
sgd_reg.fit(X_train,y_train)

score = sgd_reg.score(X_test, y_test)

print(score)

0.36537989947101


In [81]:
print(y_test.values[1])
print(int(sgd_reg.predict([X_test[1]])))

2
1


##### Stochastic Gradient Descent Evaluation

In [82]:
results = sgd_reg.predict(X_test)

for index, value in enumerate(results):
    results[index] = int(value)
print(results)

[1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1.
 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1.
 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0.
 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0.
 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1.
 1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0.
 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0.
 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 1.
 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1.
 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.

In [83]:
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, results))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.5f'
      % r2_score(y_test, results))

Mean squared error: 0.66
Coefficient of determination: -0.00465


In [84]:
for value in results:
    value = int(value)

results_df = pd.DataFrame({'Actual': y_test, 'Predicted': results})
results_df

Unnamed: 0,Actual,Predicted
672,1,1.0
2380,2,1.0
1183,2,0.0
1602,0,0.0
1055,2,1.0
...,...,...
361,0,0.0
2364,1,1.0
692,2,1.0
1523,1,1.0


### Support Vector Machine

Helpful Links:
- https://scikit-learn.org/stable/modules/svm.html

- https://scikit-learn.org/stable/modules/svm.html#regression

In [85]:
svm_reg = svm.SVR()
svm_reg.fit(X_train, y_train)

score = svm_reg.score(X_test, y_test)

print(score)

0.39868167461405624


In [86]:
print(y_test.values[1])
print(int(svm_reg.predict([X_test[1]])))

2
1


##### Support Vector Machine Evaluation

In [87]:
results = svm_reg.predict(X_test)

for index, value in enumerate(results):
    results[index] = int(value)
print(results)

[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 1.
 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0.
 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0.
 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1.
 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0.
 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0.
 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1.
 1. 1. 2. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.

In [88]:
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, results))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.5f'
      % r2_score(y_test, results))

Mean squared error: 0.80
Coefficient of determination: -0.22525


In [89]:
for value in results:
    value = int(value)

results_df = pd.DataFrame({'Actual': y_test, 'Predicted': results})
results_df

Unnamed: 0,Actual,Predicted
672,1,0.0
2380,2,1.0
1183,2,0.0
1602,0,0.0
1055,2,1.0
...,...,...
361,0,1.0
2364,1,1.0
692,2,1.0
1523,1,1.0
