# Models

Created various models using these machine learning techniques to evaluate which ones to use moving forward.

### Classification (Classify a Winner, Loser, or if both teams Drew)

- Logistic Regression
- Decision Tree
- Random Forest
- K Nearest Neighbors
- Stochastic Gradient Descent
- Naive Bayes
- Neural Network (Multi Layer Perceptron)
- Ensemble Method

### Regression (Predicts an outcome of 0 (Home Win), 1 (Away Win), or 2 (Draw))

- Linear Regression
- Stochastic Gradient Descent
- Support Vector Machines

# Imports

In [2]:
# Data Processing
import pandas as pd

# ML Algorithms
## Classification Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier # Ensemble Classifier

## Regression Algorithms
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn import svm

# ML Tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# ML Evaluation/Metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score

# Loads Data

In [3]:
df = pd.read_csv(r"League_Result_Data/Encoded_PremierLeague_Stats_From_2014to2021.csv")

time_df = pd.read_csv(r"League_Result_Data/TimeEncoded_PremierLeague_Stats_From_2014to2021.csv")

In [4]:
# Had this column get created upon loading, just dropped it
df = df.drop(columns=["Unnamed: 0"]) 
df

Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,HomeTeam,HomeTeam Encoding,AwayTeam,AwayTeam Encoding,FTHG,FTAG,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,Fall,0,12/09/2020,2020/21,Fulham,8,Arsenal,0,0.0,3.0,...,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0
1,Fall,0,12/09/2020,2020/21,Crystal Palace,6,Southampton,21,1.0,0.0,...,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0
2,Fall,0,12/09/2020,2020/21,Liverpool,13,Leeds,11,4.0,3.0,...,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0
3,Fall,0,12/09/2020,2020/21,West Ham,28,Newcastle,17,0.0,2.0,...,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0
4,Fall,0,13/09/2020,2020/21,West Brom,27,Leicester,12,0.0,3.0,...,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,Spring,2,24/05/15,2014/15,Everton,7,Tottenham,25,0.0,1.0,...,1.0,3.0,12.0,8.0,3.0,5.0,1.0,2.0,0.0,0.0
2565,Spring,2,24/05/15,2014/15,Hull,10,Man United,15,0.0,0.0,...,6.0,1.0,12.0,15.0,8.0,1.0,2.0,2.0,0.0,1.0
2566,Spring,2,24/05/15,2014/15,Leicester,12,QPR,19,5.0,1.0,...,7.0,2.0,7.0,6.0,5.0,6.0,0.0,0.0,0.0,0.0
2567,Spring,2,24/05/15,2014/15,Man City,14,Southampton,21,2.0,0.0,...,6.0,4.0,13.0,8.0,8.0,4.0,1.0,1.0,0.0,0.0


In [5]:
time_df

# Had this column get created upon loading, just dropped it
time_df = time_df.drop(columns=["Unnamed: 0"]) 
time_df

Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,Time,Time Encoding,HomeTeam,HomeTeam Encoding,AwayTeam,AwayTeam Encoding,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,Fall,0,12/09/2020,2020/21,Afternoon,0,Fulham,8,Arsenal,0,...,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0
1,Fall,0,12/09/2020,2020/21,Late-Day,2,Crystal Palace,6,Southampton,21,...,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0
2,Fall,0,12/09/2020,2020/21,Late-Day,2,Liverpool,13,Leeds,11,...,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0
3,Fall,0,12/09/2020,2020/21,Late-Day,2,West Ham,28,Newcastle,17,...,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0
4,Fall,0,13/09/2020,2020/21,Mid-Day,1,West Brom,27,Leicester,12,...,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,Summer,3,26/07/2020,2019/20,Mid-Day,1,Leicester,12,Man United,15,...,3.0,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0
666,Summer,3,26/07/2020,2019/20,Mid-Day,1,Man City,14,Norwich,18,...,10.0,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0
667,Summer,3,26/07/2020,2019/20,Mid-Day,1,Newcastle,17,Liverpool,13,...,2.0,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0
668,Summer,3,26/07/2020,2019/20,Mid-Day,1,Southampton,21,Sheffield United,20,...,4.0,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0


# Normal Dataset (Time Isn't Included)

### Split the Data

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2569 entries, 0 to 2568
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Season                 2569 non-null   object 
 1   Season Encoding        2569 non-null   int64  
 2   Date                   2569 non-null   object 
 3   YearOfSeason           2414 non-null   object 
 4   HomeTeam               2569 non-null   object 
 5   HomeTeam Encoding      2569 non-null   int64  
 6   AwayTeam               2569 non-null   object 
 7   AwayTeam Encoding      2569 non-null   int64  
 8   FTHG                   2569 non-null   float64
 9   FTAG                   2569 non-null   float64
 10  FTR                    2569 non-null   object 
 11  FTR Encoding           2569 non-null   int64  
 12  HTHG                   2569 non-null   float64
 13  HTAG                   2569 non-null   float64
 14  HTR                    2569 non-null   object 
 15  HTR 

In [7]:
# Features -- Drops FTR and any categorical value 
X = df.drop(columns=["Season", "YearOfSeason", "Date", "FTHG","FTAG" ,"HomeTeam", "AwayTeam", "Referee", "FTR", "FTR Encoding", "HTR", "Referee"])
# Labels
y = df["FTR Encoding"]

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2569 entries, 0 to 2568
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Season Encoding        2569 non-null   int64  
 1   HomeTeam Encoding      2569 non-null   int64  
 2   AwayTeam Encoding      2569 non-null   int64  
 3   HTHG                   2569 non-null   float64
 4   HTAG                   2569 non-null   float64
 5   HTR Encoding           2569 non-null   int64  
 6   Referee Encoding       2569 non-null   int64  
 7   Fouls Called Per Game  2569 non-null   float64
 8   HS                     2569 non-null   float64
 9   AS                     2569 non-null   float64
 10  HST                    2569 non-null   float64
 11  AST                    2569 non-null   float64
 12  HF                     2569 non-null   float64
 13  AF                     2569 non-null   float64
 14  HC                     2569 non-null   float64
 15  AC  

In [9]:
X

Unnamed: 0,Season Encoding,HomeTeam Encoding,AwayTeam Encoding,HTHG,HTAG,HTR Encoding,Referee Encoding,Fouls Called Per Game,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,0,8,0,0.0,1.0,1,7,14.0,5.0,13.0,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0
1,0,6,21,1.0,0.0,0,27,14.0,5.0,9.0,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0
2,0,13,11,3.0,2.0,0,28,14.0,22.0,6.0,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0
3,0,28,17,0.0,0.0,2,11,13.0,15.0,15.0,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0
4,0,27,12,0.0,0.0,2,10,15.0,7.0,13.0,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,2,7,25,0.0,1.0,1,27,14.0,9.0,16.0,1.0,3.0,12.0,8.0,3.0,5.0,1.0,2.0,0.0,0.0
2565,2,10,15,0.0,0.0,2,15,13.0,16.0,7.0,6.0,1.0,12.0,15.0,8.0,1.0,2.0,2.0,0.0,1.0
2566,2,12,19,2.0,0.0,0,28,14.0,22.0,18.0,7.0,2.0,7.0,6.0,5.0,6.0,0.0,0.0,0.0,0.0
2567,2,14,21,1.0,0.0,0,2,14.0,15.0,13.0,6.0,4.0,13.0,8.0,8.0,4.0,1.0,1.0,0.0,0.0


In [10]:
y

0       1
1       0
2       0
3       1
4       1
       ..
2564    1
2565    2
2566    0
2567    0
2568    0
Name: FTR Encoding, Length: 2569, dtype: int64

In [11]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

### Scale Features

In [12]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Classification

### Logistic Regression

Helpful links:

- https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

- https://medium.com/analytics-vidhya/l1-vs-l2-regularization-which-is-better-d01068e6658c

In [13]:
log_clf = LogisticRegression(random_state=0, penalty='l2', solver='sag', multi_class='ovr')
log_clf = log_clf.fit(X_train, y_train)

score = log_clf.score(X_test, y_test)

print(score)

0.6595330739299611


Compares the real result to the predicted result

In [14]:
print(y_test.values[1])
print(log_clf.predict([X_test[1]]))

2
[0]


Takes a look at all of the probabilities for the three classes

In [15]:
print(log_clf.predict_proba([X_test[1]]))
print(log_clf.classes_)

[[0.46778687 0.13272404 0.39948909]]
[0 1 2]


###### Logistic Regression Evaluation 

In [16]:
# Confusion Matrix

cv_score = cross_val_score(log_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(log_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.67591241 0.6729927  0.67737226] 

[[781  75  69]
 [ 82 493  70]
 [214 157 114]]


34 mislabeled instances in total.

    14 instances that were labeled as a Draw when they were a win.

    20 instances were labeled as a draw when they were a loss.

In [17]:
# Precision, Recall, F1

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.73      0.84      0.78       925
           1       0.68      0.76      0.72       645
           2       0.45      0.24      0.31       485

    accuracy                           0.68      2055
   macro avg       0.62      0.61      0.60      2055
weighted avg       0.65      0.68      0.65      2055



### Decision Tree

Helpful Links:
- https://scikit-learn.org/stable/modules/tree.html

- https://scikit-learn.org/stable/modules/tree.html#classification

In [18]:
tree_clf = DecisionTreeClassifier(max_depth=5)
tree_clf.fit(X_train,y_train)

score = tree_clf.score(X_test, y_test)

print(score)

0.5953307392996109


In [19]:
print(y_test.values[1])
print(tree_clf.predict([X_test[1]]))

2
[0]


In [20]:
print(tree_clf.predict_proba([X_test[1]]))
print(tree_clf.classes_)

[[0.39726027 0.21369863 0.3890411 ]]
[0 1 2]


###### Decision Tree Evaluation 

In [21]:
# Confusion Matrix

cv_score = cross_val_score(tree_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(tree_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.62773723 0.64233577 0.64233577] 

[[796  72  57]
 [145 450  50]
 [278 141  66]]


6 mislabeled instances in total.

    1 instance was labeled as a win when it was a win.

    20 instances were labeled as a loss when they were a draw.

In [22]:
# Precision, Recall, F1

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.65      0.86      0.74       925
           1       0.68      0.70      0.69       645
           2       0.38      0.14      0.20       485

    accuracy                           0.64      2055
   macro avg       0.57      0.56      0.54      2055
weighted avg       0.60      0.64      0.60      2055



### Random Forest

Helpful Links:
- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier

- https://scikit-learn.org/stable/modules/ensemble.html

In [23]:
forest_clf = RandomForestClassifier(n_estimators=10, random_state=42, max_depth=5)

forest_clf = forest_clf.fit(X_train,y_train)

score = forest_clf.score(X_test, y_test)

print(score)

0.632295719844358


In [24]:
print(y_test.values[1])
print(forest_clf.predict([X_test[1]]))

2
[0]


In [25]:
print(forest_clf.predict_proba([X_test[1]]))
print(forest_clf.classes_)

[[0.51739847 0.18004875 0.30255278]]
[0 1 2]


###### Random Forest Evaluation 

In [26]:
# Confusion Matrix

cv_score = cross_val_score(forest_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(forest_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.66277372 0.65255474 0.64525547] 

[[777 100  48]
 [115 496  34]
 [236 179  70]]


274 mislabeled instances in total.

    12 instance was labeled as a win when it was a loss.

    6 instance was labeled as a win when it was a draw.
    
    20 instances were labeled as a loss when they were a win.
    
    26 instances were labeled as a loss when they were a draw.
    
    158 instances were labeled as a draw when they were a win.
    
    52 instances were labeled as a draw when they were a loss.

In [27]:
# Precision, Recall, F1

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.69      0.84      0.76       925
           1       0.64      0.77      0.70       645
           2       0.46      0.14      0.22       485

    accuracy                           0.65      2055
   macro avg       0.60      0.58      0.56      2055
weighted avg       0.62      0.65      0.61      2055



### K Nearest Neighbors

Helpful Links:
- https://scikit-learn.org/stable/modules/neighbors.html

- https://scikit-learn.org/stable/modules/neighbors.html#nearest-neighbors-classification

In [28]:
knn_clf = KNeighborsClassifier(n_neighbors=50)

knn_clf.fit(X_train,y_train)

score = knn_clf.score(X_test, y_test)

print(score)

0.632295719844358


In [29]:
print(y_test.values[1])
print(knn_clf.predict([X_test[1]]))

2
[2]


In [30]:
print(knn_clf.predict_proba([X_test[1]]))
print(knn_clf.classes_)

[[0.34 0.24 0.42]]
[0 1 2]


###### K Nearest Neighbors Evaluation 

In [31]:
# Confusion Matrix

cv_score = cross_val_score(knn_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(knn_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.66131387 0.62773723 0.6540146 ] 

[[764  98  63]
 [116 489  40]
 [229 178  78]]


515 mislabeled instances in total.

    22 instance was labeled as a win when it was a loss.

    66 instance was labeled as a win when it was a draw.
    
    39 instances were labeled as a loss when they were a win.
    
    55 instances were labeled as a loss when they were a draw.
    
    185 instances were labeled as a draw when they were a win.
    
    148 instances were labeled as a draw when they were a loss.

In [32]:
# Precision, Recall, F1

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.69      0.83      0.75       925
           1       0.64      0.76      0.69       645
           2       0.43      0.16      0.23       485

    accuracy                           0.65      2055
   macro avg       0.59      0.58      0.56      2055
weighted avg       0.61      0.65      0.61      2055



### Stochastic Gradient Descent

Helpful Links:
- https://scikit-learn.org/stable/modules/sgd.html

- https://scikit-learn.org/stable/modules/sgd.html#classification

In [33]:
sgd_clf = SGDClassifier(random_state=42)

sgd_clf.fit(X_train, y_train)

score = sgd_clf.score(X_test, y_test)

print(score)

0.6634241245136187


In [34]:
print(y_test.values[1])
print(sgd_clf.predict([X_test[1]]))

2
[2]


In [35]:
print(sgd_clf.predict([X_test[1]]))
print(sgd_clf.classes_)

[2]
[0 1 2]


###### Stochastic Gradient Descent Evaluation 

In [36]:
# Confusion Matrix

cv_score = cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.65109489 0.61313869 0.66277372] 

[[725  87 113]
 [ 91 460  94]
 [194 156 135]]


25 mislabeled instances in total.

    3 instance was labeled as a win when it was a draw.
    
    3 instances were labeled as a loss when they were a draw.
    
    19 instances were labeled as a draw when they were a win.

In [37]:
# Precision, Recall, F1

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.72      0.78      0.75       925
           1       0.65      0.71      0.68       645
           2       0.39      0.28      0.33       485

    accuracy                           0.64      2055
   macro avg       0.59      0.59      0.59      2055
weighted avg       0.62      0.64      0.63      2055



### Naive Bayes

Helpful Links:
- https://scikit-learn.org/stable/modules/naive_bayes.html

- https://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes

In [38]:
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

score = nb_clf.score(X_test, y_test)

print(score)

0.6342412451361867


In [39]:
print(y_test.values[1])
print(nb_clf.predict([X_test[1]]))

2
[2]


In [40]:
print(nb_clf.predict([X_test[1]]))
print(nb_clf.classes_)

[2]
[0 1 2]


##### Naive Bayes Evaluation

In [41]:
# Confusion Matrix

cv_score = cross_val_score(nb_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(nb_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

[0.64671533 0.62627737 0.64525547] 

[[677  84 164]
 [ 61 455 129]
 [152 151 182]]


551 mislabeled instances in total.

    22 instance was labeled as a win when they were a loss.
    
    183 instance was labeled as a win when they were a loss.
    
    10 instances were labeled as a loss when they were a win.
    
    160 instances were labeled as a loss when they were a draw.
    
    95 instances were labeled as a draw when they were a win.
    
    81 instances were labeled as a draw when they were a loss.

### Neural Network

Helpful Links:
- https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier

- https://scikit-learn.org/stable/modules/neural_networks_supervised.html#classification

In [42]:
nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(5, 2), random_state=1)

nn_clf.fit(X_train, y_train)

score = nn_clf.score(X_test, y_test)

print(score)

0.6206225680933852


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [43]:
print(y_test.values[1])
print(nn_clf.predict([X_test[1]]))

2
[0]


In [44]:
print(nn_clf.predict([X_test[1]]))
print(nn_clf.classes_)

[0]
[0 1 2]


##### Neural Network Evaluation

In [45]:
# Confusion Matrix

cv_score = cross_val_score(nn_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(nn_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[0.61751825 0.61167883 0.65839416] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[[728  34 163]
 [ 83 391 171]
 [215  96 174]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


10 mislabeled instances in total.
    
    10 instances were labeled as a draw when they were a loss.

### Ensemble Method

Constructed Using:
- Logistics Regression 

- Decision Tree 

- Stochastic Gradient Descent 

- Neural Network

(Use the models made above, did NOT make new models for this)

Helpful Links:
- https://scikit-learn.org/stable/modules/ensemble.html#voting-classifier

- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html#sklearn.ensemble.VotingClassifier

In [46]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('dt', tree_clf), ('sgd', sgd_clf), ('nn', nn_clf)],
    voting='hard')

voting_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


VotingClassifier(estimators=[('lr',
                              LogisticRegression(multi_class='ovr',
                                                 random_state=0,
                                                 solver='sag')),
                             ('dt', DecisionTreeClassifier(max_depth=5)),
                             ('sgd', SGDClassifier(random_state=42)),
                             ('nn',
                              MLPClassifier(alpha=1e-05,
                                            hidden_layer_sizes=(5, 2),
                                            random_state=1, solver='lbfgs'))])

In [47]:
for clf in (log_clf, tree_clf, sgd_clf, nn_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.6595330739299611
DecisionTreeClassifier 0.5972762645914397
SGDClassifier 0.6634241245136187


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier 0.6206225680933852
VotingClassifier 0.6556420233463035


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


##### Ensemble Model Evaluation

In [48]:
# Confusion Matrix

cv_score = cross_val_score(voting_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_score, '\n')

y_train_pred = cross_val_predict(voting_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[0.66715328 0.66423358 0.68029197] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[[800  67  58]
 [103 487  55]
 [242 152  91]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0 mislabeled instances

# Regression

### Linear Regression

Helpful Links:
- https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression

- https://scikit-learn.org/stable/modules/linear_model.html#ordinary-least-squares

In [49]:
lin_reg = LinearRegression()
lin_reg = lin_reg.fit(X_train, y_train)

In [50]:
# rounds to the nearest whole number by casting it to an integer value

#for index, values in enumerate(X_test):
#    for i, value in enumerate(values):
#        X_test[index][i] = int(value)
#        if value < 0:
#            X_test[index][i] = 0
            
#X_test

In [51]:
score = lin_reg.score(X_test, y_test)

print(score)

0.20809507682571227


In [52]:
print(y_test.values[1])
print(int(lin_reg.predict([X_test[1]])))

2
0


###### Linear Regression Evaluation 

In [53]:
print(type(X_test[1]))
print(X_test[1])

<class 'numpy.ndarray'>
[-0.2026974  -1.56221251 -0.8613392  -0.78555632 -0.69789433  1.05009218
  0.0838971  -1.50954332  0.02546527 -1.07906974 -0.65560733 -1.75526388
  0.1220527   0.25237461 -0.20310757 -0.98521273 -0.41157592 -0.59517863
 -0.22610782 -0.27031258]


In [54]:
results = lin_reg.predict(X_test)

for index, value in enumerate(results):
    results[index] = int(value)
print(results)

[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0.
 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1.
 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0.
 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0.
 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0.
 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1.
 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.

In [55]:
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, results))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.5f'
      % r2_score(y_test, results))

Mean squared error: 0.87
Coefficient of determination: -0.33257


In [56]:
for value in results:
    value = int(value)

results_df = pd.DataFrame({'Actual': y_test, 'Predicted': results})
results_df

Unnamed: 0,Actual,Predicted
672,1,0.0
2380,2,0.0
1183,2,0.0
1602,0,0.0
1055,2,1.0
...,...,...
361,0,0.0
2364,1,0.0
692,2,1.0
1523,1,1.0


### Stochastic Gradient Descent

Helpful Link
- https://scikit-learn.org/stable/modules/sgd.html#regression

In [57]:
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3)
sgd_reg.fit(X_train,y_train)

score = sgd_reg.score(X_test, y_test)

print(score)

0.2055019111876829


In [58]:
print(y_test.values[1])
print(int(sgd_reg.predict([X_test[1]])))

2
0


##### Stochastic Gradient Descent Evaluation

In [59]:
results = sgd_reg.predict(X_test)

for index, value in enumerate(results):
    results[index] = int(value)
print(results)

[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0.
 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1.
 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0.
 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0.
 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0.
 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1.
 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.

In [60]:
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, results))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.5f'
      % r2_score(y_test, results))

Mean squared error: 0.89
Coefficient of determination: -0.36536


In [61]:
for value in results:
    value = int(value)

results_df = pd.DataFrame({'Actual': y_test, 'Predicted': results})
results_df

Unnamed: 0,Actual,Predicted
672,1,0.0
2380,2,0.0
1183,2,0.0
1602,0,0.0
1055,2,1.0
...,...,...
361,0,0.0
2364,1,0.0
692,2,1.0
1523,1,1.0


### Support Vector Machine

Helpful Links:
- https://scikit-learn.org/stable/modules/svm.html

- https://scikit-learn.org/stable/modules/svm.html#regression

In [62]:
svm_reg = svm.SVR()
svm_reg.fit(X_train, y_train)

score = svm_reg.score(X_test, y_test)

print(score)

0.10102139146839095


In [63]:
print(y_test.values[1])
print(int(svm_reg.predict([X_test[1]])))

2
0


##### Support Vector Machine Evaluation

In [64]:
results = svm_reg.predict(X_test)

for index, value in enumerate(results):
    results[index] = int(value)
print(results)

[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1.
 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1.
 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1.
 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0.
 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.
 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.

In [65]:
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, results))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.5f'
      % r2_score(y_test, results))

Mean squared error: 0.96
Coefficient of determination: -0.47567


In [66]:
for value in results:
    value = int(value)

results_df = pd.DataFrame({'Actual': y_test, 'Predicted': results})
results_df

Unnamed: 0,Actual,Predicted
672,1,0.0
2380,2,0.0
1183,2,0.0
1602,0,0.0
1055,2,1.0
...,...,...
361,0,1.0
2364,1,1.0
692,2,0.0
1523,1,1.0
