# Libraries used:

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from time import time
from sklearn.metrics import f1_score
from os import path, makedirs, walk
from joblib import dump, load
import json

In [24]:
def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the training data and print how long fitting took.

    Args:
        clf: Estimator exposing ``fit(X, y)``; it is fitted in place.
        X_train: Training features, handed to ``clf.fit`` unchanged.
        y_train: Training targets, handed to ``clf.fit`` unchanged.

    Returns:
        None. The only output is the timing line written to stdout.
    """
    # ``:2f`` is a minimum field width of 2 with the default six decimals,
    # so the elapsed time is printed as e.g. ``1.008652``.
    message = "Model trained in {:2f} seconds"

    started_at = time()
    clf.fit(X_train, y_train)
    elapsed = time() - started_at

    print(message.format(elapsed))

In [25]:
def predict_labels(clf, features, target):
    """Predict with ``clf`` and score the predictions against ``target``.

    Args:
        clf: Fitted estimator exposing ``predict(X)``.
        features: Feature rows to predict on.
        target: True labels, compared element-wise with the predictions.

    Returns:
        A ``(f1, accuracy)`` tuple: the micro-averaged F1 score and the
        fraction of predictions equal to ``target``.
    """
    started_at = time()
    y_pred = clf.predict(features)
    elapsed = time() - started_at
    print("Made Predictions in {:2f} seconds".format(elapsed))

    # Accuracy by hand: number of matching labels over number of predictions.
    n_correct = sum(target == y_pred)
    accuracy = n_correct / float(len(y_pred))

    # NOTE(review): for single-label multiclass targets micro-averaged F1
    # should coincide with accuracy - confirm both numbers are wanted.
    micro_f1 = f1_score(target, y_pred, average='micro')
    return micro_f1, accuracy

In [26]:
def model(clf, X_train, y_train, X_test, y_test):
    """Train ``clf``, then print F1 and accuracy for the train and test sets.

    Args:
        clf: Estimator to fit and evaluate.
        X_train: Training features.
        y_train: Training targets.
        X_test: Held-out features.
        y_test: Held-out targets.

    Returns:
        None. All results are written to stdout.
    """
    train_classifier(clf, X_train, y_train)

    reports = (
        ("Training Info:", X_train, y_train),
        ("Test Metrics:", X_test, y_test),
    )
    for heading, features, target in reports:
        # Score first: predict_labels prints its own timing line, which
        # therefore appears above the heading of each report.
        f1, acc = predict_labels(clf, features, target)
        print(heading)
        print("-" * 20)
        print("F1 Score:{}".format(f1))
        print("Accuracy:{}".format(acc))

In [27]:
def derive_clean_sheet(src):
    """Derive per-match clean-sheet flags from a two-column goals frame.

    Args:
        src: DataFrame whose first column holds the home side's goals and
            whose second column holds the away side's goals (``[HTHG, HTAG]``
            in this notebook).

    Returns:
        A list with one ``[home_clean_sheet, away_clean_sheet]`` pair per
        row. A flag is 1 when the *opposing* side's goal count equals 0 and
        0 otherwise (missing values never compare equal to 0, so they
        yield 0).
    """
    # Walk the underlying array once instead of building a Series per row
    # through ``src.iloc[i]``.
    return [
        [int(goals[1] == 0), int(goals[0] == 0)]
        for goals in src.values
    ]

In [28]:
# Directory name of each league's data set.
en_data_folder = "english"  # England
es_data_folder = "spanish"  # Spain
fr_data_folder = "french"   # France
ge_data_folder = "german"   # Germany
it_data_folder = "italian"  # Italy

In [29]:
# Leagues to load. Shrink this list (for example to [es_data_folder]) to
# experiment with a single league.
data_folders = [
    en_data_folder,
    es_data_folder,
    fr_data_folder,
    ge_data_folder,
    it_data_folder,
]

In [30]:
# Inclusive range of two-digit season start years: 9 -> "season-0910",
# 18 -> "season-1819".
season_range = (9, 18)

first_season, last_season = season_range

# One candidate CSV path per (league, season), leagues in list order.
data_files = [
    'data/{}/data/season-{:02d}{:02d}_csv.csv'.format(
        league_folder, start_year, start_year + 1)
    for league_folder in data_folders
    for start_year in range(first_season, last_season + 1)
]

In [31]:
# Load every season file that is present on disk; files that are missing
# are skipped rather than treated as an error.
data_frames = [
    pd.read_csv(data_file)
    for data_file in data_files
    if path.exists(data_file)
]

if not data_frames:
    # pd.concat rejects an empty list with a message that does not say
    # which inputs were missing, so fail here with the actual cause.
    raise FileNotFoundError(
        'None of the {} expected season CSV files exist'.format(len(data_files)))

# reset_index() renumbers the combined rows 0..n-1 and keeps each file's own
# row numbers in an extra "index" column.
data = pd.concat(data_frames).reset_index()

## home_encoded = code assigned to the team playing at home
## away_encoded = code assigned to the team playing away
# HTHG = first-half goals scored by the home team
# HTAG = first-half goals scored by the away team
# HS = shots by the home team
# AS = shots by the away team
# HST = shots on target by the home team
# AST = shots on target by the away team
# HR = red cards for the home team
# AR = red cards for the away team
# FTR = overall (full-time) result

In [33]:

# Model inputs: encoded team ids followed by home/away pairs of match stats.
input_filter = [
    'home_encoded', 'away_encoded',
    'HTHG', 'HTAG',
    'HS', 'AS',
    'HST', 'AST',
    'HR', 'AR',
]

# Prediction target.
output_filter = ['FTR']

# Every column the pipeline keeps: inputs first, then the target.
cols_to_consider = [*input_filter, *output_filter]

In [34]:
# Home and away team names are label-encoded independently, so the same club
# may receive a different code in each column. Both name -> code tables are
# kept so the codes can be reproduced later.

# Home side.
encoder = LabelEncoder()
home_encoded = encoder.fit_transform(data['HomeTeam'])
data['home_encoded'] = home_encoded
home_team_codes = encoder.transform(encoder.classes_).tolist()
home_encoded_mapping = {
    team: code for team, code in zip(encoder.classes_, home_team_codes)
}

# Away side, with a fresh encoder.
encoder = LabelEncoder()
away_encoded = encoder.fit_transform(data['AwayTeam'])
data['away_encoded'] = away_encoded
away_team_codes = encoder.transform(encoder.classes_).tolist()
away_encoded_mapping = {
    team: code for team, code in zip(encoder.classes_, away_team_codes)
}

In [35]:
# Keep only the model inputs and the target column.
data = data[cols_to_consider]

# Show the incomplete rows for inspection, then discard them.
has_missing = data.isna().any(axis=1)
print(data[has_missing])
data = data.dropna(axis=0)

       home_encoded  away_encoded  HTHG  HTAG    HS    AS  HST  AST   HR   AR  \
10585            16            95   NaN   NaN   NaN   NaN  NaN  NaN  NaN  NaN   
15254            35           129   NaN   NaN   NaN   NaN  NaN  NaN  NaN  NaN   
16757           132           121   NaN   NaN  13.0  15.0  3.0  5.0  0.0  0.0   

      FTR  
10585   A  
15254   A  
16757   A  


In [36]:
# One shared seed so that the train/test split and every randomized
# estimator give the same results on each run.
RANDOM_STATE = 100

X = data[input_filter]
Y = data['FTR']

# Hold out 20% of the matches for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=RANDOM_STATE)

svc_classifier = SVC(random_state=RANDOM_STATE, kernel='rbf')
# NOTE(review): recent scikit-learn releases deprecate ``multi_class``;
# confirm against the installed version before upgrading.
lr_classifier = LogisticRegression(multi_class='ovr', max_iter=500)
nbClassifier = GaussianNB()
dtClassifier = DecisionTreeClassifier(random_state=RANDOM_STATE)
rfClassifier = RandomForestClassifier(random_state=RANDOM_STATE)

In [37]:
# Blank line, heading and a 20-dash rule, then train and evaluate the model.
print("", "Logistic Regression one vs All Classifier", "-" * 20, sep="\n")
model(lr_classifier, X_train, Y_train, X_test, Y_test)


Logistic Regression one vs All Classifier
--------------------
Model trained in 1.008652 seconds
Made Predictions in 0.003000 seconds
Training Info:
--------------------
F1 Score:0.6577861687993847
Accuracy:0.6577861687993847
Made Predictions in 0.003000 seconds
Test Metrics:
--------------------
F1 Score:0.6708612975391499
Accuracy:0.6708612975391499


In [38]:
# Heading for the report, then train and evaluate the naive Bayes model.
print()
print("Gaussian Naive Bayes Classifier")
print("-" * 20)
model(nbClassifier, X_train, Y_train, X_test, Y_test)


Gaussain Naive Bayes Classifier
--------------------
Model trained in 0.019043 seconds
Made Predictions in 0.010999 seconds
Training Info:
--------------------
F1 Score:0.6262499125935249
Accuracy:0.6262499125935249
Made Predictions in 0.001960 seconds
Test Metrics:
--------------------
F1 Score:0.6451342281879194
Accuracy:0.6451342281879194


In [39]:
# Blank line, heading and a 20-dash rule, then train and evaluate the model.
print("", "Random Forest Classifier", "-" * 20, sep="\n")
model(rfClassifier, X_train, Y_train, X_test, Y_test)


Random Forest Classifier
--------------------
Model trained in 1.361227 seconds
Made Predictions in 0.251048 seconds
Training Info:
--------------------
F1 Score:0.9998601496398853
Accuracy:0.9998601496398853
Made Predictions in 0.075000 seconds
Test Metrics:
--------------------
F1 Score:0.6552013422818792
Accuracy:0.6552013422818792


In [40]:
# Optionally persist the fitted models together with the team-name -> code
# tables needed to encode inputs for them.
shouldExport = input('Do you want to export the model(s) (y / n) ? ')
if shouldExport.strip().lower() == 'y':
    exportedModelsPath = 'exportedModels'

    makedirs(exportedModelsPath, exist_ok=True)

    # NOTE: joblib files are pickle-based; only load them from trusted sources.
    dump(lr_classifier, f'{exportedModelsPath}/lr_classifier.model')
    dump(nbClassifier, f'{exportedModelsPath}/nb_classifier.model')
    dump(rfClassifier, f'{exportedModelsPath}/rf_classifier.model')

    exportMetaData = {
        'home_teams': home_encoded_mapping,
        'away_teams': away_encoded_mapping,
    }

    # The context manager closes (and therefore flushes) the file before the
    # success message is printed.
    with open(f'{exportedModelsPath}/metaData.json', 'w') as exportMetaDataFile:
        json.dump(exportMetaData, exportMetaDataFile)

    print(f'Model(s) exported successfully to {exportedModelsPath}/')

Do you want to export the model(s) (y / n) ? y
Model(s) exported successfully to exportedModels/


In [41]:
# Sanity check on one hand-written match repeated three times, once per
# possible label. The rows are identical, so at most one label can match the
# prediction. model() refits rfClassifier on the training split first.
ytest = ['A', 'H', 'D']
sample_match = [18.0, 164.0, 4.0, 5.0, 12.0, 12.0, 4.0, 5.0, 2.0, 2.0]
xtest = pd.DataFrame(
    [sample_match] * len(ytest),
    columns=['home_encoded', 'away_encoded', 'HTHG', 'HTAG', 'HS',
             'AS', 'HST', 'AST', 'HR', 'AR'],
)
model(rfClassifier, X_train, Y_train, xtest, ytest)

Model trained in 1.394069 seconds
Made Predictions in 0.260047 seconds
Training Info:
--------------------
F1 Score:0.9999300748199427
Accuracy:0.9999300748199427
Made Predictions in 0.010998 seconds
Test Metrics:
--------------------
F1 Score:0.3333333333333333
Accuracy:0.3333333333333333


In [46]:
# Class probabilities for one hand-written match. The forest was fitted on a
# DataFrame, so the sample is wrapped in one with the same column names;
# newer scikit-learn versions warn when given a bare list instead.
rfClassifier.predict_proba(
    pd.DataFrame([[8, 164, 4, 5, 12, 12, 4, 5, 2, 2]], columns=input_filter))

array([[0.4 , 0.27, 0.33]])

In [45]:
# Class probabilities for one hand-written match. The model was fitted on a
# DataFrame, so the sample is wrapped in one with the same column names;
# newer scikit-learn versions warn when given a bare list instead.
lr_classifier.predict_proba(
    pd.DataFrame([[8, 164, 4, 5, 12, 12, 4, 5, 2, 2]], columns=input_filter))

array([[0.84482601, 0.06563271, 0.08954128],
       [0.84997353, 0.05909177, 0.0909347 ]])