Exploração dos dados

In [37]:
import pandas as pd
import numpy as np

# Load the match dataset from the CSV file that sits next to the notebook.
data = pd.read_csv('./matches_brasileirao_serie_a_2022.csv')

# `shape` is (rows, columns): rows are the instances, columns the attributes.
instances, attributes = data.shape
print(f'Número de instâncias: {instances}')
print(f'Número de atributos: {attributes}')

# Count the missing values in every column.
print(data.isna().sum())


Número de instâncias: 380
Número de atributos: 26
match_id                    0
stage                       0
date                        0
team_name_home              0
team_name_away              0
team_home_score             0
team_away_score             0
possession_home             0
possession_away             0
total_shots_home            0
total_shots_away            0
shots_on_target_home        0
shots_on_target_away        0
duels_won_home              0
duels_won_away              0
prediction_team_home_win    0
prediction_draw             0
prediction_team_away_win    0
prediction_quantity         0
location                    0
lineup_home                 0
lineup_away                 0
player_names_home           0
player_numbers_home         0
player_names_away           0
player_numbers_away         0
dtype: int64


Criando variáveis

In [38]:
# Keep only the team names, the final score and the in-match statistics;
# every column listed here is removed from the working frame.
dropped_columns = [
    'stage', 'match_id', 'date', 'location',
    'lineup_home', 'lineup_away',
    'player_names_home', 'player_numbers_home',
    'player_names_away', 'player_numbers_away',
    'prediction_team_home_win', 'prediction_draw',
    'prediction_team_away_win', 'prediction_quantity',
]
parsed_data = data.drop(columns=dropped_columns)

# 1 when the home side scored strictly more goals than the away side, else 0.
parsed_data['home_wins'] = np.where(
    parsed_data['team_home_score'] > parsed_data['team_away_score'], 1, 0)

# 1 when both sides scored at least once, else 0.
# Each comparison needs its own parentheses: `&` binds tighter than `!=`,
# so an unparenthesised `a & b != 0` would be parsed as `(a & b) != 0`.
parsed_data['both_scores_nonzero'] = np.where(
    (parsed_data['team_home_score'] != 0) & (parsed_data['team_away_score'] != 0), 1, 0)

parsed_data

Unnamed: 0,team_name_home,team_name_away,team_home_score,team_away_score,possession_home,possession_away,total_shots_home,total_shots_away,shots_on_target_home,shots_on_target_away,duels_won_home,duels_won_away,home_wins,both_scores_nonzero
0,América MG,Atlético GO,1,1,0.60,0.40,36,4,9,3,0.48,0.52,0,1
1,RB Bragantino,Fluminense,0,1,0.41,0.59,13,8,4,4,0.54,0.46,0,0
2,Internacional,Palmeiras,3,0,0.48,0.52,15,11,8,2,0.58,0.42,1,0
3,Goiás,São Paulo,0,4,0.35,0.65,6,13,2,7,0.62,0.38,0,0
4,Cuiabá,Coritiba,2,1,0.57,0.43,17,4,8,2,0.51,0.49,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,Atl. Mineiro,Internacional,2,0,0.52,0.48,15,6,5,1,0.51,0.49,1,0
376,Coritiba,Goiás,3,0,0.52,0.48,17,7,9,0,0.48,0.52,1,0
377,Palmeiras,Ceará,2,3,0.56,0.44,17,19,4,9,0.42,0.59,0,1
378,Atlético GO,Flamengo,1,1,0.44,0.56,10,12,3,3,0.52,0.48,0,1


Dividindo os dados em conjuntos de treinamento e de teste (80% / 20%)

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
train_data, test_data = train_test_split(
    parsed_data,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up in each partition.
print("Tamanho do conjunto de treinamento:", train_data.shape[0])
print("Tamanho do conjunto de teste:", test_data.shape[0])

Tamanho do conjunto de treinamento: 304
Tamanho do conjunto de teste: 76


Definindo o algoritmo de treinamento

In [87]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Input columns (features) and the column to predict (target).
features = [
    'possession_home', 'possession_away',
    'total_shots_home', 'total_shots_away',
    'shots_on_target_home', 'shots_on_target_away',
    'duels_won_home', 'duels_won_away',
]
target = 'both_scores_nonzero'

# Feature matrix / label vector for the training and the test partitions.
X_train = train_data[features]
y_train = train_data[target]
X_test = test_data[features]
y_test = test_data[target]

# Seed handed to the estimators that draw random numbers while fitting
# (decision tree, random forest, gradient boosting). Without it the
# printed accuracies change on every run of this cell.
MODEL_SEED = 42


def fit_and_report(estimator, label):
    """Fit `estimator` on the training split and print its test accuracy.

    Parameters
    ----------
    estimator : object exposing `fit(X, y)` and `predict(X)`
        The unfitted classifier; it is fitted in place.
    label : str
        Text printed in front of the accuracy value.

    Returns
    -------
    tuple
        `(predictions, accuracy)` where `predictions` is what
        `estimator.predict(X_test)` returned and `accuracy` is the value of
        `accuracy_score(y_test, predictions)`.
    """
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)
    score = accuracy_score(y_test, predicted)
    print(label, score)
    return predicted, score


# Decision tree (later cells reuse `model` for cross-validation and prediction).
model = DecisionTreeClassifier(random_state=MODEL_SEED)
y_pred, accuracy = fit_and_report(model, "Acurácia do modelo:")

# Random forest.
model_rf = RandomForestClassifier(random_state=MODEL_SEED)
y_pred_rf, accuracy_rf = fit_and_report(model_rf, "Acurácia do modelo Random Forest:")

# Logistic regression (later cells reuse `model_lr` for cross-validation).
model_lr = LogisticRegression()
y_pred_lr, accuracy_lr = fit_and_report(model_lr, "Acurácia do modelo Logistic Regression:")

# Support vector machine.
model_svm = SVC()
y_pred_svm, accuracy_svm = fit_and_report(model_svm, "Acurácia do modelo SVM:")

# Gaussian naive Bayes.
model_nb = GaussianNB()
y_pred_nb, accuracy_nb = fit_and_report(model_nb, "Acurácia do modelo Naive Bayes:")

# Gradient boosting.
model_gb = GradientBoostingClassifier(random_state=MODEL_SEED)
y_pred_gb, accuracy_gb = fit_and_report(model_gb, "Acurácia do modelo Gradient Boosting:")



Acurácia do modelo: 0.5868421052631579
Acurácia do modelo Random Forest: 0.5868421052631579
Acurácia do modelo Logistic Regression: 0.6421052631578947
Acurácia do modelo SVM: 0.6105263157894737
Acurácia do modelo Naive Bayes: 0.6157894736842106
Acurácia do modelo Gradient Boosting: 0.5921052631578947


Validando o modelo

In [86]:
from sklearn.model_selection import cross_val_score

# Estimators to cross-validate, each with the heading and the "mean" label
# printed for it. The first mean label carries a space before the colon;
# it is kept as-is so the printed output does not change.
cv_runs = (
    ("DecisionTree", model, "Acurácia média da validação cruzada :"),
    ("LogisticRegression", model_lr, "Acurácia média da validação cruzada:"),
)

for heading, estimator, mean_label in cv_runs:
    # 5-fold cross-validation on the training partition only.
    scores = cross_val_score(estimator, X_train, y_train, cv=5)

    # Mean and standard deviation of the per-fold scores.
    print(heading)
    print(mean_label, scores.mean())
    print("Desvio padrão da validação cruzada:", scores.std())



DecisionTree
Acurácia média da validação cruzada : 0.5692349726775956
Desvio padrão da validação cruzada: 0.03911423345002877
LogisticRegression
Acurácia média da validação cruzada: 0.6579781420765027
Desvio padrão da validação cruzada: 0.07394886348889503


In [67]:
# Rows to predict on. A copy is taken so that the bookkeeping columns added
# below are not written onto `test_data` itself; assigning columns onto the
# split directly may also trigger pandas' SettingWithCopyWarning.
new_data = test_data.copy()

# No extra preprocessing is applied here: the rows already contain the
# columns listed in `features`.
new_features = new_data[features]
predictions = model.predict(new_features)

print("Previsões:")

# Predicted label, and a 1/0 flag telling whether it matches the true label.
new_data['prediction_result'] = predictions
new_data['correct_prediction'] = np.where(
    new_data['prediction_result'] == new_data[target], 1, 0)

# Running count of correct predictions, row by row.
correct_prediction_count = new_data['correct_prediction'].cumsum()
new_data['correct_prediction_count'] = correct_prediction_count

# Total number of correct predictions. `sum()` equals the last value of the
# running count and also returns 0 for an empty frame, where `.iloc[-1]`
# would raise IndexError.
final_correct_count = new_data['correct_prediction'].sum()

final_correct_count

Previsões:


207

In [68]:
# Load the second match dataset (file name: matches_brasileirao_serie_b_2022.csv).
data2 = pd.read_csv('./matches_brasileirao_serie_b_2022.csv')

# Columns removed up front; unlike the first dataset's preparation, the
# team-name columns are dropped here too.
unused_columns = [
    'team_name_home', 'team_name_away',
    'stage', 'match_id', 'date', 'location',
    'lineup_home', 'lineup_away',
    'player_names_home', 'player_numbers_home',
    'player_names_away', 'player_numbers_away',
    'prediction_team_home_win', 'prediction_draw',
    'prediction_team_away_win', 'prediction_quantity',
]
parsed_data2 = data2.drop(columns=unused_columns)

# Derive the two 1/0 columns from the final score.
home_goals = parsed_data2['team_home_score']
away_goals = parsed_data2['team_away_score']
parsed_data2['home_wins'] = np.where(home_goals > away_goals, 1, 0)
parsed_data2['both_scores_nonzero'] = np.where((home_goals != 0) & (away_goals != 0), 1, 0)

# The raw scores are dropped once the derived columns exist, so they do not
# take part in the correlation below.
parsed_data2 = parsed_data2.drop(columns=['team_home_score', 'team_away_score'])

# Correlation of every remaining column with `both_scores_nonzero`,
# listed from the most positive to the most negative.
correlations = parsed_data2.corr()['both_scores_nonzero']
correlations_sorted = correlations.sort_values(ascending=False)
print(correlations_sorted)


both_scores_nonzero     1.000000
shots_on_target_away    0.296284
total_shots_away        0.095482
shots_on_target_home    0.084243
possession_away         0.068087
duels_won_home          0.008946
total_shots_home        0.000170
duels_won_away         -0.008904
possession_home        -0.068087
home_wins              -0.186371
Name: both_scores_nonzero, dtype: float64


In [69]:

# Run the already-fitted `model` on the feature columns of the second dataset.
second_inputs = parsed_data2[features]
predictions2 = model.predict(second_inputs)

# Fraction of rows whose predicted label equals the true label.
second_labels = parsed_data2[target]
accuracy = accuracy_score(second_labels, predictions2)
print("Acurácia do modelo:", accuracy)

Acurácia do modelo: 0.5447368421052632


In [70]:
print("Previsões:")

# Predicted label, and a 1/0 flag telling whether it matches the true label.
# The columns are added to `parsed_data2` itself because the frame is the
# output displayed by this cell.
parsed_data2['prediction_result'] = predictions2
parsed_data2['correct_prediction'] = np.where(
    parsed_data2['prediction_result'] == parsed_data2[target], 1, 0)

# Running count of correct predictions, row by row.
correct_prediction_count = parsed_data2['correct_prediction'].cumsum()
parsed_data2['correct_prediction_count'] = correct_prediction_count

# Total number of correct predictions. `sum()` equals the last value of the
# running count and also returns 0 for an empty frame, where `.iloc[-1]`
# would raise IndexError.
final_correct_count = parsed_data2['correct_prediction'].sum()

parsed_data2

Previsões:


Unnamed: 0,possession_home,possession_away,total_shots_home,total_shots_away,shots_on_target_home,shots_on_target_away,duels_won_home,duels_won_away,home_wins,both_scores_nonzero,prediction_result,correct_prediction,correct_prediction_count
0,0.66,0.34,24,16,9,4,0.56,0.44,1,1,0,0,0
1,0.60,0.40,7,8,3,4,0.49,0.52,0,0,0,1,1
2,0.64,0.36,22,11,6,6,0.48,0.52,0,0,1,0,1
3,0.46,0.54,15,8,3,0,0.55,0.45,0,0,0,1,2
4,0.47,0.53,6,16,2,5,0.51,0.49,0,1,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,0.45,0.55,7,10,0,2,0.49,0.51,0,0,0,1,204
376,0.52,0.48,13,15,2,3,0.49,0.51,0,1,1,1,205
377,0.38,0.62,14,11,3,1,0.49,0.52,1,0,0,1,206
378,0.61,0.39,12,16,4,5,0.49,0.51,0,1,1,1,207
