In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
# Load the historical data set used for fitting and evaluating the model.
df = pd.read_csv(filepath_or_buffer='pastdata.csv')

In [3]:
# The label is the 'position' column; the inputs are every remaining column
# except 'team', which is dropped alongside the label.
target = df['position']
features = df.drop(columns=['position', 'team'])

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split
# identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Seed the forest too: without random_state the bootstrap samples and the
# per-split feature subsets differ on every run, so the accuracy, the
# predictions and the feature-importance ranking below would not be
# reproducible even though the train/test split is fixed.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [4]:
# Predict labels for the held-out rows; the bare name on the last line
# displays the resulting array as the cell output.
y_pred = model.predict(X=X_test)
y_pred

array([17,  5,  5,  3,  9, 17, 17, 17,  9,  1,  9,  9, 17,  5, 17, 17, 17,
       17,  9,  2, 17, 17,  9, 17, 17, 17,  3,  5,  5,  9, 17, 17])

In [5]:
# Fraction of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9375

In [7]:
# Score the 2023 data with the model fitted on the historical data.
prediction = pd.read_csv('2023data.csv')
prediction = prediction.drop(columns=['team'])

# Select the training columns by name, in training order, before predicting.
# A column missing from the 2023 file raises KeyError here instead of
# surfacing later as a model error, and extra or reordered columns in the CSV
# can no longer feed the wrong values into the forest.
prediction = prediction[features.columns]

y_pred2 = model.predict(prediction)
y_pred2

array([17, 17, 17,  5,  5, 17,  5,  9, 17,  9,  9, 17,  3, 17, 17,  9, 17,
        5, 17,  3,  5,  9,  9, 17,  9, 17,  3, 17, 17, 17,  9, 17])

In [8]:
# Show the column index of the training inputs and of the 2023 inputs so the
# two can be compared.
for label, frame in (("Training columns:", features), ("Prediction columns:", prediction)):
    print(label, frame.columns)


Training columns: Index(['players_used', 'avg_age', 'possession', 'games', 'games_starts',
       'minutes', 'minutes_90s', 'goals', 'assists', 'goals_assists',
       ...
       'offsides', 'pens_won', 'pens_conceded', 'own_goals', 'ball_recoveries',
       'aerials_won', 'aerials_lost', 'aerials_won_pct', 'topGoals', 'season'],
      dtype='object', length=183)
Prediction columns: Index(['players_used', 'avg_age', 'possession', 'games', 'games_starts',
       'minutes', 'minutes_90s', 'goals', 'assists', 'goals_assists',
       ...
       'offsides', 'pens_won', 'pens_conceded', 'own_goals', 'ball_recoveries',
       'aerials_won', 'aerials_lost', 'aerials_won_pct', 'topGoals', 'season'],
      dtype='object', length=183)


In [9]:
# Preview the 2023 input rows. A bounded head() keeps the cell output small
# instead of dumping the whole wide frame into the notebook.
prediction.head(10)

Unnamed: 0,players_used,avg_age,possession,games,games_starts,minutes,minutes_90s,goals,assists,goals_assists,...,offsides,pens_won,pens_conceded,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct,topGoals,season
0,19,27.2,53.2,6,66,540,6.0,11,9,20,...,18,1,2,0,293,55,84,39.6,4,2023
1,22,29.5,56.5,6,66,540,6.0,4,3,7,...,20,1,1,0,309,75,81,48.1,1,2023
2,26,27.2,64.8,6,66,540,6.0,12,10,22,...,8,0,1,0,314,69,70,49.6,5,2023
3,24,27.4,56.1,10,110,900,10.0,21,19,40,...,26,0,1,0,574,105,113,48.2,4,2023
4,24,26.8,52.9,10,110,900,10.0,25,16,41,...,16,4,3,0,537,115,143,44.6,6,2023
5,22,26.8,44.2,6,66,540,6.0,3,1,4,...,4,1,2,0,291,42,43,49.4,2,2023
6,26,27.7,57.3,10,110,900,10.0,12,7,19,...,27,1,0,0,597,116,109,51.6,3,2023
7,25,27.1,45.6,8,88,720,8.0,8,7,15,...,14,2,3,0,389,114,96,54.3,2,2023
8,20,27.7,46.8,6,66,540,6.0,4,2,6,...,2,0,3,1,306,87,72,54.7,2,2023
9,26,26.8,47.9,8,88,720,8.0,11,10,21,...,8,0,2,0,427,68,86,44.2,4,2023


In [10]:
# Per-feature importance scores exposed by the fitted forest, one per
# training column.
importances = model.feature_importances_

# Pair each training column with its score and order the table from most to
# least important.
feature_importance_df = (
    pd.DataFrame({'Feature': features.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Display the 20 highest-ranked features.
feature_importance_df.head(20)


Unnamed: 0,Feature,Importance
4,games_starts,0.067956
33,gk_minutes,0.063001
6,minutes_90s,0.059776
32,gk_games_starts,0.058446
5,minutes,0.057128
105,passes_dead,0.041493
31,gk_games,0.039076
88,passes_progressive_distance,0.037739
39,gk_wins,0.037019
177,ball_recoveries,0.033925
