<a href="https://colab.research.google.com/github/Ayush1757/MovieRecommendor1757/blob/main/Football_FAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score


In [18]:
df = pd.read_csv('/content/football_players.csv', encoding='cp1252')

In [19]:
df.head()

Unnamed: 0,Name,Current Team,Position,Country,Age,Previos Club,Salary,no. of trophies,current league,old clubs
0,Lionel Messi,Inter Miami,Forward,Argentina,37,Paris Saint-Germain,$65M,40,MLS,"Newell's Old Boys, Barcelona, PSG"
1,Cristiano Ronaldo,Al Nassr,Forward,Portugal,39,Manchester United,$75M,40,Saudi Pro League,"Sporting CP, Manchester United, Real Madrid, J..."
2,Kylian Mbapp‚,Paris Saint-Germain,Forward,France,25,AS Monaco,$110M,25,Ligue 1,"Monaco, PSG"
3,Erling Haaland,Manchester City,Forward,Norway,23,Borussia Dortmund,$30M,10,Premier League,"Bryne FK, Molde, RB Salzburg, Borussia Dortmund"
4,Robert Lewandowski,Barcelona,Forward,Poland,36,Bayern Munich,$22M,30,La Liga,"Znicz Pruszk¢w, Lech Pozna?, Borussia Dortmund..."


In [38]:
# Encode categorical columns using LabelEncoder
label_encoder = LabelEncoder()

# Encoding the categorical variables
df['Position'] = label_encoder.fit_transform(df['Position'])
df['Country'] = label_encoder.fit_transform(df['Country'])
df['Current Team'] = label_encoder.fit_transform(df['Current Team'])

# Check if 'Previous Club' column exists before encoding
if 'Previous Club' in df.columns:
    # If the column exists, apply Label Encoding
    df['Previous Club'] = label_encoder.fit_transform(df['Previous Club'])
else:
    # If the column is not found, print a message or handle it as needed
    print("Column 'Previous Club' not found in the DataFrame.")

# If the 'Salary' column is not numeric, we should convert it
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

# Check for missing values and handle them (e.g., replacing with mean)
# Only include numeric features for calculating the mean
numeric_features = df.select_dtypes(include=np.number).columns
df[numeric_features] = df[numeric_features].fillna(df[numeric_features].mean())

# Display the updated DataFrame
df.head()

Column 'Previous Club' not found in the DataFrame.


Unnamed: 0,Name,Current Team,Position,Country,Age,Previos Club,Salary,no. of trophies,current league,old clubs
0,Lionel Messi,6,1,0,37,Paris Saint-Germain,,40,MLS,"Newell's Old Boys, Barcelona, PSG"
1,Cristiano Ronaldo,0,1,11,39,Manchester United,,40,Saudi Pro League,"Sporting CP, Manchester United, Real Madrid, J..."
2,Kylian Mbapp‚,9,1,6,25,AS Monaco,,25,Ligue 1,"Monaco, PSG"
3,Erling Haaland,8,1,9,23,Borussia Dortmund,,10,Premier League,"Bryne FK, Molde, RB Salzburg, Borussia Dortmund"
4,Robert Lewandowski,3,1,10,36,Bayern Munich,,30,La Liga,"Znicz Pruszk¢w, Lech Pozna?, Borussia Dortmund..."


In [41]:
# Select features and target
# Ensure correct column name for 'Previous Club'
# Check if a column containing 'Previous' exists
previous_club_col = df.columns[df.columns.str.contains('Previous', case=False)]

# If such a column exists, use it; otherwise, handle the case (e.g., skip the column)
if len(previous_club_col) > 0:
    X = df[['Country', 'Age', 'Salary', 'Current Team', previous_club_col[0]]]
else:
    print("Column containing 'Previous' not found. Excluding from features.")
    X = df[['Country', 'Age', 'Salary', 'Current Team']]

y = df['Position']  # Target (Position)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the datasets
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Column containing 'Previous' not found. Excluding from features.
Training data shape: (16, 4)
Testing data shape: (4, 4)


In [42]:
# Initialize and train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report for more details
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.50

Classification Report:
              precision    recall  f1-score   support

           1       0.50      0.50      0.50         2
           3       0.50      0.50      0.50         2

    accuracy                           0.50         4
   macro avg       0.50      0.50      0.50         4
weighted avg       0.50      0.50      0.50         4



In [43]:
# Perform cross-validation to check model's performance across multiple splits
cv_scores = cross_val_score(model, X, y, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Average cross-validation score: {cv_scores.mean():.2f}")



Cross-validation scores: [0.25 0.   0.25 0.5  0.5 ]
Average cross-validation score: 0.30


In [48]:
from sklearn.model_selection import GridSearchCV
# Define parameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)

# Fit GridSearchCV to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.2f}")


Fitting 3 folds for each of 81 candidates, totalling 243 fits




Best parameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Best score: 0.51
