In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import ace_tools as tools

# Load the dataset
file_path = "/mnt/data/Final football player dataset.xlsx"
df = pd.read_excel(file_path)

# Exploratory Data Analysis (EDA)
# Checking basic statistics
eda_summary = df.describe()

# Visualize correlations between performance metrics and wages
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Histograms of key features
df.hist(bins=30, figsize=(15, 10))
plt.suptitle('Histograms of Key Features')
plt.show()

# Data Preparation for the Model
# Assuming 'Stay/Look for Option' as the target variable (Needs to be defined in the dataset)
df['Target'] = df.apply(lambda row: 1 if row['Performance Metric'] > threshold else 0, axis=1) # Define threshold logic

# Selecting features for the model
features = ['Performance Metric', 'Wages', 'Age', 'Position Code'] # Adjust features accordingly
X = df[features]
y = df['Target']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = model.predict(X_test)

# Output evaluation metrics
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Making predictions on the full dataset
df['Prediction'] = model.predict(X)

# Mapping predictions to human-readable form
df['Future'] = df['Prediction'].apply(lambda x: 'Stay' if x == 1 else 'Look for Better Option')

# Output to Excel
output_df = df[['Player Name', 'Position', 'Team', 'Future']]
output_file = "/mnt/data/Player_Future_Predictions.xlsx"
output_df.to_excel(output_file, index=False)

# Display the EDA summary to the user
tools.display_dataframe_to_user(name="EDA Summary", dataframe=eda_summary)

ModuleNotFoundError: No module named 'ace_tools'