In [6]:
import sys
!{sys.executable} -m pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp314-cp314-win_amd64.whl.metadata (11 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.8.0-cp314-cp314-win_amd64.whl (8.1 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn

   ---------------------------------------- 0/2 [threadpoolctl]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- -


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load the final scouting pool we created
df = pd.read_csv('../data/final_scouting_pool.csv')

# 2. Advanced Feature Selection
# In professional scouting, we normalize 'Age' and 'Gls' to see who outperforms their peer group
scaler = MinMaxScaler()
df[['Age_norm', 'Gls_norm']] = scaler.fit_transform(df[['Age', 'Gls']])

# 3. Create the Similarity Matrix
# This matrix calculates how 'close' every player is to another based on stats
features = ['Age_norm', 'Gls_norm']
similarity_matrix = cosine_similarity(df[features])

print("‚úÖ Professional Scouting Engine: Online")

‚úÖ Professional Scouting Engine: Online


In [9]:
def find_similar_players(player_name, top_n=5):
    try:
        # 1. Flexible Search: Look for any partial match
        player_match = df[df['Player'].str.contains(player_name, case=False, na=False)]
        
        if player_match.empty:
            print(f"‚ùå No player found with name: '{player_name}'")
            # Show some suggestions from the top of the database
            print("üí° Try searching for these available stars: Yan Diomande, Arda Guler, Hamza Igamane")
            return
        
        # In case of multiple matches (e.g., 'Diomande'), take the first one
        player_index = player_match.index[0]
        actual_name = player_match.iloc[0]['Player']
        
        # 2. Extract similarity scores from our matrix
        similar_scores = list(enumerate(similarity_matrix[player_index]))
        
        # 3. Rank alternatives
        sorted_profiles = sorted(similar_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
        
        print(f"üìä SCOUTING REPORT FOR: {actual_name.upper()}")
        print("="*60)
        
        for i, score in sorted_profiles:
            candidate = df.iloc[i]
            print(f"‚≠ê Match: {score:.1%} | {candidate['Player']} ({candidate['Squad']})")
            print(f"   Age: {candidate['Age']} | Goals: {candidate['Gls']} | Nation: {candidate['Nation']}")
            print("-" * 40)
            
    except Exception as e:
        print(f"‚ö†Ô∏è Tactical Error: {e}")

# Try these now:
find_similar_players('Yan')

üìä SCOUTING REPORT FOR: OLADAPO AFOLAYAN
‚≠ê Match: 100.0% | Ali Abdi (Nice)
‚ö†Ô∏è Tactical Error: 'Nation'
