 # Preprocessing


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the raw dataset (the file is read as ISO-8859-1 / Latin-1 text).
df = pd.read_csv('kl.csv', encoding='ISO-8859-1')

# Drop columns in which more than 30% of the values are missing.
# An explicit missing-fraction mask is used instead of
# `dropna(thresh=len(df) * 0.7)`: pandas documents `thresh` as an int, and the
# mask states the ">30% missing" rule directly.
MAX_MISSING_FRACTION = 0.3
missing_fraction = df.isna().mean()
df = df.loc[:, missing_fraction <= MAX_MISSING_FRACTION]

# Fill the remaining gaps in NUMERIC columns with the column median.
# Non-numeric columns are not touched, so they can still contain NaN.
df = df.fillna(df.median(numeric_only=True))

# Categorical encoding and feature scaling are intentionally NOT applied here:
# the saved file keeps the original values, and the modelling cells encode and
# scale on their own (fitting scalers on the training split only).

# Save processed version
df.to_csv('processed_kl.csv', index=False)
print("✅ Preprocessing complete. Data saved as 'processed_kl.csv'")


✅ Preprocessing complete. Data saved as 'processed_kl.csv'


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

def preprocess_dataset(path="kl.csv", encoding='ISO-8859-1', min_filled_fraction=0.9,
                       categorical_cols=('Preferred Foot', 'Work Rate', 'Position')):
    """Load the player CSV and return a cleaned, partially encoded DataFrame.

    Steps:
      1. Read ``path`` with the given text ``encoding``.
      2. Drop ROWS (not columns) that have fewer than
         ``int(min_filled_fraction * n_columns)`` non-null values.
      3. Fill remaining NaNs in numeric columns with the column median
         (computed after the row drop). Non-numeric NaNs are left as they are.
      4. Label-encode each column of ``categorical_cols`` that exists in the
         data. Values are cast to ``str`` first, so a missing value becomes the
         category ``'nan'``.

    Parameters
    ----------
    path : str
        CSV file to load.
    encoding : str
        Text encoding passed to ``pandas.read_csv``.
    min_filled_fraction : float
        Minimum share of columns that must be non-null for a row to be kept.
    categorical_cols : iterable of str
        Columns to replace with integer label codes (absent columns are skipped).

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The original row labels are preserved (no index reset).
    """
    df = pd.read_csv(path, encoding=encoding)

    # Keep only sufficiently complete rows. The threshold is derived from the
    # number of COLUMNS because it counts non-null cells per row.
    min_non_null = int(min_filled_fraction * df.shape[1])
    df = df.dropna(thresh=min_non_null)
    df = df.fillna(df.median(numeric_only=True))

    # Encode important categorical columns. A fresh encoder is fitted per column;
    # the encoders are not returned, so the code -> label mapping must be
    # re-derived by the caller if needed.
    for col in categorical_cols:
        if col in df.columns:
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    return df


# Predict Player's Overall Rating (Regression)

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Compare several regressors on predicting a player's 'Overall' rating.
# Requires preprocess_dataset() from the preprocessing section to be defined.
df = preprocess_dataset()

TARGET_COLUMN = 'Overall'

# NOTE(review): identifier / row-order columns carry no football information but
# can leak the target when the source file is sorted by rating. A near-perfect
# tree-model score is the typical symptom of that. The column names below are an
# assumption about the CSV schema - TODO confirm; absent columns are ignored.
ID_LIKE_COLUMNS = ['Unnamed: 0', 'ID']

if TARGET_COLUMN not in df.columns:
    raise KeyError(f"Target column {TARGET_COLUMN!r} is missing from the preprocessed data")

# Feature and target selection: all numeric columns except the target and the
# identifier-like columns.
X = df.select_dtypes(include=[np.number]).drop(
    columns=[TARGET_COLUMN, *ID_LIKE_COLUMNS], errors='ignore'
)
y = df[TARGET_COLUMN]

# Train-test split (fixed seed so the comparison is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling features: fit on the training split only, then apply to the test
# split, so no test-set statistics reach the models.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}

# Train, predict, and evaluate each model on the held-out split
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    results[name] = {
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R² Score': r2_score(y_test, y_pred),
    }

# Display results (one row per model)
print("📊 Comparison of Regression Models:")
results_df = pd.DataFrame(results).T
print(results_df)

📊 Comparison of Regression Models:
                       RMSE  R² Score
Linear Regression  1.159206  0.971194
Ridge Regression   1.157852  0.971261
Lasso Regression   1.231444  0.967492
Random Forest      0.055156  0.999935


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd

# Load the preprocessed dataset. `df` is deliberately left as the FULL frame
# (including text columns such as 'Position') so that later cells reading `df`
# still find those columns; the modelling subset gets its own name.
df = pd.read_csv('processed_kl.csv')

# Numeric columns for modelling, plus the player name for reporting only.
# `include='number'` avoids depending on `np`, which this cell does not import.
rating_data = df.select_dtypes(include='number').join(df['Name'])

# Separate features and target
X = rating_data.drop(columns=['Overall', 'Name'])
y = rating_data['Overall']
names = rating_data['Name']

# Train/test split; names are split alongside so each prediction can be
# labelled with the matching player.
X_train, X_test, y_train, y_test, name_train, name_test = train_test_split(
    X, y, names, test_size=0.2, random_state=42
)

# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict ratings for the held-out players
predictions = model.predict(X_test)

# Results table, sorted by predicted rating (highest first)
results = pd.DataFrame({
    'Player Name': name_test.values,
    'Actual Rating': y_test.values,
    'Predicted Rating': predictions.round(1)
}).sort_values(by='Predicted Rating', ascending=False)

# Show top 10 results
print(results.head(10).to_string(index=False))

# Search functionality. NOTE: input() blocks a non-interactive "Run All";
# only players that landed in the TEST split can be found.
search_name = input("\n🔍 Enter a player's name to view their rating: ").strip()

# Case-insensitive exact match (missing names compare as False)
search_results = results[results['Player Name'].str.lower() == search_name.lower()]

if search_results.empty:
    print("\n❌ Player not found in test data.")
else:
    print("\n✅ Player found:")
    print(search_results.to_string(index=False))

 Player Name  Actual Rating  Predicted Rating
    L. Messi           94.0              88.1
      De Gea           91.0              86.9
Sergio Ramos           91.0              86.0
     Marcelo           88.0              85.4
   G. Buffon           88.0              85.0
 T. Courtois           89.0              84.6
  M. Hummels           88.0              84.4
       Piqué           87.0              84.3
    D. Alaba           85.0              83.7
  C. Eriksen           88.0              83.6

🔍 Enter a player's name to view their rating: Lato

✅ Player found:
Player Name  Actual Rating  Predicted Rating
       Lato           74.0              74.2


# Top Talent Identification with Position-Based Classification

In [None]:
# Position code mapping: integer label code -> position abbreviation.
# NOTE(review): these numbers presumably come from the LabelEncoder applied to
# 'Position' in preprocess_dataset(); they are only valid for that exact
# encoding - TODO confirm before relying on them.
position_number_to_code = {
    20: 'RF',  # Right Forward
    25: 'ST',  # Striker
    13: 'LW',  # Left Winger
    18: 'RCM',  # Right Central Midfielder
    10: 'LF',  # Left Forward
    22: 'RS',  # Right Striker
    17: 'RCB',  # Right (off-centre) Centre Back
    8: 'LCM',  # Left Central Midfielder
    1: 'CB',  # Center Back
    9: 'LDM',  # Left Defensive Midfielder
    0: 'CAM',  # Central Attacking Midfielder
    2: 'CDM',  # Central Defensive Midfielder
    12: 'LS',  # Left Striker
    7: 'LCB',  # Left (off-centre) Centre Back
    21: 'RM',  # Right Midfielder
    5: 'LAM',  # Left Attacking Midfielder
    11: 'LM',  # Left Midfielder
    6: 'LB',  # Left Back
    19: 'RDM',  # Right Defensive Midfielder
    23: 'RW',  # Right Winger
    4: 'CM',  # Central Midfielder
    16: 'RB',  # Right Back
}
# Position abbreviation -> human-readable name.
position_code_to_name = {
    'GK': 'Goalkeeper',
    'RB': 'Right Back',
    'LB': 'Left Back',
    'CB': 'Center Back',
    'RCB': 'Right Center Back',
    'LCB': 'Left Center Back',
    'CDM': 'Central Defensive Midfielder',
    'LDM': 'Left Defensive Midfielder',
    'RDM': 'Right Defensive Midfielder',
    'CM': 'Central Midfielder',
    'RCM': 'Right Central Midfielder',
    'LCM': 'Left Central Midfielder',
    'CAM': 'Central Attacking Midfielder',
    'LAM': 'Left Attacking Midfielder',
    'RM': 'Right Midfielder',
    'LM': 'Left Midfielder',
    'RW': 'Right Winger',
    'LW': 'Left Winger',
    'RF': 'Right Forward',
    'LF': 'Left Forward',
    'RS': 'Right Striker',
    'LS': 'Left Striker',
    'CF': 'Center Forward',
    'ST': 'Striker'
}


def position_label(position):
    """Return a readable label for a position value.

    ``position`` may be a string abbreviation (e.g. ``'RF'``) or an integer
    label code (e.g. ``20``). Unknown values fall back to
    ``"Unknown Code"`` / ``"Unknown Position"``.
    """
    if isinstance(position, str):
        code = position
        prefix = ""
    else:
        code = position_number_to_code.get(position, "Unknown Code")
        prefix = f"{position}: "  # keep the raw number visible for debugging
    name = position_code_to_name.get(code, "Unknown Position")
    return f"{prefix}{code} ({name})"


# Load the player table explicitly so this cell does not depend on whichever
# `df` an earlier cell happened to leave behind.
df = pd.read_csv('processed_kl.csv')

# Identify top players (strictly above the rating threshold)
TOP_RATING_THRESHOLD = 85
top_players = df[df['Overall'] > TOP_RATING_THRESHOLD]

# One listing per position, in order of first appearance. groupby skips rows
# whose position is missing, which would otherwise print an empty table.
for pos_value, subset in top_players.groupby('Position', sort=False):
    print(f"\n🏅 Top Players in Position {position_label(pos_value)}:\n", subset[['Name', 'Overall']])


🏅 Top Players in Position RF:
           Name  Overall
0     L. Messi     94.0
50  D. Mertens     87.0

🏅 Top Players in Position ST:
                  Name  Overall
1   Cristiano Ronaldo     94.0
10     R. Lewandowski     90.0
16            H. Kane     89.0
23          S. Agüero     89.0
36            G. Bale     88.0
43          M. Icardi     87.0
47          R. Lukaku     87.0
48        C. Immobile     87.0

🏅 Top Players in Position LW:
           Name  Overall
2    Neymar Jr     92.0
29  L. Insigne     88.0
30        Isco     88.0
32    Coutinho     88.0
55     L. Sané     86.0

🏅 Top Players in Position GK:
              Name  Overall
3          De Gea     91.0
9        J. Oblak     90.0
19    T. Courtois     89.0
22       M. Neuer     89.0
37      H. Lloris     88.0
40  S. Handanovi?     88.0
41      G. Buffon     88.0
46       K. Navas     87.0
57        Ederson     86.0

🏅 Top Players in Position RCM:
             Name  Overall
4   K. De Bruyne     91.0
6      L. Modri?     9

# Similarity-Based Player Recommender

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Load the preprocessed player table and keep its numeric columns as features.
df = pd.read_csv('processed_kl.csv')
features = df.select_dtypes(include='number').drop(columns=['Overall'])  # Exclude label

# Standardise every feature (zero mean, unit variance) before measuring cosine
# similarity. On raw values the columns with the largest magnitudes dominate the
# angle between two players, which makes almost everyone look ~0.99 similar.
# Constant columns get a divisor of 1, and any remaining NaN is treated as
# "average" (0 after centring) so cosine_similarity does not fail on missing data.
_feature_spread = features.std(ddof=0).replace(0, 1)
features_scaled = ((features - features.mean()) / _feature_spread).fillna(0)


def recommend_similar_player(player_name, top_n=5):
    """Return the ``top_n`` players most similar to ``player_name``.

    Similarity is the cosine similarity between standardised numeric feature
    vectors. If several rows share the name, the first one is used as the query.

    Parameters
    ----------
    player_name : str
        Exact value of the 'Name' column to look up.
    top_n : int
        Number of recommendations to return (default 5).

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns 'Name' and 'similarity', most similar first and
        never containing the queried row itself; or the string
        ``"Player not found!"`` when the name does not occur.
    """
    is_match = (df['Name'] == player_name).to_numpy()
    if not is_match.any():
        return "Player not found!"

    row_pos = int(is_match.argmax())  # position of the first matching row
    query_vector = features_scaled.iloc[[row_pos]]
    similarities = cosine_similarity(query_vector, features_scaled)[0]

    # Build the ranking on a copy so the shared `df` is not modified by a call.
    ranked = df[['Name']].assign(similarity=similarities)
    # Remove the queried row explicitly instead of assuming it sorts first.
    # (read_csv gives a unique RangeIndex, so this drops exactly one row.)
    ranked = ranked.drop(index=ranked.index[row_pos])

    return ranked.sort_values('similarity', ascending=False).head(top_n)


# Example:
print(recommend_similar_player('Cristiano Ronaldo'))


                Name  similarity
153         Quaresma    0.999901
224       Pepe Reina    0.998376
109   Z. Ibrahimovi?    0.998249
1302           Danny    0.997926
411          Joaquín    0.997849


# Optimal Team Builder (with Remove/Add Players Option)

In [None]:
import pandas as pd

# Player table saved by the preprocessing step; build_optimal_team() below
# reads it as a module-level global.
df = pd.read_csv('processed_kl.csv')
# Number of players build_optimal_team() selects.
team_size = 11

def build_optimal_team(remove_list=None, add_list=None, players=None, size=None):
    """Pick the highest-rated players, honouring forced exclusions and inclusions.

    Players named in ``add_list`` are placed first (in the given order, each at
    most once); the remaining slots are filled with the best-rated players by
    'Overall' that are not in ``remove_list`` and not already chosen. Selection
    is by rating only: no positional balance is enforced, so the result can
    contain, for example, more than one goalkeeper.

    Parameters
    ----------
    remove_list : iterable of str, optional
        Names that must not be used to fill open slots. A name that also
        appears in ``add_list`` is still added (explicit additions win).
    add_list : iterable of str, optional
        Names that must be in the team. Unknown names are skipped with a
        warning instead of being ignored silently.
    players : pandas.DataFrame, optional
        Table with at least 'Name' and 'Overall' columns. Defaults to the
        module-level ``df``.
    size : int, optional
        Team size. Defaults to the module-level ``team_size``.

    Returns
    -------
    pandas.DataFrame
        At most ``size`` rows of ``players``, requested players first, then the
        fill-ins ordered by descending 'Overall'. Original row labels are kept.
    """
    import warnings

    roster = df if players is None else players
    slots = max(int(team_size if size is None else size), 0)
    excluded = set(remove_list or ())
    # dict.fromkeys de-duplicates while preserving the caller's order.
    requested = list(dict.fromkeys(add_list or ()))

    # First row position of every name (mirrors "take the first match").
    first_pos_by_name = {}
    for pos, name in enumerate(roster['Name']):
        first_pos_by_name.setdefault(name, pos)

    unknown = [name for name in requested if name not in first_pos_by_name]
    if unknown:
        warnings.warn(
            f"Players not found in the data and skipped: {unknown}", stacklevel=2
        )

    # Requested players come first, but never more than the team can hold.
    pinned_pos = [first_pos_by_name[name] for name in requested
                  if name in first_pos_by_name][:slots]
    pinned = roster.iloc[pinned_pos]

    # Fill remaining slots with the top-rated players still available.
    ranked = roster[~roster['Name'].isin(excluded)].sort_values('Overall', ascending=False)
    candidates = ranked[~ranked['Name'].isin(pinned['Name'])]
    candidates = candidates.drop_duplicates(subset='Name', keep='first')
    fill = candidates.head(slots - len(pinned))

    # Skip empty pieces so concat never has to reconcile dtypes with an empty frame.
    parts = [part for part in (pinned, fill) if not part.empty]
    return pd.concat(parts) if parts else roster.iloc[0:0]

# Call the function with no constraints: the team is simply the top-rated players.
print("\n🏆 Optimal Team Formation:")
my_team = build_optimal_team()
print(my_team[['Name', 'Overall', 'Position']])

# Rebuild the team without one player and with one forced addition.
# NOTE(review): the recorded output of this cell does not list the requested
# addition, so "Kylian Mbappé" presumably does not match the spelling used in
# the 'Name' column (other names are abbreviated, e.g. "K. De Bruyne") - TODO
# confirm the exact name in the data.
my_new_team = build_optimal_team(remove_list=["Cristiano Ronaldo"], add_list=["Kylian Mbappé"])
print("\nUpdated Team:")
print(my_new_team[['Name', 'Overall', 'Position']])



🏆 Optimal Team Formation:
                 Name  Overall Position
1   Cristiano Ronaldo     94.0       ST
0            L. Messi     94.0       RF
2           Neymar Jr     92.0       LW
7           L. Suárez     91.0       RS
8        Sergio Ramos     91.0      RCB
3              De Gea     91.0       GK
6           L. Modri?     91.0      RCM
5           E. Hazard     91.0       LF
4        K. De Bruyne     91.0      RCM
11           T. Kroos     90.0      LCM
10     R. Lewandowski     90.0       ST

Updated Team:
              Name  Overall Position
0         L. Messi     94.0       RF
2        Neymar Jr     92.0       LW
5        E. Hazard     91.0       LF
8     Sergio Ramos     91.0      RCB
3           De Gea     91.0       GK
6        L. Modri?     91.0      RCM
4     K. De Bruyne     91.0      RCM
7        L. Suárez     91.0       RS
10  R. Lewandowski     90.0       ST
9         J. Oblak     90.0       GK
11        T. Kroos     90.0      LCM
