In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('final_data.csv')

# Drop rows with any NA values
data = data.dropna()

In [3]:
data.head()

Unnamed: 0,player,team,name,position,height,age,appearance,goals,assists,yellow cards,...,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger
0,/david-de-gea/profil/spieler/59377,Manchester United,David de Gea,Goalkeeper,189.0,32.0,104,0.0,0.0,0.009585,...,1.217252,0.335463,9390,42,5,13,15000000,70000000,1,0
1,/jack-butland/profil/spieler/128899,Manchester United,Jack Butland,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,...,1.242331,0.207055,1304,510,58,1,1500000,22000000,1,0
2,/tom-heaton/profil/spieler/34130,Manchester United,Tom Heaton,Goalkeeper,188.0,37.0,4,0.0,0.0,0.0,...,0.616438,0.924658,292,697,84,4,600000,6000000,1,0
3,/lisandro-martinez/profil/spieler/480762,Manchester United,Lisandro Martínez,Defender Centre-Back,175.0,25.0,82,0.02809,0.05618,0.224719,...,0.0,0.0,6408,175,22,9,50000000,50000000,2,0
4,/raphael-varane/profil/spieler/164770,Manchester United,Raphaël Varane,Defender Centre-Back,191.0,30.0,63,0.017889,0.017889,0.053667,...,0.0,0.0,5031,238,51,21,40000000,80000000,2,0


In [4]:
data.drop(columns = ['player', 'team', 'name'], inplace = True)

In [5]:
low_threshold = data['current_value'].quantile(0.35)
mid_threshold = data['current_value'].quantile(0.75)


def categorize_price(value):
    if value < low_threshold:
        return 'cheap price'
    elif value < mid_threshold:
        return 'regular'
    else:
        return 'expensive'

data['price_category'] = data['current_value'].apply(categorize_price)

In [6]:
# Assuming the target variable is named 'target'
X = data.drop(['current_value', 'price_category'], axis=1)
y = data['price_category']

X_encoded = pd.get_dummies(X, drop_first=True)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [7]:
dt_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Random Forest parameters
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

In [8]:
dt_model = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_params, cv=5, n_jobs=-1)
dt_model.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = dt_model.predict(X_test)

# Print classification report
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))

Decision Tree Classification Report:
              precision    recall  f1-score   support

 cheap price       0.90      0.86      0.88       756
   expensive       0.87      0.82      0.85       513
     regular       0.79      0.85      0.82       882

    accuracy                           0.85      2151
   macro avg       0.85      0.84      0.85      2151
weighted avg       0.85      0.85      0.85      2151



In [9]:
rf_model = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Print classification report
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

 cheap price       0.97      0.81      0.88       756
   expensive       0.86      0.86      0.86       513
     regular       0.79      0.90      0.84       882

    accuracy                           0.86      2151
   macro avg       0.87      0.86      0.86      2151
weighted avg       0.87      0.86      0.86      2151

