In [None]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [None]:
# Show all columns: None removes the cap on how many columns pandas displays.
pd.options.display.max_columns = None

In [None]:
df_train = pd.read_csv('/kaggle/input/transfer-value-prediction/train.csv')
print(df_train.shape)
df_train.head(4)

In [None]:
df_train.columns

In [None]:
df_test = pd.read_csv('/kaggle/input/transfer-value-prediction/test.csv')
print(df_test.shape)
df_test.sample()

In [None]:
df_test.columns

In [None]:
df_submission = pd.read_csv('/kaggle/input/transfer-value-prediction/sample_submission.csv')
print(df_submission.shape)
df_submission.head(4)

# DF Analysis

In [None]:
df_train.info()

In [None]:
df_train.nunique()

In [None]:
df_train.corr()

In [None]:
correlation_matrix = df_train.corr()
threshold = 0.70

for col1 in correlation_matrix.columns:
    for col2 in correlation_matrix.index:
        if col1 != col2 and correlation_matrix.loc[col1, col2] > threshold:
            print(f"Correlation between '{col1}' and '{col2}': {correlation_matrix.loc[col1, col2]}")


In [None]:
df_train_clean = df_train.dropna()
df_train_clean = df_train_clean.drop(['Name', 'Country', 'id'], axis=1)

In [None]:
df_train_clean.sample()

In [None]:
df_test.sample()

# Handle Missing Values

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer

# KNN imputation (20 neighbours) over every column of the cleaned training
# frame; the result is wrapped back into a DataFrame with the same column names.
# NOTE(review): df_train_clean is built with dropna() earlier in the notebook,
# so there should be no missing values left to fill here -- confirm whether
# this step is still intended.
train_column_names = df_train_clean.columns
imputer = KNNImputer(n_neighbors=20)
df_imputed = imputer.fit(df_train_clean).transform(df_train_clean)

df_train_clean = pd.DataFrame(data=df_imputed, columns=train_column_names)

In [None]:
# Mirror the training-set column drop ('Name', 'Country', 'id'). 'Name' is
# removed only when present, so this works whether or not test.csv ships it.
df_test_clean = (
    df_test
    .drop(columns=['id', 'Country'])
    .drop(columns=['Name'], errors='ignore')
)

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer

# Fit the imputer on the *training* features and only apply it to the test
# set, so test rows are filled from the same reference data the model is
# trained on rather than from statistics of the test set itself.
# Assumes every test column also exists in df_train_clean (the training frame
# additionally holds the target column) -- a KeyError here means they differ.
test_feature_columns = df_test_clean.columns
imputer = KNNImputer(n_neighbors=20)
imputer.fit(df_train_clean[test_feature_columns])
df_imputed = imputer.transform(df_test_clean)

df_test_clean = pd.DataFrame(df_imputed, columns=test_feature_columns)

# Modelling

In [None]:
df_test_clean.columns

In [None]:
X= df_train_clean.drop(['Value at beginning of 2023/24 season'], axis=1)
y = df_train_clean['Value at beginning of 2023/24 season']

In [None]:
# X_norm = (X - X.mean()) / X.std()
# y_norm = (y - y.mean()) / y.std()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=28)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Normalize features.
# Statistics come from the training split only and are reused for the
# hold-out split. They are computed once instead of once per expression.
feature_mean = X_train.mean()
# A constant column has std == 0, which would turn the whole column into
# NaN (0 / 0). Dividing by 1 instead leaves such a column at 0.
feature_std = X_train.std().replace(0, 1)
X_train_normalized = (X_train - feature_mean) / feature_std
X_test_normalized = (X_test - feature_mean) / feature_std

# Normalize target values with the training-split statistics as well.
target_mean = y_train.mean()
target_std = y_train.std()
y_train_normalized = (y_train - target_mean) / target_std
y_test_normalized = (y_test - target_mean) / target_std


In [None]:
# degrees = range(1, 6)  # Try polynomial degrees from 1 to 5
# mse_scores = []

# for degree in degrees:
#     poly = PolynomialFeatures(degree=degree)
#     X_train_poly = poly.fit_transform(X_train)
#     X_test_poly = poly.transform(X_test)

#     model = LinearRegression()
#     model.fit(X_train_poly, y_train)
#     y_pred = model.predict(X_test_poly)

#     mse = mean_squared_error(y_test, y_pred)
#     mse_scores.append(mse)

# best_degree = degrees[np.argmin(mse_scores)]
# print(f"Best polynomial degree: {best_degree}")


In [None]:
# plt.figure(figsize=(10, 6))
# plt.plot(degrees, mse_scores, marker='o')
# plt.title('MSE vs. Polynomial Degree')
# plt.xlabel('Polynomial Degree')
# plt.ylabel('Mean Squared Error')
# plt.xticks(degrees)
# plt.grid(True)
# plt.show()


In [None]:
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, r2_score

# model = LinearRegression()

# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# rmse = mse ** 0.5
# r2 = r2_score(y_test, y_pred)

# print(f"Mean Squared Error (MSE): {mse}")
# print(f"Root Mean Squared Error (RMSE): {rmse}")
# print(f"R-squared (R2): {r2}")


# SVR

In [None]:
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error

# model = SVR(kernel='linear', C=1.0, epsilon=0.1 )
# model.fit(X_train_normalized, y_train_normalized)
# y_pred_normalized = model.predict(X_test_normalized)

# mse = mean_squared_error(y_test_normalized, y_pred_normalized)
# print(f"Mean Squared Error: {mse}")


# Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# 'sag' and 'saga' shuffle the data while fitting, so without a seed the
# selected model and its score can differ between runs. The seed matches
# the one used for the train/test split.
ridge_model = Ridge(solver='saga', random_state=28)

# Search over regularization strength and solver; 'solver' in the grid
# overrides the value given to the base estimator above.
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'solver': ['auto', 'sag', 'saga'],
}

# 10-fold cross-validation, scored by (negated) mean squared error.
grid_search = GridSearchCV(ridge_model, param_grid, cv=10, scoring='neg_mean_squared_error')

grid_search.fit(X_train_normalized, y_train_normalized)

# `model` is the name the later prediction cells use.
best_ridge_model = grid_search.best_estimator_
model = best_ridge_model
y_pred_normalized = model.predict(X_test_normalized)

# Hold-out error, measured in normalized target units.
mse = mean_squared_error(y_test_normalized, y_pred_normalized)
print(f"Mean Squared Error: {mse}")


In [None]:
correlation_coefficient = np.corrcoef(y_pred_normalized, y_test_normalized)[0, 1]
print(f"Correlation Coefficient: {correlation_coefficient}")

In [None]:
y_pred_test = (y_pred_normalized * y_train.std()) + y_train.mean()

In [None]:
correlation_coefficient = np.corrcoef(y_pred_test, y_test)[0, 1]
print(f"Correlation Coefficient: {correlation_coefficient}")

# Prediction on Test data

In [None]:
X_test_norm = (df_test_clean - X_train.mean()) / X_train.std()

In [None]:
y_pred_norm = model.predict(X_test_norm)

In [None]:
y_pred = (y_pred_norm * y_train.std()) + y_train.mean()

In [None]:
y_pred.shape

In [None]:
df_test.shape

In [None]:
result_df = pd.DataFrame({
    'id': df_test['id'],
    'label': y_pred.flatten()
})
result_df.head()

In [None]:
result_df.to_csv('submission.csv', index=False)