### Defining Regression Models - Linear, Lasso, Ridge

In [2]:
import numpy as np

class LinearRegression:
    def __init__(self, lr=0.01, n_iter=1000):
        """
        Initialize the Linear Regression model.
        
        Parameters:
        lr (float): Learning rate for gradient descent.
        n_iter (int): Number of iterations for gradient descent.
        """
        self.lr = lr
        self.n_iter = n_iter
        self.weights = None
        self.bias = None
        
    def fit(self, X, y):
        """
        Fit the Linear Regression model to the training data.
        
        Parameters:
        X (numpy.ndarray): Input features of shape (n_samples, n_features).
        y (numpy.ndarray): Target values of shape (n_samples,).
        """
        n_samples, n_features = X.shape
        
        # Parameter initialization
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        for _ in range(self.n_iter):
            # Make predictions
            y_pred = self.predict(X)
            
            # Compute gradients
            dW = np.dot(X.T, (y_pred - y)) / n_samples
            db = np.sum(y_pred - y) / n_samples
            
            # Update parameters using gradient descent
            self.weights -= self.lr * dW
            self.bias -= self.lr * db
        
    def predict(self, X):
        """
        Make predictions using the trained model.
        
        Parameters:
        X (numpy.ndarray): Input features of shape (n_samples, n_features).
        
        Returns:
        y_pred (numpy.ndarray): Predicted values of shape (n_samples,).
        """
        return np.dot(X, self.weights) + self.bias


In [3]:
import numpy as np

class RidgeRegression:
    def __init__(self, alpha=1, lr=0.01, n_iter=1000):
        """
        Initialize the Ridge Regression model.
        
        Parameters:
        alpha (float): Regularization strength (L2 penalty).
        lr (float): Learning rate for gradient descent.
        n_iter (int): Number of iterations for gradient descent.
        """
        self.alpha = alpha
        self.lr = lr
        self.n_iter = n_iter
        self.weights = None
        self.bias = None
        
    def fit(self, X, y):
        """
        Fit the Ridge Regression model to the training data.
        
        Parameters:
        X (numpy.ndarray): Input features of shape (n_samples, n_features).
        y (numpy.ndarray): Target values of shape (n_samples,).
        """
        n_samples, n_features = X.shape
        
        # Parameter initialization
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        for _ in range(self.n_iter):
            # Make predictions
            y_pred = self.predict(X)
            
            # Compute gradients with L2 regularization
            dW = (-(2 * np.dot(X.T, (y - y_pred))) + (2 * self.alpha * self.weights)) / n_samples
            db = -2 * np.sum(y_pred - y) / n_samples
            
            # Update parameters using gradient descent
            self.weights -= self.lr * dW
            self.bias -= self.lr * db
        
    def predict(self, X):
        """
        Make predictions using the trained model.
        
        Parameters:
        X (numpy.ndarray): Input features of shape (n_samples, n_features).
        
        Returns:
        y_pred (numpy.ndarray): Predicted values of shape (n_samples,).
        """
        return np.dot(X, self.weights) + self.bias


In [4]:
import numpy as np

class LassoRegression:
    def __init__(self, alpha=1, lr=0.01, n_iter=1000):
        """
        Initialize the Lasso Regression model.
        
        Parameters:
        alpha (float): Regularization strength (L1 penalty).
        lr (float): Learning rate for gradient descent.
        n_iter (int): Number of iterations for gradient descent.
        """
        self.alpha = alpha
        self.lr = lr
        self.n_iter = n_iter
        self.weights = None
        self.bias = None
        
    def fit(self, X, y):
        """
        Fit the Lasso Regression model to the training data.
        
        Parameters:
        X (numpy.ndarray): Input features of shape (n_samples, n_features).
        y (numpy.ndarray): Target values of shape (n_samples,).
        """
        n_samples, n_features = X.shape
        
        # Parameter initialization
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        for _ in range(self.n_iter):
            # Make predictions
            y_pred = self.predict(X)
            
            # Compute gradients with L1 regularization
            dW = (-(2 * np.dot(X.T, (y - y_pred))) + (self.alpha)) / n_samples
            db = -2 * np.sum(y_pred - y) / n_samples
            
            # Update parameters using gradient descent
            self.weights -= self.lr * dW
            self.bias -= self.lr * db
        
    def predict(self, X):
        """
        Make predictions using the trained model.
        
        Parameters:
        X (numpy.ndarray): Input features of shape (n_samples, n_features).
        
        Returns:
        y_pred (numpy.ndarray): Predicted values of shape (n_samples,).
        """
        return np.dot(X, self.weights) + self.bias


# Data Preprocessing

In [5]:
import pandas as pd

# Define the path to the CSV file
data_path = "./EPL_Soccer_MLR_LR.csv"

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(data_path)

# Print the shape (number of rows and columns) of the DataFrame
print("ACTUAL DF SHAPE : ", df.shape)


ACTUAL DF SHAPE :  (217, 13)


In [6]:
df.head()

Unnamed: 0,PlayerName,Club,DistanceCovered(InKms),Goals,MinutestoGoalRatio,ShotsPerGame,AgentCharges,BMI,Cost,PreviousClubCost,Height,Weight,Score
0,"Braund, Mr. Owen Harris",MUN,3.96,7.5,37.5,12.3,60.0,20.56,109.1,63.32,195.9,78.9,19.75
1,"Allen, Mr. William Henry",MUN,4.41,8.3,38.2,12.7,68.0,20.67,102.8,58.55,189.7,74.4,21.3
2,"Moran, Mr. James",MUN,4.14,5.0,36.4,11.6,21.0,21.86,104.6,55.36,177.8,69.1,19.88
3,"McCarthy, Mr. Timothy J",MUN,4.11,5.3,37.3,12.6,69.0,21.88,126.4,57.18,185.0,74.9,23.66
4,"Palsson, Master. Gosta Leonard",MUN,4.45,6.8,41.5,14.0,29.0,18.96,80.3,53.2,184.6,64.6,17.64


In [7]:
# Drop rows with all null values
df.dropna(axis=0, how='all', thresh=None, subset=None, inplace=True)

# Select only numeric columns (drop categorical columns)
new_df = df.select_dtypes(['number'])


In [8]:
# Select independent and dependent features
X = new_df.iloc[:, :-1]  # Independent features
y = new_df.iloc[:, -1]   # Dependent feature

# Print the original shape of X (independent features)
print("org shape of X : ", X.shape)

# Initialize a set to store correlated features
correlated_features = set()

# Compute the correlation matrix for the independent features
correlation_matrix = X.corr()


org shape of X :  (202, 10)


In [9]:
# Displaying Correlation Matrix

correlation_matrix


Unnamed: 0,DistanceCovered(InKms),Goals,MinutestoGoalRatio,ShotsPerGame,AgentCharges,BMI,Cost,PreviousClubCost,Height,Weight
DistanceCovered(InKms),1.0,0.147098,0.924964,0.8888,0.250865,0.299471,-0.403004,0.550975,0.358854,0.403743
Goals,0.147098,1.0,0.153333,0.134721,0.131973,0.177032,0.137131,0.102734,0.076958,0.155844
MinutestoGoalRatio,0.924964,0.153333,1.0,0.950757,0.25824,0.320527,-0.449135,0.583375,0.371192,0.423699
ShotsPerGame,0.8888,0.134721,0.950757,1.0,0.308391,0.382524,-0.435429,0.610986,0.352322,0.455255
AgentCharges,0.250865,0.131973,0.25824,0.308391,1.0,0.302556,-0.108243,0.317581,0.123255,0.273686
BMI,0.299471,0.177032,0.320527,0.382524,0.302556,1.0,0.321116,0.713858,0.337097,0.845955
Cost,-0.403004,0.137131,-0.449135,-0.435429,-0.108243,0.321116,1.0,-0.207749,-0.071253,0.154227
PreviousClubCost,0.550975,0.102734,0.583375,0.610986,0.317581,0.713858,-0.207749,1.0,0.802119,0.930904
Height,0.358854,0.076958,0.371192,0.352322,0.123255,0.337097,-0.071253,0.802119,1.0,0.780906
Weight,0.403743,0.155844,0.423699,0.455255,0.273686,0.845955,0.154227,0.930904,0.780906,1.0


In [10]:
# Correlated Features

# Initialize a set to store correlated features
correlated_features = set()

# Iterate through the correlation matrix
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        # Check if the absolute correlation coefficient is greater than 0.8 (highly correlated)
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            # Get the name of the correlated feature
            colname = correlation_matrix.columns[i]
            # Add the name to the set of correlated features
            correlated_features.add(colname)

# Print the correlated features
print("Correlated Features : ", correlated_features)


Correlated Features :  {'Height', 'MinutestoGoalRatio', 'Weight', 'ShotsPerGame'}


In [None]:
# Drop correlated features from X
X.drop(columns=correlated_features, axis=1, inplace=True)

# Print the shape of X after dropping correlated features
print("Shape of X after dropping correlated features : ", X.shape)


In [12]:
# Implementing train_test_split function

def shuffle_data(X, y, seed=None):
    """ Random shuffle of the samples in X and y """
    if seed:
        np.random.seed(seed)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    try:
        return X[idx], y[idx]
    except:
        return X.iloc[idx], y.iloc[idx]

def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None):
    """ Split the data into train and test sets """
    if shuffle:
        X, y = shuffle_data(X, y, seed)
    # Split the training data from test data in the ratio specified in test_size
    split_i = len(y) - int(len(y) // (1 / test_size))
    X_train, X_test = X[:split_i], X[split_i:]
    y_train, y_test = y[:split_i], y[split_i:]

    return X_train, X_test, y_train, y_test

# Splitting the data into training and testing sets with a test size of 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, seed=42)


# Model Creation

#### _*Linear Model*_

In [13]:
# Create an instance of the LinearRegression class with specified learning rate and number of iterations
linear_model = LinearRegression(lr=0.00001, n_iter=100)

# Fit the linear model on the training data
linear_model.fit(X_train, y_train)

# Use the trained model to make predictions on the test data
linear_predict = linear_model.predict(X_test)


#### _*Lasso Model*_

In [14]:
# Create an instance of the LassoRegression class with specified alpha (regularization strength), learning rate, and number of iterations
lasso_model = LassoRegression(alpha=0.03, lr=0.00001, n_iter=100)

# Fit the Lasso model on the training data
lasso_model.fit(X_train, y_train)

# Use the trained Lasso model to make predictions on the test data
lasso_predict = lasso_model.predict(X_test)


#### _*Ridge Model*_

In [15]:
# Create an instance of the RidgeRegression class with specified alpha (regularization strength), learning rate, and number of iterations
ridge_model = RidgeRegression(alpha=0.03, lr=0.00001, n_iter=100)

# Fit the Ridge model on the training data
ridge_model.fit(X_train, y_train)

# Use the trained Ridge model to make predictions on the test data
ridge_predict = ridge_model.predict(X_test)


# Metrics

#### _*MSE*_

In [16]:
def mean_squared_error(y_true, y_pred):
    # Calculate the squared differences between true and predicted values
    squared_errors = (y_true - y_pred) ** 2
    
    # Calculate the mean of the squared errors to get the MSE
    mse = np.mean(squared_errors)
    
    return mse


#### _*R2 Score*_

In [17]:
def r2_score(y_true, y_pred):
    # Calculate the correlation matrix between true and predicted values
    corr_matrix = np.corrcoef(y_true, y_pred)
    
    # Extract the correlation coefficient from the matrix
    corr = corr_matrix[0, 1]
    
    # Calculate R-squared
    r2 = corr ** 2
    
    return r2


In [18]:
print("MSE of Linear Model : ", mean_squared_error(y_test, linear_predict))
print("MSE of Lasso Model : ", mean_squared_error(y_test, lasso_predict))
print("MSE of Ridge Model : ", mean_squared_error(y_test, ridge_predict))

MSE of Linear Model :  4.288101722760345
MSE of Lasso Model :  2.849692797681988
MSE of Ridge Model :  2.8496948261295376


In [19]:
print("R2 Score of Linear Model : ", r2_score(y_test, linear_predict))
print("R2 Score of Lasso Model : ", r2_score(y_test, lasso_predict))
print("R2 Score of Ridge Model : ", r2_score(y_test, ridge_predict))

[[1.         0.96521229]
 [0.96521229 1.        ]]
R2 Score of Linear Model :  0.9316347693382256
[[1.         0.97366924]
 [0.97366924 1.        ]]
R2 Score of Lasso Model :  0.9480317900859903
[[1.         0.97366923]
 [0.97366923 1.        ]]
R2 Score of Ridge Model :  0.9480317788420329
