<a href="https://colab.research.google.com/github/DPS-2912/C/blob/main/Assignment4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# QUESTION-1

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

class RidgeRegressionGD:
    """Ridge (L2-regularised) linear regression trained by batch gradient descent.

    The fitted model is ``y_hat = X @ theta + bias``. Only ``theta`` is
    penalised; the bias term is left unregularised.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size applied to every gradient update.
    n_iterations : int, default=1000
        Number of full-batch gradient steps performed by ``fit``.
    reg_param : float, default=0.1
        Strength of the L2 penalty on ``theta``.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, reg_param=0.1):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.reg_param = reg_param
        self.theta = None  # weight vector, allocated by fit()
        self.bias = 0.0

    def fit(self, X, y):
        """Learn ``theta`` and ``bias`` from ``X`` (n_samples, n_features) and ``y``.

        ``y`` may be 1-D or a single column; it is flattened before use.
        Every call starts again from zero weights and zero bias, so fitting the
        same instance twice on the same data gives the same model.

        Returns
        -------
        self
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y: an (n, 1) target would otherwise broadcast the residual
        # against the (n,) prediction into an (n, n) matrix.
        y = np.asarray(y, dtype=float).ravel()

        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]}"
            )

        # Reset both parameters; previously only theta was reset, so a refit
        # silently continued from the previous bias.
        self.theta = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iterations):
            y_pred = np.dot(X, self.theta) + self.bias
            residual = y_pred - y
            dw = (1 / n_samples) * np.dot(X.T, residual) + self.reg_param * self.theta
            db = (1 / n_samples) * np.sum(residual)

            self.theta -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

        return self

    def predict(self, X):
        """Return ``X @ theta + bias``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.theta is None:
            raise RuntimeError("RidgeRegressionGD must be fitted before calling predict")
        return np.dot(X, self.theta) + self.bias

    def cost_function(self, y_true, y_pred):
        """Return mean squared error plus ``reg_param * sum(theta ** 2)``.

        The gradient used in ``fit`` is exactly half the gradient of this
        expression, so both share the same minimiser.
        """
        y_true = np.asarray(y_true, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()
        return np.mean((y_true - y_pred) ** 2) + self.reg_param * np.sum(self.theta ** 2)

# Synthetic data: features and targets are drawn independently from a uniform
# distribution, with a fixed seed so the run is repeatable.
np.random.seed(42)
X_train = np.random.rand(100, 7)
y_train = np.random.rand(100)
X_test = np.random.rand(20, 7)
y_test = np.random.rand(20)

# Standardise with statistics taken from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter grid to search.
learning_rates = [0.0001, 0.001, 0.01, 0.1]
reg_params = [1e-5, 1e-3, 0, 1]

best_cost = float('inf')
best_r2 = float('-inf')
best_lr = None
best_reg = None

for lr in learning_rates:
    for reg in reg_params:
        ridge_gd = RidgeRegressionGD(learning_rate=lr, n_iterations=1000, reg_param=reg)
        ridge_gd.fit(X_train_scaled, y_train)

        y_pred_train = ridge_gd.predict(X_train_scaled)
        y_pred_test = ridge_gd.predict(X_test_scaled)

        cost = ridge_gd.cost_function(y_train, y_pred_train)
        r2 = r2_score(y_test, y_pred_test)

        # Rank candidates by test R2, using the training cost only to break
        # ties. Requiring `cost < best_cost and r2 > best_r2` at the same time
        # made the winner depend on iteration order: a candidate with a higher
        # R2 but a marginally higher cost than the running best was discarded.
        # NOTE: choosing hyperparameters on the test split makes the reported
        # R2 optimistic; a separate validation split would avoid that.
        if r2 > best_r2 or (r2 == best_r2 and cost < best_cost):
            best_cost = cost
            best_r2 = r2
            best_lr = lr
            best_reg = reg

print(f"Best Learning Rate: {best_lr}")
print(f"Best Regularization Parameter: {best_reg}")
# The cost shown belongs to the selected model; it is not necessarily the
# smallest cost over the whole grid, so the label says so.
print(f"Training Cost of Best Model: {best_cost}")
print(f"Maximum R2 Score: {best_r2}")

Best Learning Rate: 0.01
Best Regularization Parameter: 1e-05
Minimum Cost: 0.06931256649186872
Maximum R2 Score: 0.0181983828641622


In [None]:
# QUESTION-2

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv("https://drive.google.com/uc?export=download&id=1q0FKlrfHmsXqhUrDYh5K3P26biXEUcTs")

# Rows without a Salary have no target to learn from, so they are dropped.
# The mean imputation of Salary that used to follow was a no-op (the column has
# no missing values once these rows are gone) and has been removed.
data = data.dropna(subset=['Salary'])

# One-hot encode the categorical columns; every other column passes through.
categorical_features = ['League', 'Division', 'NewLeague']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')
data_transformed = transformer.fit_transform(data)

# Restore column labels; passthrough columns are prefixed with "remainder__".
data_transformed = pd.DataFrame(data_transformed, columns=transformer.get_feature_names_out())

salary_column = 'remainder__Salary'

# Separate input and output features
X = data_transformed.drop(salary_column, axis=1)
y = data_transformed[salary_column]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform scaling (statistics come from the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Fit a Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Fit a Ridge Regression model
ridge_model = Ridge(alpha=0.5)
ridge_model.fit(X_train, y_train)

# Fit a LASSO Regression model (generous iteration budget for coordinate descent)
lasso_model = Lasso(alpha=0.5, max_iter=10000)
lasso_model.fit(X_train, y_train)


# Predict and evaluate Linear Regression model
y_pred_linear = linear_model.predict(X_test)
mse_linear = mean_squared_error(y_test, y_pred_linear)
print(f'Linear Regression MSE: {mse_linear}')

# Predict and evaluate Ridge Regression model
y_pred_ridge = ridge_model.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(f'Ridge Regression MSE: {mse_ridge}')

# Predict and evaluate LASSO Regression model
y_pred_lasso = lasso_model.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print(f'LASSO Regression MSE: {mse_lasso}')

# Determine which model performs the best: tuples compare on their first
# element, so min() picks the (mse, name) pair with the lowest test MSE.
best_model = min((mse_linear, 'Linear'), (mse_ridge, 'Ridge'), (mse_lasso, 'LASSO'))
print(f'The best model is {best_model[1]} with an MSE of {best_model[0]}')


Linear Regression MSE: 128284.34549672344
Ridge Regression MSE: 126665.75189802826
LASSO Regression MSE: 126820.57587652761
The best model is Ridge with an MSE of 126665.75189802826


In [None]:
import pandas as pd
import numpy as np

# Fetch the raw CSV.
url = "https://drive.google.com/uc?export=download&id=16WWIwbnFZjahcCKNDkMPBq1gvZDV_lEY"
data = pd.read_csv(url)

# Report how many values are missing in each column.
null_counts = data.isnull().sum()
print(null_counts)

# Discard every row containing a null, then one-hot encode the categorical
# columns, dropping the first level of each.
data = pd.get_dummies(data.dropna(), drop_first=True)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the 'Salary' column; every remaining column is a feature.
y = data['Salary']
X = data.drop(columns='Salary')

# Hold out 20% of the rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics on the training split and reuse them for the
# test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

from sklearn.linear_model import Lasso, LinearRegression, Ridge

# Initialize models (each exactly once; the linear model was previously
# created and fitted twice with identical inputs).
linear_model = LinearRegression()
ridge_model = Ridge(alpha=0.1248, max_iter=200, tol=1e-4)
# max_iter=200 stopped LASSO's coordinate descent before it converged
# (scikit-learn emitted a ConvergenceWarning). The budget is raised to the
# value used for the LASSO model in the previous question, to give the solver
# room to converge.
lasso_model = Lasso(alpha=0.1248, max_iter=10000, tol=1e-4)

# Fit the models on the scaled training split
linear_model.fit(X_train_scaled, y_train)
ridge_model.fit(X_train_scaled, y_train)
lasso_model.fit(X_train_scaled, y_train)

from sklearn.metrics import mean_squared_error, r2_score

# Score a fitted model on held-out data
def evaluate_model(model, X_test, y_test):
    """Return ``(rmse, r2)`` for ``model``'s predictions on ``X_test``.

    ``rmse`` is the square root of the mean squared error between ``y_test``
    and ``model.predict(X_test)``; ``r2`` is the R² score of the same pair.
    """
    y_hat = model.predict(X_test)
    root_mse = np.sqrt(mean_squared_error(y_test, y_hat))
    return root_mse, r2_score(y_test, y_hat)

# Score the three fitted models on the scaled test split, in the order
# linear -> ridge -> LASSO.
(linear_rmse, linear_r2), (ridge_rmse, ridge_r2), (lasso_rmse, lasso_r2) = (
    evaluate_model(fitted, X_test_scaled, y_test)
    for fitted in (linear_model, ridge_model, lasso_model)
)

# Report RMSE and R² for each model
print("Linear Regression RMSE:", linear_rmse, "R²:", linear_r2)
print("Ridge Regression RMSE:", ridge_rmse, "R²:", ridge_r2)
print("LASSO Regression RMSE:", lasso_rmse, "R²:", lasso_r2)

AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64
Linear Regression RMSE: 358.1680408645131 R²: 0.290745185579814
Ridge Regression RMSE: 357.10230176355935 R²: 0.2949597213851176
LASSO Regression RMSE: 355.2648892609774 R²: 0.30219639867303383


  model = cd_fast.enet_coordinate_descent(
