In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import OneHotEncoder
from scipy.optimize import minimize

In [2]:
data = pd.read_csv('src/Fish.csv')

In [3]:
data.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [4]:
data.shape

(159, 7)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


In [6]:
data['Species'].unique()

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
      dtype=object)

In [7]:
encoder = OneHotEncoder(handle_unknown='ignore')
encoder_df = pd.DataFrame(encoder.fit_transform(data[['Species']]).toarray())
encoder_df.columns = encoder.get_feature_names_out(['Species'])
data = data.join(encoder_df)
data.drop(labels=["Species"],axis=1,inplace=True)

In [8]:
data.head()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width,Species_Bream,Species_Parkki,Species_Perch,Species_Pike,Species_Roach,Species_Smelt,Species_Whitefish
0,242.0,23.2,25.4,30.0,11.52,4.02,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,290.0,24.0,26.3,31.2,12.48,4.3056,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,340.0,23.9,26.5,31.1,12.3778,4.6961,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,363.0,26.3,29.0,33.5,12.73,4.4555,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,430.0,26.5,29.0,34.0,12.444,5.134,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Append two random columns to the DataFrame
np.random.seed(42)  # For reproducibility
data['Rand1'] = np.random.randint(10, 101, size=len(data))
data['Rand2'] = np.random.randint(1, 7, size=len(data))

In [10]:
data.head()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width,Species_Bream,Species_Parkki,Species_Perch,Species_Pike,Species_Roach,Species_Smelt,Species_Whitefish,Rand1,Rand2
0,242.0,23.2,25.4,30.0,11.52,4.02,1.0,0.0,0.0,0.0,0.0,0.0,0.0,61,3
1,290.0,24.0,26.3,31.2,12.48,4.3056,1.0,0.0,0.0,0.0,0.0,0.0,0.0,24,3
2,340.0,23.9,26.5,31.1,12.3778,4.6961,1.0,0.0,0.0,0.0,0.0,0.0,0.0,81,6
3,363.0,26.3,29.0,33.5,12.73,4.4555,1.0,0.0,0.0,0.0,0.0,0.0,0.0,70,4
4,430.0,26.5,29.0,34.0,12.444,5.134,1.0,0.0,0.0,0.0,0.0,0.0,0.0,30,2


In [11]:
X = data.drop(columns=['Weight'])
y = data['Weight']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models_results = {}

In [24]:
print(f"""
Shape of data: {data.shape}

Shape of X_train: {X_train.shape}
Shape of y_train: {y_train.shape}

Shape of X_test: {X_test.shape}
Shape of y_test: {y_test.shape}
""")


Shape of data: (159, 15)

Shape of X_train: (127, 14)
Shape of y_train: (127,)

Shape of X_test: (32, 14)
Shape of y_test: (32,)



In [13]:
model_1 = LinearRegression()
model_1.fit(X_train, y_train)
models_results['Model-1'] = {
    'coefficients': model_1.coef_,
    'intercept': model_1.intercept_,
    'explained_variance': model_1.score(X_test, y_test)
}

In [14]:
X_train_model_2 = X_train.drop(columns=['Rand1'])
X_test_model_2 = X_test.drop(columns=['Rand1'])

model_2 = LinearRegression()
model_2.fit(X_train_model_2, y_train)
models_results['Model-2'] = {
    'coefficients': model_2.coef_,
    'intercept': model_2.intercept_,
    'explained_variance': model_2.score(X_test_model_2, y_test)
}

In [15]:
X_train_model_3 = X_train.drop(columns=['Rand1', 'Rand2'])
X_test_model_3 = X_test.drop(columns=['Rand1', 'Rand2'])

model_3 = LinearRegression()
model_3.fit(X_train_model_3, y_train)
models_results['Model-3'] = {
    'coefficients': model_3.coef_,
    'intercept': model_3.intercept_,
    'explained_variance': model_3.score(X_test_model_3, y_test)
}

In [16]:
X_train_model_4 = X_train.head(2)
y_train_model_4 = y_train.head(2)

X_test_model_4 = X_test.head(2)
y_test_model_4 = y_test.head(2)

model_4 = LinearRegression()
model_4.fit(X_train_model_4, y_train_model_4)
models_results['Model-4'] = {
    'coefficients': model_4.coef_,
    'intercept': model_4.intercept_,
    'explained_variance': model_4.score(X_test, y_test)
}

In [28]:
for model, result in models_results.items():
    print(f"{model}:")
    print(f" -> Coefficients: {result['coefficients']}")
    print(f" -> Intercept: {result['intercept']}")
    print(f" -> Explained Variance: {result['explained_variance']:.2f}\n")
    print("*"*100)

Model-1:
 -> Coefficients: [-5.48306368e+01  5.18904100e+01  3.80898196e+01 -9.75373508e+00
  1.00508537e+01 -4.84316795e+01  9.46721461e+01  3.04525091e+01
 -3.68657926e+02 -8.41693687e+00  2.87175214e+02  1.32066741e+01
  4.54323021e-02  7.68524801e+00]
 -> Intercept: -809.3219926080676
 -> Explained Variance: 0.95

****************************************************************************************************
Model-2:
 -> Coefficients: [ -55.3972675    51.76838126   38.69949975   -9.94864225   10.26649787
  -48.66324741   94.46328398   30.94717698 -368.52316345   -8.77247275
  286.85653635   13.69188629    7.64505521]
 -> Intercept: -806.6823581752342
 -> Explained Variance: 0.95

****************************************************************************************************
Model-3:
 -> Coefficients: [ -59.1395506    55.61529428   39.05378679  -10.73045193    7.22621264
  -42.31151092   97.25091484   32.35928897 -373.68699528   -7.3166755
  288.37496441    5.33001349]
 ->

In [18]:
# "Optimization" task: Implement regression optimizer using SciPy
def loss_function(beta, X, y):
    predictions = X.dot(beta)
    return np.mean((predictions - y) ** 2)

In [19]:
initial_beta = np.zeros(X.shape[1])
result = minimize(loss_function, initial_beta, args=(X_train.values, y_train.values))
optimized_beta = result.x
print("Optimized Coefficients:", optimized_beta)

Optimized Coefficients: [-5.48275285e+01  5.18885791e+01  3.80886045e+01 -9.75432975e+00
  1.00526577e+01 -8.57740854e+02 -7.14642996e+02 -7.78865705e+02
 -1.17797348e+03 -8.17733658e+02 -5.22144531e+02 -7.96108644e+02
  4.54229746e-02  7.68520857e+00]


In [20]:
# "Regularization" task: Implement Lasso with upper bound constraint
lasso_upper_bound = Lasso(alpha=1000)
lasso_upper_bound.fit(X_train, y_train)
print("Lasso with Upper Bound Coefficients:", lasso_upper_bound.coef_)

Lasso with Upper Bound Coefficients: [ 0.          0.         20.66327454  0.          0.          0.
 -0.          0.         -0.         -0.         -0.          0.
 -0.          0.        ]


In [21]:
# Lasso with Lagrange multiplier
lasso_penalty = Lasso(alpha=0.0001)
lasso_penalty.fit(X_train, y_train)
print("Lasso with Lagrange Multiplier Coefficients:", lasso_penalty.coef_)

Lasso with Lagrange Multiplier Coefficients: [-3.11822853e+00  2.48314961e+01  1.75528158e+01 -8.83586908e+00
  9.01141807e+00  3.45167016e+01  1.15427151e+02  3.39595221e+01
 -3.60985492e+02  1.27687004e+01  2.71740114e+02  4.60762217e+01
  1.09254317e-01  8.02897852e+00]


  model = cd_fast.enet_coordinate_descent(


In [22]:
# Compare the two Lasso versions
# Note: In this case, you might want to adjust alpha to find a working regularization scenario
# Display results
print("Comparison of Lasso versions:")
print("Upper Bound Coefficients:", lasso_upper_bound.coef_)
print("Lagrange Multiplier Coefficients:", lasso_penalty.coef_)

Comparison of Lasso versions:
Upper Bound Coefficients: [ 0.          0.         20.66327454  0.          0.          0.
 -0.          0.         -0.         -0.         -0.          0.
 -0.          0.        ]
Lagrange Multiplier Coefficients: [-3.11822853e+00  2.48314961e+01  1.75528158e+01 -8.83586908e+00
  9.01141807e+00  3.45167016e+01  1.15427151e+02  3.39595221e+01
 -3.60985492e+02  1.27687004e+01  2.71740114e+02  4.60762217e+01
  1.09254317e-01  8.02897852e+00]


In [23]:
# Using built-in Lasso from sklearn
lasso_builtin = Lasso(alpha=0.01)
lasso_builtin.fit(X_train, y_train)
print("Built-in Lasso Coefficients:", lasso_builtin.coef_)

Built-in Lasso Coefficients: [-3.12344500e+00  2.47970597e+01  1.75484128e+01 -8.65595981e+00
  8.91023186e+00 -4.02653407e+00  7.70005233e+01 -3.62437666e+00
 -3.97640502e+02 -2.49110443e+01  2.33983449e+02  7.88521215e+00
  1.09003286e-01  8.02223389e+00]


  model = cd_fast.enet_coordinate_descent(
