In [14]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fish-market/Fish.csv


In [15]:
# Load the dataset
file_path = "/kaggle/input/fish-market/Fish.csv"
df = pd.read_csv(file_path)

# Display basic info to confirm successful loading
df.info(), df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


(None,
   Species  Weight  Length1  Length2  Length3   Height   Width
 0   Bream   242.0     23.2     25.4     30.0  11.5200  4.0200
 1   Bream   290.0     24.0     26.3     31.2  12.4800  4.3056
 2   Bream   340.0     23.9     26.5     31.1  12.3778  4.6961
 3   Bream   363.0     26.3     29.0     33.5  12.7300  4.4555
 4   Bream   430.0     26.5     29.0     34.0  12.4440  5.1340)

In [16]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm
import numpy as np

# Define independent and dependent variables
X = df[['Length1', 'Length2', 'Length3', 'Height', 'Width']]
y = df['Weight']

# Transform features into polynomial terms (degree=2 for this analysis)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Keep track of model performance
results = {}

# Helper function to calculate adjusted R²
def adjusted_r2(r2, n, k):
    return 1 - ((1 - r2) * (n - 1) / (n - k - 1))

# 1. Keep All Variables (Full Model)
lin_reg = LinearRegression()
lin_reg.fit(X_poly, y)
y_pred = lin_reg.predict(X_poly)
r2_full = r2_score(y, y_pred)
adj_r2_full = adjusted_r2(r2_full, len(y), X_poly.shape[1])
results['Keep All Variables'] = (r2_full, adj_r2_full)

# 2. Backward Elimination
X_poly_with_const = sm.add_constant(X_poly)
model = sm.OLS(y, X_poly_with_const).fit()
p_values = model.pvalues[1:]  # Exclude intercept
selected_features = np.where(p_values < 0.05)[0]

X_selected = X_poly[:, selected_features]  # Keep only significant features
lin_reg.fit(X_selected, y)
y_pred = lin_reg.predict(X_selected)
r2_backward = r2_score(y, y_pred)
adj_r2_backward = adjusted_r2(r2_backward, len(y), X_selected.shape[1])
results['Backward Elimination'] = (r2_backward, adj_r2_backward)

# 3. Forward Selection
remaining_features = []
best_r2 = -np.inf

for i in range(X_poly.shape[1]):
    temp_features = remaining_features + [i]
    X_temp = X_poly[:, temp_features]
    lin_reg.fit(X_temp, y)
    y_pred = lin_reg.predict(X_temp)
    r2_temp = r2_score(y, y_pred)
    
    if r2_temp > best_r2:
        best_r2 = r2_temp
        remaining_features.append(i)

X_forward = X_poly[:, remaining_features]
lin_reg.fit(X_forward, y)
y_pred = lin_reg.predict(X_forward)
r2_forward = r2_score(y, y_pred)
adj_r2_forward = adjusted_r2(r2_forward, len(y), X_forward.shape[1])
results['Forward Selection'] = (r2_forward, adj_r2_forward)

# 4. Bidirectional Selection (Combining Forward & Backward)
selected_features = []
best_r2 = -np.inf

for _ in range(X_poly.shape[1]):
    # Forward Step
    remaining_features = [i for i in range(X_poly.shape[1]) if i not in selected_features]
    best_feature = None
    
    for feature in remaining_features:
        temp_features = selected_features + [feature]
        X_temp = X_poly[:, temp_features]
        lin_reg.fit(X_temp, y)
        y_pred = lin_reg.predict(X_temp)
        r2_temp = r2_score(y, y_pred)
        
        if r2_temp > best_r2:
            best_r2 = r2_temp
            best_feature = feature

    if best_feature is not None:
        selected_features.append(best_feature)

    # Backward Step
    X_temp = X_poly[:, selected_features]
    lin_reg.fit(X_temp, y)
    model = sm.OLS(y, sm.add_constant(X_temp)).fit()
    p_values = model.pvalues[1:]

    for i, p_val in enumerate(p_values):
        if p_val > 0.05:
            selected_features.pop(i)

X_bidirectional = X_poly[:, selected_features]
lin_reg.fit(X_bidirectional, y)
y_pred = lin_reg.predict(X_bidirectional)
r2_bidirectional = r2_score(y, y_pred)
adj_r2_bidirectional = adjusted_r2(r2_bidirectional, len(y), X_bidirectional.shape[1])
results['Bidirectional Selection'] = (r2_bidirectional, adj_r2_bidirectional)

# Show results
results


{'Keep All Variables': (0.9832188512811139, 0.9807868007421449),
 'Backward Elimination': (0.8256300958984538, 0.8233945843074084),
 'Forward Selection': (0.9832188512811129, 0.9807868007421437),
 'Bidirectional Selection': (0.9722423295067167, 0.9717050842713628)}