In [11]:
# Loading the fish dataset

import pandas as pd

# Location of the Fish CSV that the rest of the notebook works on.
url = "https://raw.githubusercontent.com/WHPAN0108/BHT-DataScience-S24/main/regression/data/Fish.csv"

# Lift pandas' row-display limit before anything is rendered, so the frame
# shown at the end of this cell is not truncated.
pd.set_option('display.max_rows', None)

# Read the CSV into `df`; the bare name on the last line makes it the cell output.
df = pd.read_csv(filepath_or_buffer=url)
df

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134
5,Bream,450.0,26.8,29.7,34.7,13.6024,4.9274
6,Bream,500.0,26.8,29.7,34.5,14.1795,5.2785
7,Bream,390.0,27.6,30.0,35.0,12.67,4.69
8,Bream,450.0,27.6,30.0,35.1,14.0049,4.8438
9,Bream,500.0,28.5,30.7,36.2,14.2266,4.9594


In [13]:
# Clean the dataset if necessary before analysis.

import pandas as pd

# Data-quality checks on the loaded frame `df`: missing values, negative
# measurements and duplicated rows. Nothing is modified; the counts are only printed.

# Checking for missing values (one count per column)
print("\nNumber of missing values per column:")
print(df.isnull().sum())

# Checking for negative values
# 'number' selects every numeric dtype (integers as well as floats). The former
# float64-only filter would silently skip any column that pandas parsed as integer,
# even though the intent is to check all numeric columns.
# NOTE(review): the comparison is strictly `< 0`, so zero values pass this check;
# confirm whether a measurement of 0 should also be treated as invalid.
numeric_columns = df.select_dtypes(include='number').columns
negative_values = (df[numeric_columns] < 0).sum()
print("\nNumber of negative values per column:")
print(negative_values)

# Checking for duplicate entries (rows that repeat an earlier row in every column)
print("\nNumber of duplicate entries:")
print(df.duplicated().sum())

# Besides the checks above, all names in the "Species" column appear to be spelled consistently. The only other noticeable feature is that the columns "Height" and "Width" have more decimal places than the other float columns.
# Since the Jupyter Notebook already displays all values in those columns with the same number of decimal places, I see no need to adjust them. I also do not want to round them, because I want the prediction to be as precise as possible.


Number of missing values per column:
Species    0
Weight     0
Length1    0
Length2    0
Length3    0
Height     0
Width      0
dtype: int64

Number of negative values per column:
Weight     0
Length1    0
Length2    0
Length3    0
Height     0
Width      0
dtype: int64

Number of duplicate entries:
0


In [14]:
# Task 1
# 1. Split the dataset randomly into training (70%) and testing (30%) sets.
# 2. Apply the following models: - Linear Regression, - Random Forest
# 3. Calculate RMSE (Root Mean Squared Error) and R2 (Coefficient of Determination) on the test set.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Task 1 pipeline: one-hot encode 'Species', make a random 70/30 split, fit
# Linear Regression and Random Forest on the training part, and print RMSE and
# R^2 computed on the test part.

# Convert categorical variable 'Species' to dummy/indicator variables.
# The encoded frame gets its own name instead of overwriting `df`: rebinding `df`
# removed the 'Species' column from it, so running this cell a second time raised
# a KeyError and the raw frame was no longer available to later cells.
df_encoded = pd.get_dummies(df, columns=['Species'], drop_first=True)

# Split the dataset into features and target variable
X = df_encoded.drop('Weight', axis=1)
y = df_encoded['Weight']

# Split the data into training and testing sets (70% train, 30% test).
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Random Forest Model (100 trees, seeded for reproducibility)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluate Linear Regression
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)  # RMSE is the square root of the MSE
r2_lr = r2_score(y_test, y_pred_lr)
print("Linear Regression Model")
print("Root Mean Squared Error:", rmse_lr)
print("R^2 Score:", r2_lr)

# Evaluate Random Forest
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)  # RMSE is the square root of the MSE
r2_rf = r2_score(y_test, y_pred_rf)
print("\nRandom Forest Model")
print("Root Mean Squared Error:", rmse_rf)
print("R^2 Score:", r2_rf)

Linear Regression Model
Root Mean Squared Error: 87.15263646317108
R^2 Score: 0.9379921317795368

Random Forest Model
Root Mean Squared Error: 59.114636438879735
R^2 Score: 0.9714717068999104


In [15]:
# 4. Visualize the predictions by plotting y_pred vs y_test and compare the performance of the models.

import matplotlib
import matplotlib.pyplot as plt

# No backend is forced here. Selecting 'TkAgg' sends figures to an external Tk
# window instead of rendering them in the notebook, and fails on kernels
# without a display; the kernel's default backend is used instead.

# One figure per model, built by the same code: predictions on the y-axis
# against the true test values on the x-axis.
diagonal = [y_test.min(), y_test.max()]
for model_name, y_pred, color in (
    ('Linear Regression', y_pred_lr, 'blue'),
    ('Random Forest', y_pred_rf, 'green'),
):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, y_pred, color=color, label=model_name)
    # Dashed diagonal: points on this line are predicted exactly right.
    ax.plot(diagonal, diagonal, linestyle='--', color='gray')
    ax.set_xlabel('True Values')
    ax.set_ylabel('Predictions')
    ax.set_title(f'{model_name}: True Values vs. Predictions')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Note: The plots for Task 1 can be found as JPGs in the same GitHub repository as this ipynb-file.
# Comparison: The Random Forest model not only has a lower RMSE but also a higher R^2 score, and this shows in the plots: the points in the Random Forest plot lie closer to the diagonal line than the points in the Linear Regression plot.
# Thus I conclude that the Random Forest model performed better in this case with respect to prediction accuracy.

In [None]:
# 5. Provide your opinion on which metric, RMSE or R^2, is more appropriate in this case.

# Answer: For prediction accuracy, RMSE is more appropriate because it measures the typical size of the prediction error in the same unit as the target (grams), which makes it directly interpretable.
# A lower RMSE indicates better predictive accuracy, which is crucial if precise prediction of fish weight is the primary goal.

# For model fit and explanation, R^2 is valuable because it assesses how well the model fits the data and what share of the variance in fish weight it explains.
# It helps in understanding the model's overall performance in capturing the relationships between the predictors and the target variable.

# In summary, while both metrics provide valuable insights, RMSE is more directly relevant for assessing prediction accuracy in terms of absolute error,
# which is often critical in practical applications like this one, where the task is to predict fish weight based on various features.

In [16]:
# Task 2
# 1. Change the training-test split to ensure that each species has 70% of its samples in the training set and 30% in the test set.
# 2. Apply the following models: - Linear Regression, - Random Forest
# 3. Calculate RMSE (Root Mean Squared Error) and R2 (Coefficient of Determination) on the test set.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Task 2 pipeline: same models and metrics as before, but the 70/30 split is
# stratified by the species labels.

# Start again from the raw CSV so that the 'Species' column is available.
url = "https://raw.githubusercontent.com/WHPAN0108/BHT-DataScience-S24/main/regression/data/Fish.csv"
pd.set_option('display.max_rows', None)
df = pd.read_csv(filepath_or_buffer=url)

# Keep the species labels aside before encoding; they drive the stratification.
species_column = df['Species']

# One-hot encode 'Species' (the first level is dropped).
df = pd.get_dummies(data=df, columns=['Species'], drop_first=True)

# 'Weight' is the target; every remaining column is a feature.
y = df['Weight']
X = df.drop(columns='Weight')

# Stratified split: 70% train / 30% test, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=species_column,
    random_state=42,
)

# Fit both regressors on the training part (fit() returns the estimator itself).
lr_model = LinearRegression().fit(X_train, y_train)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out test part with each model.
y_pred_lr = lr_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Test-set metrics: MSE, its square root (RMSE) and R^2 for each model.
mse_lr = mean_squared_error(y_test, y_pred_lr)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_lr = np.sqrt(mse_lr)
rmse_rf = np.sqrt(mse_rf)
r2_lr = r2_score(y_test, y_pred_lr)
r2_rf = r2_score(y_test, y_pred_rf)

# Report: Linear Regression first, then Random Forest.
print("Linear Regression Model")
print("Root Mean Squared Error:", rmse_lr)
print("R^2 Score:", r2_lr)

print("\nRandom Forest Model")
print("Root Mean Squared Error:", rmse_rf)
print("R^2 Score:", r2_rf)

Linear Regression Model
Root Mean Squared Error: 99.1959087197982
R^2 Score: 0.9381668183544444

Random Forest Model
Root Mean Squared Error: 75.4860398306369
R^2 Score: 0.9641930477558176


In [17]:
# 4. Visualize the predictions by plotting y_pred vs y_test and compare the performance of the models.

import matplotlib
import matplotlib.pyplot as plt

# No backend is forced here. Selecting 'TkAgg' sends figures to an external Tk
# window instead of rendering them in the notebook, and fails on kernels
# without a display; the kernel's default backend is used instead.

# One figure per model, built by the same code: predictions on the y-axis
# against the true test values on the x-axis.
diagonal = [y_test.min(), y_test.max()]
for model_name, y_pred, color in (
    ('Linear Regression', y_pred_lr, 'blue'),
    ('Random Forest', y_pred_rf, 'green'),
):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, y_pred, color=color, label=model_name)
    # Dashed diagonal: points on this line are predicted exactly right.
    ax.plot(diagonal, diagonal, linestyle='--', color='gray')
    ax.set_xlabel('True Values')
    ax.set_ylabel('Predictions')
    ax.set_title(f'{model_name}: True Values vs. Predictions')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Note: The plots for Task 2 can be found as JPGs in the same GitHub repository as this ipynb-file.
# Comparison: Just as in Task 1, the Random Forest model again has both a lower RMSE and a higher R^2 score, and this shows in the plots, though not as clearly as in Task 1.
# For the most part, the points in the Random Forest plot lie a bit closer to the diagonal line than the points in the Linear Regression plot.
# Thus I conclude that the Random Forest model again performed better with respect to prediction accuracy.

In [None]:
# Comparison
# Compare the results obtained from Task 1 and Task 2.

# Answer: The comparison between Task 1 and Task 2 reveals noticeable differences in the RMSE values for both the Linear Regression and the Random Forest models.
# Task 2, which used a stratified split placing 70% of each fish species in the training set, resulted in higher RMSE values (about 99.2 and 75.5) than Task 1's purely random split (about 87.2 and 59.1).
# This suggests that the random split in Task 1 may have produced a test set that was easier to predict; note that the two tasks are evaluated on different test samples, so their RMSE values are not directly comparable.
# However, the R^2 scores remained relatively consistent between the two tasks (about 0.94 for Linear Regression and 0.96-0.97 for Random Forest), indicating similar explanatory power of the models despite the different training-test splits.
# Thus, while the stratified split in Task 2 changed the absolute prediction error, its impact on overall model performance, as measured by R^2, was less pronounced.

In [18]:
# Extra Points
# Point out which parameters can be adjusted in this exercise to improve model performance. (dont need to run analysis again)

# Answer: Although I did not adjust any specific parameters in this exercise, I researched which parameters could be tuned to improve model performance. This is what I found:

# Exploring different hyperparameter settings for both model families may result in better model performance. For example:
# For Random Forest: adjusting the number of trees (n_estimators), the maximum depth of the trees (max_depth), and the minimum number of samples required to split an internal node (min_samples_split).
# For Linear Regression: ordinary least squares itself has no hyperparameters to tune, but regularized variants (e.g. Ridge or Lasso regression) can be used instead, and their regularization strength (alpha) can be adjusted.

# I also found that cross-validation may not only give a better estimate of model performance, but also help to detect overfitting.
# This involves splitting the data into multiple subsets (folds), similar to the train/test splits in Task 1 and Task 2, and repeatedly training the model on all but one fold while evaluating it on the held-out fold. The scores are then averaged over the folds, which makes it possible to compare different hyperparameter settings and find out which one results in the best model performance.