# **Linear Regression**

**Splitting Dataset**

In [None]:
import pandas as pd

# Load the slice-localization dataset from the current working directory.
df = pd.read_csv('slice_localization_data.csv')

# Fill every missing cell with its column's mode; `.iloc[0]` picks the first
# row of the per-column mode table.
df_filled = df.fillna(value=df.mode().iloc[0])

In [None]:
# Remove the 'patientId' column, which is not used as a feature, then
# separate the 'reference' target column from the remaining feature columns.

df_copy = df_filled.drop(columns=['patientId'])
df_y = df_copy['reference']
df_x = df_copy.drop(columns=['reference'])

In [None]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.3,
    random_state=42,
)

# Preview the first rows of each piece of the split (title line, then data).
print("X_train:", X_train.head(), sep="\n")
print("\nX_test:", X_test.head(), sep="\n")
print("\ny_train:", y_train.head(), sep="\n")
print("\ny_test:", y_test.head(), sep="\n")

**Apply Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model with scikit-learn's default settings,
# fitted on the training split only.
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

**Evaluation Metrics**

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

# Evaluate the fitted model on the held-out test split.

y_pred = lm.predict(X_test)

# Each metric takes (true values, predicted values).
mse, mae, r2 = (
    mean_squared_error(y_test, y_pred),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)
print('R-squared:', r2)

In [None]:
# Evaluate the fitted model on the training split it was fitted on.
# NOTE: this rebinds y_pred, mse, mae and r2, replacing any earlier values.

y_pred = lm.predict(X_train)

# Each metric takes (true values, predicted values).
mse, mae, r2 = (
    mean_squared_error(y_train, y_pred),
    mean_absolute_error(y_train, y_pred),
    r2_score(y_train, y_pred),
)

print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)
print('R-squared:', r2)

In [None]:
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
# Define the function to plot learning curves
def plotLearningCurves(X, y, step):
    """Plot train and test mean squared error against training-set size.

    For every size N in ``10, 10 + step, 10 + 2 * step, ...`` that is
    strictly below ``(m // 10) * 10 + 10`` (``m`` being the row count of
    ``X``), an unfitted copy of the module-level estimator ``lm`` is fitted on
    the first N rows of ``X``/``y``. The mean squared error of that copy is
    then recorded on those same N rows and on the module-level
    ``X_test``/``y_test`` split, and both curves are drawn against N on a
    log-scaled y axis.

    Parameters
    ----------
    X : object with ``.shape`` that supports ``X[:N]`` row slicing
        Training features.
    y : object that supports ``y[:N]`` slicing
        Training targets, aligned row-for-row with ``X``.
    step : int
        Increment between successive training-set sizes.

    Returns
    -------
    None
        The figure is created as a side effect.

    Notes
    -----
    Relies on ``lm``, ``X_test``, ``y_test``, ``mean_squared_error`` and
    ``plt`` being defined at module level. As before, ``lm`` is left fitted
    on the full ``(X, y)``; the per-size fits use clones and do not touch it.
    """
    # Imported locally so the function does not depend on another cell
    # having brought these names into scope.
    import numpy as np
    from sklearn.base import clone

    m = X.shape[0]
    max_size = m // 10 * 10
    N_size_arr = np.arange(10, max_size + 10, step)
    # Column 0 holds the training error, column 1 the test error.
    error_arr = np.zeros((len(N_size_arr), 2))

    # Keep the original side effect: the shared estimator ends up fitted on
    # the full data passed in.
    lm.fit(X, y)

    for index, size in enumerate(N_size_arr):
        # Use the caller-supplied data, not the module-level training split.
        X_subset = X[:size]
        y_subset = y[:size]

        # A learning curve needs a model trained on exactly this many rows,
        # so refit an unfitted copy of the estimator for every size.
        model = clone(lm).fit(X_subset, y_subset)

        error_arr[index, 0] = mean_squared_error(y_subset, model.predict(X_subset))
        error_arr[index, 1] = mean_squared_error(y_test, model.predict(X_test))

    # Initializing the figure
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_axes([0, 0, 1, 1])
    ax.set_yscale('log')

    # Plotting "Training set size" vs. "Mean Squared Error" for both splits
    line1, = ax.plot(N_size_arr, error_arr[:, 0], c='red')
    line2, = ax.plot(N_size_arr, error_arr[:, 1], c='blue')

    # Adding labels and legends to our plot
    ax.set_xlabel("N (Training set size)")
    ax.set_ylabel("Mean Squared Error")

    ax.legend((line1, line2), ("Train Error", "Test Error"))

# Call the function to plot the learning curves
plotLearningCurves(X_train, y_train, 200)

In [None]:
# Predicting reference values for the test dataset
y_pred = lm.predict(X_test)

# Plotting predictions (y axis) against the true test targets (x axis)
fig = plt.figure(figsize=(10, 6))
ax = fig.add_axes([0, 0, 1, 1])

# ax.plot takes (x, y): y_test is drawn on the x axis and y_pred on the
# y axis, so the labels must follow that order.
ax.set_xlabel("Test Target Variable")
ax.set_ylabel("Predictions")
ax.plot(y_test, y_pred, 'bo', ms=1)

# Display the plot
plt.show()

Based on the above information, it appears that the model might be slightly overfitting. Overfitting occurs when a model learns the training data too well and performs poorly on unseen data.

The mean squared error (MSE) and mean absolute error (MAE) on the training dataset are slightly lower than on the test dataset. Additionally, the R-squared value on the training dataset is higher than on the test dataset.

High-complexity model: Overfitting can occur when a model is too complex relative to the available data. With a large number of features (386) relative to the number of instances (53,500), it is possible that the model has learned noise or irrelevant patterns in the training data.