In [None]:
#Preliminary Steps

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [None]:
# Read "epa-sea-level.csv" and keep only the two columns used in this analysis.
columns_of_interest = ["Year", "CSIRO Adjusted Sea Level"]
df = pd.read_csv("epa-sea-level.csv")[columns_of_interest]

Check data

In [None]:
# Preview the first rows of the two selected columns as a quick sanity check.
df.head()

Preprocessing

In [None]:
# dropna() returns a new DataFrame instead of modifying df in place, so the
# result must be assigned back; otherwise rows with missing values stay in df
# and flow into every later cell.
df = df.dropna()

# Show the remaining missing-value count per column to confirm the cleanup.
df.isna().sum()

Check data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for both columns.
df.describe()

In [None]:
# Histogram of the sea-level values to eyeball the shape of the distribution.
sea_level = df["CSIRO Adjusted Sea Level"]
ax = sea_level.plot.hist(
    bins=15,
    figsize=(10, 5),
    alpha=0.5,
    color='#1A4D3B',
);

The data appear to be right-skewed, with a lot of variation. There don't seem to be any apparent outliers (this is confirmed by the box plot below).

Checking for outliers

In [None]:
# Flag outliers with the 1.5 * IQR rule: any value below Q1 - 1.5*IQR or
# above Q3 + 1.5*IQR is treated as an outlier.
sea_level = df["CSIRO Adjusted Sea Level"]

Q1 = sea_level.quantile(0.25)
Q3 = sea_level.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Boolean mask, True where the value lies outside the fences.
outliers = sea_level.lt(lower_fence) | sea_level.gt(upper_fence)
df_outliers = sea_level[outliers]
print(df_outliers)

In [None]:
# Box plot of the sea-level values as a visual cross-check of the IQR outlier test.
sns.catplot(
    data=df,
    y="CSIRO Adjusted Sea Level",
    kind="box",
);


There don't appear to be any outliers: our outlier code returns an empty Series, and the box plot shows no points beyond the whiskers (1.5 × IQR from the quartiles).

Data filtering for years greater than or equal to 2000 (I want to see how this plays out with a smaller dataset).

In [None]:
# Keep only the observations from the year 2000 onwards.
recent_years = df["Year"].ge(2000)
df = df.loc[recent_years]

Train/test split

In [None]:
# Shuffle the rows with a fixed seed, then hold out 30% of them as the test set.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
train_df, test_df = train_test_split(df_shuffled, test_size=0.3, random_state=42)

# Year is the single feature (kept 2-D for scikit-learn); sea level is the target.
feature_cols = ["Year"]
target_col = "CSIRO Adjusted Sea Level"

X_train, y_train = train_df[feature_cols].values, train_df[target_col].values
X_test, y_test = test_df[feature_cols].values, test_df[target_col].values

Using LR and SVM

In [None]:
# Ordinary least-squares regression of sea level on year (fit() returns the estimator).
lin_model = LinearRegression().fit(X_train, y_train)

# Support-vector regression with a linear kernel, trained on the same data.
svm_model = SVR(kernel='linear')
svm_model.fit(X_train, y_train)

In [None]:
# Predictions of both fitted models on the training years.
lin_pred_plot = lin_model.predict(X_train)
svm_pred_plot = svm_model.predict(X_train)

# Observed points (all rows of df) with both fitted lines drawn over the training years.
plt.figure(figsize=(10, 6))
plt.scatter(df["Year"], df["CSIRO Adjusted Sea Level"], color='lightblue', label="Observed Data")
plt.plot(X_train, lin_pred_plot, color='red', label="Linear Regression")
plt.plot(X_train, svm_pred_plot, color='green', linestyle='--', label="SVM Regression")
plt.xlabel("Year")
# The EPA sea-level series is reported in inches (the notebook's interpretation
# also quotes the slope in inches), so the axis must not be labelled "mm".
plt.ylabel("CSIRO Adjusted Sea Level (inches)")
plt.title("Sea Level Prediction (2000–Present): Linear vs SVM")
plt.legend()
plt.grid(True)
plt.tight_layout()
# Render the figure explicitly, as the test-set plot cell does.
plt.show()

#The plot above is for the training set; the plot for the test set (the actual result) is below.

In [None]:
# Predictions of both fitted models on the held-out test years.
lin_pred_plot = lin_model.predict(X_test)
svm_pred_plot = svm_model.predict(X_test)

# Observed points (all rows of df) with both fitted lines drawn over the test years.
plt.figure(figsize=(10, 6))
plt.scatter(df["Year"], df["CSIRO Adjusted Sea Level"], color='lightblue', label="Observed Data")
plt.plot(X_test, lin_pred_plot, color='red', label="Linear Regression")
plt.plot(X_test, svm_pred_plot, color='green', linestyle='--', label="SVM Regression")
plt.xlabel("Year")
# The EPA sea-level series is reported in inches (the notebook's interpretation
# also quotes the slope in inches), so the axis must not be labelled "mm".
plt.ylabel("CSIRO Adjusted Sea Level (inches)")
plt.title("Sea Level Prediction (2000–Present): Linear vs SVM")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Print the fitted intercept and slope of each model under its own heading.
coefficient_reports = (
    ("Linear Regression Coefficients:", lin_model, "Coefficient"),
    ("SVM Coefficients:", svm_model, "Slope"),
)
for heading, fitted, slope_label in coefficient_reports:
    print(heading)
    print("Intercept", fitted.intercept_)
    print(slope_label, fitted.coef_)

In [None]:
# Predicted change in sea level between the latest year in the test set and
# 2040, printed for the linear model first and then the SVM model.
year_2040 = np.array([[2040]])
year_max = np.array([[X_test.max()]])

for fitted in (lin_model, svm_model):
    print(fitted.predict(year_2040) - fitted.predict(year_max))

Predictions for the years 2030 and 2040

In [None]:
# Predicted sea level in 2040: linear model first, then the SVM model.
for fitted in (lin_model, svm_model):
    print(fitted.predict(year_2040))

In [None]:
# Side-by-side predictions of both models for 2030 and 2040.
year_2030 = np.array([[2030]])

prediction_reports = (
    ("2030 lin model", lin_model, year_2030),
    ("2030 svm model", svm_model, year_2030),
    ("2040 lin model", lin_model, year_2040),
    ("2040 svm model", svm_model, year_2040),
)
for caption, fitted, year in prediction_reports:
    print(caption, fitted.predict(year))

Possible explanation: The two models can differ because the linear model minimizes the sum of squared errors over all data points, while the SVM model only penalizes errors that fall outside its margin lines. In other words, our linear model includes the errors from all points, while our SVM model ignores errors inside the margin lines, and as a result the two sets of results can be slightly different.

Metrics

In [None]:
# Predict on the held-out test set. These predictions are not defined in any
# earlier visible cell, so without them this cell raises NameError on a fresh
# kernel (Restart & Run All).
y_pred_lin = lin_model.predict(X_test)
y_pred_svm = svm_model.predict(X_test)

# Mean squared error of each model against the true test targets.
mse_lin = mean_squared_error(y_test, y_pred_lin)
mse_svm = mean_squared_error(y_test, y_pred_svm)
print("Linear MSE", mse_lin)
print("SVM MSE", mse_svm)

We see that SVM performed slightly worse compared to our linear model. 

Interpretation: The regression model takes our data points and creates a model that minimizes the amount of error, which allows us to predict the next data point as accurately as possible. Depending on the coefficients we get out, we start to understand whether our model is linear or not, how strong our relationship is, what our baseline values are, and, most importantly, how much our output (in this case sea level) changes for a 1-unit increase (in this case a 1-year increase) in our explanatory variable. Here, we see that a 1-year increase is associated with a rise in sea level of roughly 0.16 inches for both models.