# Project 4 – Predicting a Continuous Target with Regression (Titanic)  
**Author:** Beth Spornitz  
**Date:** November 6, 2025

## Introduction


In [None]:

# Data handling and plotting
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Splitting, estimators, feature expansion, and metrics.
# The linear estimators live in sklearn.linear_model; the original line
# ("sklearn.linear regression") was a syntax error.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Show three decimal places when pandas displays numbers
pd.options.display.precision = 3


## Section 1: Import and Inspect the Data

Load the Titanic dataset and confirm it’s structured correctly.

In [None]:
# Pull the "titanic" example dataset through seaborn, then preview the
# first five rows to confirm the columns loaded as expected.
titanic = sns.load_dataset(name="titanic")
titanic.head(n=5)


## Section 2: Data Exploration and Preparation

- Impute missing values for age using median  
- Drop rows with missing fare  
- Create `family_size = sibsp + parch + 1`  
- Optional: convert categorical columns (sex, embarked) if you want them later


In [None]:
# Fill missing ages, drop missing fares, create family_size

# Assign the filled column back instead of calling fillna(inplace=True) on
# titanic['age']: the in-place call on a selected column is chained
# assignment, which pandas 2.x warns about and which leaves the DataFrame
# unchanged once Copy-on-Write is enabled.
titanic['age'] = titanic['age'].fillna(titanic['age'].median())

# Keep only rows with a known fare (the regression target). The explicit
# copy makes sure the column added below lands on an independent DataFrame.
titanic = titanic.dropna(subset=['fare']).copy()

# family_size = sibsp + parch + 1 (the +1 presumably counts the passenger).
titanic['family_size'] = titanic['sibsp'] + titanic['parch'] + 1

# Optional encodings for later use in Case 4:
# titanic['sex_male'] = (titanic['sex'] == 'male').astype(int)
# titanic = pd.get_dummies(titanic, columns=['embarked'], prefix='embarked', drop_first=True)

titanic.head()


## Section 3: Feature Selection and Justification

### Case 1: age only


In [None]:
# Case 1: fare is the target; age is the only input, kept as a
# one-column DataFrame.
y1 = titanic.loc[:, 'fare']
X1 = titanic.loc[:, ['age']]


### Case 2: family_size only


In [None]:
# Case 2: fare is the target; family_size is the only input, kept as a
# one-column DataFrame.
y2 = titanic.loc[:, 'fare']
X2 = titanic.loc[:, ['family_size']]


### Case 3: age + family_size


In [None]:
# Case 3: fare is the target; age and family_size together are the inputs.
y3 = titanic.loc[:, 'fare']
X3 = titanic.loc[:, ['age', 'family_size']]


### Case 4 — My choice

Replace the `???` below with the chosen feature(s).  
**If using categorical variables, be sure to encode them in Section 2 first.**


In [None]:
# Case 4 (custom) — EDIT THIS
# Example:
# X4 = titanic[['age', 'family_size', 'sex_male']]
# y4 = titanic['fare']

# Placeholder:
# X4 = titanic[[???]]
# y4 = titanic['fare']


**Reflection Questions:**
- Why might these features affect fare?
- List all available features:
- Which features could improve predictions and why?
- How many variables are in your Case 4?
- Why did you choose those variables?


## Section 4: Train a Regression Model (Linear Regression)


### 4.1 Split the Data


In [None]:
# Hold out 20% of the rows as a test set for each case. The fixed seed
# keeps every split reproducible between runs.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, random_state=123, test_size=0.2
)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, random_state=123, test_size=0.2
)
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, random_state=123, test_size=0.2
)

# Case 4 — uncomment after defining X4, y4 in Section 3
# X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.2, random_state=123)



### 4.2 Train and Evaluate Linear Regression Models (all 4 cases)

We'll use a more concise approach - create each model and immediately call the fit() method.


In [None]:
# Fit one linear regression per feature set (create and fit in one step).
lr_model1 = LinearRegression().fit(X1_train, y1_train)
lr_model2 = LinearRegression().fit(X2_train, y2_train)
lr_model3 = LinearRegression().fit(X3_train, y3_train)

# Case 4 — uncomment after defining X4
# lr_model4 = LinearRegression().fit(X4_train, y4_train)

# Predictions on both splits, so train and test scores can be compared
# when checking for over- or underfitting.
y_pred_train1 = lr_model1.predict(X1_train)
y_pred_test1 = lr_model1.predict(X1_test)

y_pred_train2 = lr_model2.predict(X2_train)
y_pred_test2 = lr_model2.predict(X2_test)

# Case 3 predictions were previously left commented out even though
# lr_model3 is fitted above, so Case 3 could not be evaluated.
y_pred_train3 = lr_model3.predict(X3_train)
y_pred_test3 = lr_model3.predict(X3_test)

# Case 4 — uncomment after fitting lr_model4
# y_pred_train4 = lr_model4.predict(X4_train)
# y_pred_test4 = lr_model4.predict(X4_test)

# Instructor wants yN_pred_train / yN_pred_test naming; alias every case
# that has predictions so later cells can use either form.
y1_pred_train = y_pred_train1
y1_pred_test = y_pred_test1
y2_pred_train = y_pred_train2
y2_pred_test = y_pred_test2
y3_pred_train = y_pred_train3
y3_pred_test = y_pred_test3


### 4.3 Report Performance


In [None]:
# Report train/test R², test RMSE, and test MAE for each fitted case.
# Predictions are recomputed from the fitted models so this cell only needs
# the models and splits from the earlier cells.
# Add ("Case 4", lr_model4, X4_train, X4_test, y4_train, y4_test) once
# Case 4 is defined.
linear_cases = [
    ("Case 1", lr_model1, X1_train, X1_test, y1_train, y1_test),
    ("Case 2", lr_model2, X2_train, X2_test, y2_train, y2_test),
    ("Case 3", lr_model3, X3_train, X3_test, y3_train, y3_test),
]

for case_name, case_model, X_tr, X_te, y_tr, y_te in linear_cases:
    train_pred = case_model.predict(X_tr)
    test_pred = case_model.predict(X_te)

    # RMSE is sqrt(MSE): mean_squared_error's `squared=False` shortcut was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    test_rmse = np.sqrt(mean_squared_error(y_te, test_pred))

    print(f"{case_name}: Training R²:", r2_score(y_tr, train_pred))
    print(f"{case_name}: Test R²:", r2_score(y_te, test_pred))
    print(f"{case_name}: Test RMSE:", test_rmse)
    print(f"{case_name}: Test MAE:", mean_absolute_error(y_te, test_pred))
    print()  # blank line between cases


#### Section 4 Reflection Questions

Compare the train vs test results for each.

Did Case 1 overfit or underfit? Explain:  
Did Case 2 overfit or underfit? Explain:  
Did Case 3 overfit or underfit? Explain:  
Did Case 4 overfit or underfit? Explain:  

Adding Age  
Did adding age improve the model:  
Propose a possible explanation (consider how age might affect ticket price, and whether the data supports that):  

Worst  
Which case performed the worst:  
How do you know:  
Do you think adding more training data would improve it (and why/why not):  

Best  
Which case performed the best:  
How do you know:  
Do you think adding more training data would improve it (and why/why not):  


## Section 5. Compare Alternative Models

Choose the best case model from the four cases. Use that model to continue to explore additional continuous prediction models.


### 5.1 Ridge Regression (L2 penalty)


In [None]:
# Ridge regression on the Case 1 split, created and fitted in one step
# with penalty strength alpha=1.0, then used to predict the test set.
ridge_model = Ridge(alpha=1.0).fit(X1_train, y1_train)
y_pred_ridge = ridge_model.predict(X1_test)


### 5.2 Elastic Net (L1 + L2 combined)


In [None]:
# Elastic Net on the Case 1 split, created and fitted in one step.
# l1_ratio=0.5 weights the L1 and L2 penalties equally.
elastic_model = ElasticNet(alpha=0.3, l1_ratio=0.5).fit(X1_train, y1_train)
y_pred_elastic = elastic_model.predict(X1_test)


### 5.3 Polynomial Regression


In [None]:
# Expand the Case 1 feature into polynomial terms up to degree 3. The
# expansion is fitted on the training split and reused for the test split.
poly = PolynomialFeatures(degree=3)
X_train_poly = poly.fit_transform(X1_train)
X_test_poly = poly.transform(X1_test)

# A linear regression on the expanded features gives the cubic fit.
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y1_train)

# Predict from the transformed test matrix. The original line referenced
# X1_test_poly, which is never defined and raised a NameError.
y_pred_poly = poly_model.predict(X_test_poly)


### 5.4 Visualize Polynomial Cubic Fit (for 1 input feature)


In [None]:
# Overlay the cubic model's test predictions on the actual test fares,
# both plotted against the Case 1 feature.
fig, ax = plt.subplots()
ax.scatter(X1_test, y1_test, label='Actual')
ax.scatter(X1_test, y_pred_poly, label='Predicted (Poly)')
ax.set_title("Polynomial Regression: Age vs Fare")
ax.legend()
plt.show()


### 5.4 Reflections

What patterns does the cubic model seem to capture:  
Where does it perform well or poorly:  
Did the polynomial fit outperform linear regression:  
Where (on the graph or among which kinds of data points) does it fit best:  


### 5.4 Compare All Models


In [None]:
def report(name, y_true, y_pred):
    """Print R², RMSE, and MAE for one set of predictions.

    Args:
        name: Label printed in front of each metric.
        y_true: Observed target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        None. The three metrics are printed, followed by a blank line.
    """
    # RMSE is sqrt(MSE): the ``squared=False`` shortcut of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed
    # in 1.6, so passing it raises a TypeError on current releases.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    print(f"{name} R²: {r2_score(y_true, y_pred):.3f}")
    print(f"{name} RMSE: {rmse:.2f}")
    print(f"{name} MAE: {mean_absolute_error(y_true, y_pred):.2f}\n")

# Print the Case 1 test-set metrics for each model, in the same order as
# the models were fitted above.
for model_label, test_predictions in (
    ("Linear", y1_pred_test),
    ("Ridge", y_pred_ridge),
    ("ElasticNet", y_pred_elastic),
    ("Polynomial", y_pred_poly),
):
    report(model_label, y1_test, test_predictions)


### 5.5 Visualize Higher Order Polynomial (for the same 1 input case)


In [None]:
# Same steps as the cubic fit, with a higher polynomial degree.
degree = 5  # try 4–8

# Fit the feature expansion on the training split, then apply it to both.
poly_hi = PolynomialFeatures(degree=degree).fit(X1_train)
X_train_poly_hi = poly_hi.transform(X1_train)
X_test_poly_hi = poly_hi.transform(X1_test)

# Linear regression on the expanded features, created and fitted in one step.
poly_model_hi = LinearRegression().fit(X_train_poly_hi, y1_train)
y_pred_poly_hi = poly_model_hi.predict(X_test_poly_hi)

# Actual test fares first, then the model's predictions, on the same axes.
for fare_values in (y1_test, y_pred_poly_hi):
    plt.scatter(X1_test, fare_values)
plt.title(f"Polynomial Regression (degree={degree})")
plt.show()


## Section 6. Final Thoughts & Insights

### 6.1 Summarize Findings
What features were most useful?  
What regression model performed best?  
How did model complexity or regularization affect results?  

### 6.2 Discuss Challenges
Was fare hard to predict? Why?  
Did skew or outliers impact the models?  

### 6.3 Optional Next Steps
Try different features besides the ones used  
Try predicting age instead of fare  
Explore log transformation of fare to reduce skew  
