In [None]:
import polars as pl
import pandas as pd

In [None]:
# Load the raw passenger table from a CSV file located next to the notebook.
# Later cells rely on the columns Name, Age, Parch, Fare and Pclass being present.
# NOTE(review): presumably the Kaggle Titanic training set -- confirm the data source.
df = pl.read_csv("./train.csv")

# Data Cleaning

In [None]:
# Passenger names are written as "Last, Title. Given names (Other name)".
# Split them with native polars string expressions rather than per-row Python
# lambdas: the work stays inside the polars engine, each new column gets a
# well-defined string dtype, and a name that lacks the ", " or ". " separator
# yields null instead of raising IndexError.
# splitn(..., 2) splits on the first separator only, so given names that
# themselves contain ", " or ". " are kept whole instead of being truncated.
last_and_rest = pl.col("Name").str.splitn(", ", 2)
title_and_given = last_and_rest.struct.field("field_1").str.splitn(". ", 2)
given_names = title_and_given.struct.field("field_1")

df2 = df.with_columns(
    last_and_rest.struct.field("field_0").alias('Last'),
    given_names.alias('First'),
    # 'Sfx' holds the title that precedes the given names (Mr, Mrs, Miss, ...);
    # the column name is kept because later cells select it by this name.
    title_and_given.struct.field("field_0").alias('Sfx'),
    # Text after the last "(" of the given names, up to the next ")";
    # null when the given names contain no parenthesis.
    given_names.str.extract(r".*\(([^)]*)", 1).alias('Second Name'),
)

In [None]:
# Derive two indicator features with native polars expressions, so no per-row
# Python lambda is needed and the result dtypes are explicit.
df3 = df2.with_columns(
    # 1 when a parenthesised second name was extracted, 0 otherwise.
    pl.when(pl.col('Second Name').is_null()).then(0).otherwise(1).alias('Has Second Name'),
    # 1 for passengers aged 18 or over, 0 for younger ones.
    # A missing Age stays null, so unknown ages are not silently labelled as children.
    (pl.col('Age') >= 18).cast(pl.Int64).alias('Adult'),
)

In [None]:
# Display the frame to inspect the engineered name and age-indicator columns.
df3

# Predict Age of Passengers

#### Useful Attributes to Predict Age
- Parch
- Sfx
- Has Second Name
- Fare
- Pclass

In [None]:
# Keep only the passengers whose Age is recorded; these rows are used to fit the age model.
passengers_with_ages = df3.filter(pl.col("Age").is_not_null())

In [None]:
# Predictor columns for the age model, followed by the feature frame and the target frame.
passengers_with_ages_X_cols = ['Parch', 'Sfx', 'Has Second Name', 'Fare', 'Pclass']
passengers_with_ages_X = passengers_with_ages.select(passengers_with_ages_X_cols)
passengers_with_ages_y = passengers_with_ages.select('Age')

In [None]:
# Convert the polars selections to pandas for scikit-learn.
# Both frames are rebuilt from `passengers_with_ages` rather than from the names
# being assigned, so the cell can be re-run safely: the previous version called
# .to_pandas() on objects that were already pandas frames after the first run.
passengers_with_ages_y = passengers_with_ages.select(['Age']).to_pandas()

# One-hot encode the string-valued predictors (the 'Sfx' title column);
# drop_first drops one level per encoded column to avoid redundant dummy columns.
passengers_with_ages_X = pd.get_dummies(
    data=passengers_with_ages.select(passengers_with_ages_X_cols).to_pandas(),
    drop_first=True,
)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import matplotlib.pyplot as plt

In [None]:
# Hold out a random 10% of the passengers for testing; the remaining 90% are used for training.
# The fixed random_state makes the split reproducible.
ages_X_train, ages_X_test, ages_y_train, ages_y_test = train_test_split(
    passengers_with_ages_X,
    passengers_with_ages_y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Display the held-out feature rows to check the encoded columns.
ages_X_test

In [None]:
# Linear regression estimator with scikit-learn's default settings; fitted in the next cell.
reg = linear_model.LinearRegression()

In [None]:
# Fit the age model on the training split only.
reg.fit(ages_X_train, ages_y_train)

In [None]:
# Display the fitted coefficients; their order follows the columns of ages_X_train.
reg.coef_

In [None]:
# Predict ages for the held-out passengers.
ages_y_predict = reg.predict(X=ages_X_test)

In [None]:
# Score the held-out predictions against the true ages.
ages_mse = mean_squared_error(y_true=ages_y_test, y_pred=ages_y_predict)
ages_r2 = r2_score(y_true=ages_y_test, y_pred=ages_y_predict)

# Mean squared error: lower is better.
print("Mean squared error: %.2f" % ages_mse)
# Coefficient of determination (R^2): 1 is a perfect prediction.
print("Coefficient of determination: %.2f" % ages_r2)

In [None]:
# Plot predicted against actual ages for the held-out passengers.
# Both inputs are flattened to 1-D so they can be compared point by point.
actual_ages = ages_y_test.to_numpy().ravel()
predicted_ages = ages_y_predict.ravel()

fig, ax = plt.subplots()
ax.scatter(actual_ages, predicted_ages, color="black")

# Reference diagonal: a point on this line is a perfect prediction.
age_bounds = [
    min(actual_ages.min(), predicted_ages.min()),
    max(actual_ages.max(), predicted_ages.max()),
]
ax.plot(age_bounds, age_bounds, color="blue", linewidth=1, linestyle="--")

# Keep the ticks and label the axes so the figure can be read on its own.
ax.set(
    title="Predicted vs. actual age (test split)",
    xlabel="Actual age",
    ylabel="Predicted age",
)

plt.show()