In [1]:
# Python ≥3.11 is required
import sys
assert sys.version_info >= (3,11)

In [5]:
# Scikit-Learn ≥1.6.1 is required
import sklearn
assert sklearn.__version__ >= "1.6.1"

In [16]:
import urllib.request
import os

datapath = os.path.join("datasets", "lifesat", "")
datapath
os.makedirs(datapath, exist_ok=True)

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"

for filename in ("oecd_bli_2015.csv", "gdp_per_capita.csv"):
    print("Downloading", filename)
    url = DOWNLOAD_ROOT + "datasets/lifesat/" + filename
    urllib.request.urlretrieve(url, datapath + filename)

Downloading oecd_bli_2015.csv
Downloading gdp_per_capita.csv


🔹 thousands=','
This tells Pandas how to interpret numbers containing thousands separators, such as:
"34,567" → 34567
Without specifying this, Pandas might treat "34,567" as a string rather than a numeric value. Setting thousands=',' ensures such values are parsed as integers or floats.

🔹 delimiter='\t'
Specifies the character used to separate columns in the file.\t is a tab character, indicating the file is tab-separated, not comma-separated, even though the extension is .csv.

🔹 encoding='latin1'
Specifies the text encoding of the file.'latin1', also called ISO-8859-1, is a common character set used in Western Europe. Useful when the file contains special characters like accented letters: é, ñ, ü, etc. Using encoding='latin1' prevents Unicode decoding errors (which you might get with default utf-8) when reading such files.

🔹 na_values="n/a"
Tells Pandas what strings to interpret as missing values (NaN). If the file has cells containing "n/a", they will be automatically converted into NaN (Not a Number) in the resulting DataFrame.

In [None]:
# Code example
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# import linear regression
import sklearn.linear_model

# Load the data
oecd_bli = pd.read_csv(datapath + "oecd_bli_2015.csv", thousands=',')
gdp_per_capita = pd.read_csv(datapath + "gdp_per_capita.csv",thousands=',',delimiter='\t',
                             encoding='latin1', na_values="n/a")

# Prepare the data
country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)
X = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]

# Visualize the data
country_stats.plot(kind='scatter', x="GDP per capita", y='Life satisfaction')
plt.show()

# Select a linear model
model = sklearn.linear_model.LinearRegression()

# Train the model
model.fit(X, y)

# Make a prediction for Cyprus
X_new = [[22587]]  # Cyprus' GDP per capita
print(model.predict(X_new)) # outputs [[ 5.96242338]]

NameError: name 'prepare_country_stats' is not defined