## Chapter 5
Code snippets for Chapter 5.

Load data

In [None]:
import pandas as pd

# Read the longevity dataset; the path is relative to the notebook's working directory.
# Later cells read the 'AgeAtDeath' and 'Smokes' columns from this frame.
df = pd.read_csv('data/longevity.csv')
# Show (rows, columns) as the cell output.
df.shape

## The Baseline Model

Create a density plot of 1,000 people’s life spans, faceted by smoking status

In [None]:
%matplotlib inline
from plotnine import *

# Density of age at death, filled by smoking flag and split into one facet row per 'Smokes' value.
fig = (
    ggplot(df, aes(x='AgeAtDeath', fill='factor(Smokes)'))
    + geom_density()
    + facet_grid(('Smokes', '.'))
)

# Bare expression so the notebook renders the figure.
fig

Calculate MSE using the mean of AgeAtDeath (73)

In [None]:
import numpy as np

# Constant prediction used for every person (the narration gives 73 as the mean age at death).
guess = 73
# Squared deviation of each observation from the constant guess; the cell displays their mean (MSE).
squared_errors = np.square(df['AgeAtDeath'] - guess)
squared_errors.mean()

Calculate MSE for other guesses (ages 63 to 82)

In [None]:
# Score every constant guess from 63 through 82 by its mean squared error.
records = []
for age in range(63, 83):
    error = np.square(df['AgeAtDeath'] - age).mean()
    records.append({'Guess': age, 'Error': error})

# Build the frame once from the collected records: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row copies it on every iteration.
df_accurancy = pd.DataFrame(records, columns=['Guess', 'Error'])

# Error curve over the candidate guesses.
fig = (
    ggplot(data=df_accurancy, mapping=aes(x='Guess', y='Error'))
    + geom_point()
    + geom_line()
)

fig

## Regression Using Dummy Variables

Calculate RMSE with and without smoking information

In [None]:
# Overall mean age at death: the best constant guess when smoking status is ignored.
mean = df['AgeAtDeath'].mean()
# Group means for rows where Smokes is 1 and where it is 0.
smokes_mean = df.loc[df['Smokes'] == 1, 'AgeAtDeath'].mean()
no_smokes_mean = df.loc[df['Smokes'] == 0, 'AgeAtDeath'].mean()


def getMean(is_smoke):
    """Return the group mean age at death for a smoking flag.

    Args:
        is_smoke: Truthy for smokers, falsy for non-smokers.

    Returns:
        ``smokes_mean`` when ``is_smoke`` is truthy, otherwise ``no_smokes_mean``.
    """
    return smokes_mean if is_smoke else no_smokes_mean


# Baseline error uses the mean computed in this cell, not a hard-coded guess
# defined in another cell, so the cell does not depend on hidden state.
rmse_without_smoking = np.sqrt(np.square(df['AgeAtDeath'] - mean).mean())
# Per-row prediction: map the single 'Smokes' column instead of applying row by row.
df['mean'] = df['Smokes'].map(getMean)
rmse_with_smoking = np.sqrt(np.square(df['AgeAtDeath'] - df['mean']).mean())

print('Error without smoking information', rmse_without_smoking)
print('Error with smoking information', rmse_with_smoking)

## Linear Regression in a Nutshell

In [None]:
# Rebind `df` to the heights/weights dataset; the frame previously held under this name is replaced.
# Later cells read the 'Height' and 'Weight' columns.
df = pd.read_csv('data/01_heights_weights_genders.csv')
# Preview the first rows as the cell output.
df.head()

Weights versus heights

In [None]:
# Scatter of weight against height with a blue smoothing curve layered on top.
fig = (
    ggplot(df, aes(x='Height', y='Weight'))
    + geom_point()
    + geom_smooth(colour='blue')
)

# Bare expression so the notebook renders the figure.
fig

Print linear regression model details

In [None]:
from sklearn.linear_model import LinearRegression

# Fit weight as a linear function of height. The double brackets keep 'Height'
# as a one-column DataFrame, i.e. a 2-D feature matrix.
features = df[['Height']]
target = df['Weight']
model = LinearRegression().fit(features, target)

# Intercept followed by the array of fitted coefficients.
print(model.intercept_, model.coef_)


Calculate RMSE for linear regression model

In [None]:
# In-sample predictions from the fitted model.
predicted_values = model.predict(df[['Height']])
# Residuals against the observed weights; the cell displays their root mean squared error.
residuals = predicted_values - df['Weight']
np.sqrt(np.square(residuals).mean())


## To be continued