## Python statistics essential training - 05_02_fitmodel

Standard imports

In [None]:
import math

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib
import matplotlib.pyplot as pp

In [None]:
%matplotlib inline

In [None]:
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

Loading gapminder data for year 1985 (Live Aid!) and setting up plot as in chapter 3

In [None]:
# Load the gapminder table; the relative path is resolved against the current working directory.
gapminder = pd.read_csv('gapminder.csv')

In [None]:
# Keep only the rows whose `year` column equals 1985.
gdata = gapminder.query('year == 1985')

In [None]:
# Marker sizes for the scatter plot: the population column scaled by 1e-6.
size = 1e-6 * gdata['population']

# One marker color per row, looked up from that row's region.
colors = gdata['region'].map({
    'Africa': 'skyblue',
    'Europe': 'gold',
    'America': 'palegreen',
    'Asia': 'coral',
})

def plotdata():
    """Scatter-plot babies_per_woman against age5_surviving for the rows of gdata.

    Marker colors and sizes are taken from the notebook-level `colors` and
    `size` series; markers get a thin black outline and 50% opacity.
    """
    marker_style = dict(c=colors, s=size, linewidths=0.5, edgecolor='k', alpha=0.5)
    gdata.plot.scatter('age5_surviving', 'babies_per_woman', **marker_style)

In [None]:
# Draw the base scatter plot of the 1985 data.
plotdata()

In [None]:
# Intercept-only model: the right-hand side of the formula is just the constant term `1`.
model = smf.ols(formula='babies_per_woman ~ 1',data=gdata)

In [None]:
# Fit the model; the results object provides the .params and .predict used further down.
grandmean = model.fit()

In [None]:
# Echo the fitted results object (the notebook shows its repr).
grandmean

In [None]:
def plotfit(fit):
    """Overlay the predictions of a fitted model on the data scatter plot.

    Draws the base plot with plotdata(), then adds one diamond marker per row
    of gdata at (age5_surviving, predicted value), colored with `colors`.

    Parameters
    ----------
    fit : object with a ``predict`` method that accepts gdata
        (in this notebook, the result of ``smf.ols(...).fit()``).
    """
    plotdata()
    predicted = fit.predict(gdata)
    pp.scatter(
        gdata.age5_surviving,
        predicted,
        c=colors,
        s=30,
        linewidths=0.5,
        edgecolor='k',
        marker='D',
    )

In [None]:
# Overlay the intercept-only model's predictions on the data.
plotfit(grandmean)

In [None]:
# Fitted coefficients of the intercept-only model.
grandmean.params

In [None]:
# Plain sample mean of babies_per_woman, for comparison with grandmean.params.
gdata.babies_per_woman.mean()

In [None]:
# Explicit intercept (`1`) plus the `region` term (string-valued, so the formula treats it as categorical).
groupmeans = smf.ols(formula='babies_per_woman ~ 1 + region',data=gdata).fit()

In [None]:
# Overlay the intercept + region model's predictions on the data.
plotfit(groupmeans)

In [None]:
# Coefficients of the intercept + region fit. With the formula's default treatment coding,
# the intercept belongs to the reference region and the others are offsets from it.
groupmeans.params

In [None]:
# Refit without the intercept (`-1`) so every region gets its own level coefficient.
# This rebinds `groupmeans`, replacing the earlier fit.
groupmeans = smf.ols(formula='babies_per_woman ~ -1 + region',data=gdata).fit()

In [None]:
# Coefficients of the no-intercept region fit (one per region).
groupmeans.params

In [None]:
# Per-region sample means of babies_per_woman, for comparison with groupmeans.params.
gdata.groupby('region').babies_per_woman.mean()

In [None]:
# No intercept: one level per region plus a single age5_surviving slope shared by all regions.
surviving = smf.ols(formula='babies_per_woman ~ -1 + region + age5_surviving',data=gdata).fit()

In [None]:
# Overlay the region + age5_surviving model's predictions on the data.
plotfit(surviving)

In [None]:
# Coefficients: the region levels followed by the shared age5_surviving slope.
surviving.params

In [None]:
# No intercept: each region gets its own level and, through the
# `age5_surviving:region` interaction, its own age5_surviving slope.
surviving_byregion = smf.ols(
    formula='babies_per_woman ~ -1 + region + age5_surviving:region',
    data=gdata,
).fit()

In [None]:
# Overlay the per-region-slope model's predictions on the data.
plotfit(surviving_byregion)

In [None]:
# Coefficients: the region levels followed by one age5_surviving slope per region.
surviving_byregion.params

In [None]:
# Same per-region levels and slopes as the previous model, with `population`
# added as one more term.
surviving_byregion_population = smf.ols(
    formula='babies_per_woman ~ -1 + region + age5_surviving:region + population',
    data=gdata,
).fit()

In [None]:
# Overlay the predictions of the model that also includes population.
plotfit(surviving_byregion_population)