In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.multivariate.manova import MANOVA
from sklearn.preprocessing import StandardScaler

# Default size (width, height) applied to every matplotlib figure created in this notebook.
plt.rcParams.update({"figure.figsize": (10, 8)})

In [9]:
# Load the possum dataset from a pickle file; the path is relative to the
# notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle(filepath_or_buffer="../datos/possum_standar.pkl")

In [10]:
# Preview ten random rows. The seed is fixed so that the preview is the same
# after every "Restart Kernel -> Run All" instead of changing on each run.
df.sample(n=10, random_state=42)

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
97,101,7,other,m,-1.480724,-1.179776,-0.732164,82.5,0.994189,-0.910379,-0.033083,-0.997253,-1.026689,0.133146
62,66,5,other,f,-0.431228,-0.665674,-0.375863,86.0,-0.280089,-1.183628,-0.973631,0.616586,-0.280678,0.501578
96,100,7,other,m,-1.480724,-0.922725,-0.311082,81.5,-0.280089,-0.546047,-0.330098,-0.237799,-2.021371,-2.077444
33,34,2,Vic,m,-0.955976,-0.608552,-0.408254,85.5,-0.280089,1.070676,1.22923,-0.617526,-0.529348,-1.524796
16,17,1,Vic,f,-1.480724,0.562458,3.478661,89.5,-0.280089,1.093447,1.253981,-0.332731,0.962675,-0.603717
43,47,3,other,m,-0.955976,-0.751358,-0.699773,89.0,0.229622,-0.546047,-0.651864,-0.047936,-1.026689,0.133146
70,74,6,other,f,0.093519,-1.151215,-1.60672,83.0,0.484478,-1.570731,-0.552859,-0.332731,-0.529348,0.501578
94,98,7,other,m,0.618267,0.13404,3.77018,84.0,-1.044655,-0.63713,-0.948879,-0.522594,0.714004,-0.235286
17,18,1,Vic,m,-0.955976,0.219723,-0.408254,90.0,-0.534944,1.207301,1.798509,0.236859,0.465334,-0.235286
12,13,1,Vic,m,0.618267,0.676703,0.952166,89.5,-0.534944,0.59249,0.412439,0.711518,-0.032007,-0.235286


In [13]:
# Linear model of totlngth on every numeric predictor (hdlngth is entered last).
# typ=1 is anova_lm's default: sequential (Type I) sums of squares, i.e. each
# row is assessed after the terms listed before it, so the table depends on
# the order of the terms in the formula.
lm = ols(
    formula="totlngth ~  age  + skullw + taill + footlgth + earconch + eye + chest + belly + hdlngth",
    data=df,
).fit()
sm.stats.anova_lm(lm, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
age,1.0,126.78533,126.78533,24.370614,3.580173e-06
skullw,1.0,354.197246,354.197246,68.083621,1.153203e-12
taill,1.0,354.295634,354.295634,68.102533,1.146895e-12
footlgth,1.0,346.807161,346.807161,66.663102,1.744427e-12
earconch,1.0,1.480052,1.480052,0.284495,0.5950705
eye,1.0,5.745371,5.745371,1.104372,0.2960913
chest,1.0,29.307372,29.307372,5.633449,0.01972404
belly,1.0,1.609418,1.609418,0.309361,0.5794374
hdlngth,1.0,67.670181,67.670181,13.007529,0.0005063671
Residual,91.0,473.417086,5.202386,,


In [17]:
# Reduced model: only the predictors kept from the full numeric model.
# The ANOVA table is sequential (Type I, the anova_lm default), so row order
# follows — and depends on — the order of the terms in the formula.
lm = ols(
    formula="totlngth ~  skullw + taill + footlgth + chest + hdlngth",
    data=df,
).fit()
sm.stats.anova_lm(lm, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
skullw,1.0,451.641922,451.641922,89.351067,2.44496e-15
taill,1.0,364.550713,364.550713,72.121283,2.732552e-13
footlgth,1.0,355.066151,355.066151,70.244895,4.705647e-13
chest,1.0,31.738161,31.738161,6.278953,0.01391875
hdlngth,1.0,78.122377,78.122377,15.455425,0.000160434
Residual,95.0,480.195527,5.05469,,


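### En el momento de elegir las variables predictoras, hicimos una tabla de correlación y vimos que era muy alta entre skullw y hdlngth, por lo que decidimos quedarnos sólo con una de ellas (hdlngth).
### Tras el ANOVA, parece que la predictora skullw tiene más efecto en nuestra variable respuesta. Ojo: `anova_lm` usa por defecto sumas de cuadrados secuenciales (tipo I), por lo que este resultado depende del orden en que las variables entran en la fórmula (aquí hdlngth entra la última).
### A tener en cuenta a la hora de realizar nuestro modelo.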

In [15]:
# Single-predictor model: total length explained by age alone.
lm = ols(
    formula="totlngth ~  age ",
    data=df,
).fit()
sm.stats.anova_lm(lm, typ=1)  # typ=1 is the default (sequential sums of squares)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
age,1.0,126.78533,126.78533,7.67912,0.006673
Residual,99.0,1634.529521,16.510399,,


In [None]:
# Model with the grouping variables (site, Pop, sex) plus age.
# NOTE(review): in the sampled rows Pop looks fully determined by site
# (sites 1-2 are "Vic", the others "other") — confirm; if so, the Pop row of
# this sequential table carries no information beyond site.
lm = ols(
    formula="totlngth ~  site + Pop + sex + age",
    data=df,
).fit()
sm.stats.anova_lm(lm, typ=1)  # typ=1 is the default (sequential sums of squares)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
site,6.0,812.524176,135.420696,14.140683,2.219322e-11
Pop,1.0,7.106079,7.106079,0.74202,0.3912302
sex,1.0,6.436047,6.436047,0.672055,0.4144323
age,1.0,63.845982,63.845982,6.666823,0.01138393
Residual,93.0,890.630606,9.576673,,


### Podemos descartar Pop y sex como variables que influyan en nuestra variable respuesta. En cambio, la edad (age) y el sitio de muestreo (site) sí que son variables a tener en cuenta.
### Tiene sentido lógico que la edad influya, teniendo en cuenta el crecimiento natural de los animales... pero nos sorprende que el sitio donde crecen sea influyente. Una variable a estudiar y tener en cuenta.

### En el primer ANOVA (con todas las predictoras numéricas), todas nuestras variables afectan a la variable respuesta, excepto tres: earconch, eye y belly, cuyo PR (p-valor) es superior a 0,05.
### Según F, las variables con mejor capacidad para explicar nuestra variable respuesta son taill, skullw y footlgth, en ese orden.
#### F = test que se utiliza para evaluar la capacidad explicativa que tiene la variable predictora sobre la variación de la variable respuesta. Es decir, pretende determinar si, de entre todos los valores de la variable predictora, al menos uno tiene capacidad de explicar una parte significativa de la variación de la variable respuesta.


In [None]:
# Model keeping only the two terms retained from the previous fit: site and age.
lm = ols(
    formula="totlngth ~  site + age",
    data=df,
).fit()
sm.stats.anova_lm(lm, typ=1)  # typ=1 is the default (sequential sums of squares)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
site,6.0,812.524176,135.420696,14.172989,1.960292e-11
age,1.0,68.683651,68.683651,7.188359,0.008666759
Residual,94.0,898.155311,9.554844,,


In [None]:
# Single-term model: total length explained by sampling site alone.
lm = ols(
    formula="totlngth ~  site",
    data=df,
).fit()
sm.stats.anova_lm(lm, typ=1)  # typ=1 is the default (sequential sums of squares)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
site,6.0,907.588591,151.264765,14.581728,8.452489e-12
Residual,97.0,1006.237563,10.373583,,


In [None]:
# Add an object-dtype copy of `age` — presumably so age can also be modelled
# as a categorical predictor (TODO confirm; no visible cell uses `age_cat`).
# NOTE(review): this mutates `df` in place, and in the sample shown earlier
# `age` holds standardized float values.
df["age_cat"] = df["age"].astype(object)

In [19]:
# Single-predictor model: total length explained by head length alone.
# The fitted result stays bound to `lm`, which the following cell summarises.
lm = ols(
    formula="totlngth ~  hdlngth",
    data=df,
).fit()
sm.stats.anova_lm(lm, typ=1)  # typ=1 is the default (sequential sums of squares)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
hdlngth,1.0,791.602137,791.602137,80.816318,1.743704e-14
Residual,99.0,969.712715,9.795078,,


In [20]:
# Full regression report for whichever model is currently bound to `lm`
# (the hdlngth-only fit when the cells are run top to bottom).
# alpha=0.05 is the default and yields the [0.025, 0.975] interval columns.
lm.summary(alpha=0.05)

0,1,2,3
Dep. Variable:,totlngth,R-squared:,0.449
Model:,OLS,Adj. R-squared:,0.444
Method:,Least Squares,F-statistic:,80.82
Date:,"Thu, 29 Sep 2022",Prob (F-statistic):,1.74e-14
Time:,12:17:14,Log-Likelihood:,-257.54
No. Observations:,101,AIC:,519.1
Df Residuals:,99,BIC:,524.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,87.2693,0.311,280.232,0.000,86.651,87.887
hdlngth,2.7996,0.311,8.990,0.000,2.182,3.417

0,1,2,3
Omnibus:,0.344,Durbin-Watson:,1.65
Prob(Omnibus):,0.842,Jarque-Bera (JB):,0.513
Skew:,-0.079,Prob(JB):,0.774
Kurtosis:,2.689,Cond. No.,1.0


### CONCLUSIONES: en nuestro modelo deberíamos incluir las variables más significativas, que son: site (como variable categórica) y age (que procesamos tanto como numérica como categórica).
### TODO: interpretar conclusiones. ¿Qué hacemos con age?