In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as smf
import statsmodels.formula.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [41]:
possum = pd.read_csv('Possum.csv', na_values='NA')
possum.footlgth.fillna(possum.footlgth.mean(skipna=True), inplace=True)
possum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   site         104 non-null    int64  
 1   Pop          104 non-null    object 
 2   gender       104 non-null    object 
 3   age          102 non-null    float64
 4   head_length  104 non-null    float64
 5   skull_width  104 non-null    float64
 6   totlngth     104 non-null    float64
 7   tail_length  104 non-null    float64
 8   footlgth     104 non-null    float64
 9   earconch     104 non-null    float64
 10  eye          104 non-null    float64
 11  chest        104 non-null    float64
 12  belly        104 non-null    float64
dtypes: float64(10), int64(1), object(2)
memory usage: 10.7+ KB


In [30]:
possumQuant = possum.drop(columns= ['site', 'Pop', 'gender', 'age'])

In [31]:
X1 = add_constant(possumQuant)
X1

Unnamed: 0,const,head_length,skull_width,totlngth,tail_length,footlgth,earconch,eye,chest,belly
0,1.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,1.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,1.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,1.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,1.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0
...,...,...,...,...,...,...,...,...,...,...
99,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0
100,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0
101,1.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0
102,1.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0


In [32]:
VIF = pd.DataFrame()
VIF['Features'] = X1.columns
VIF['VIF'] = [variance_inflation_factor(X1.values, i) for i in range(len(X1.columns))]

In [33]:
VIF

Unnamed: 0,Features,VIF
0,const,1304.101799
1,head_length,3.174731
2,skull_width,2.416277
3,totlngth,3.719525
4,tail_length,2.37455
5,footlgth,3.637942
6,earconch,3.401525
7,eye,1.234146
8,chest,2.517906
9,belly,1.82025


In [37]:
#No attr to be removed
possum.age.dropna(inplace=True)
possum.isna().sum()

site           0
Pop            0
gender         0
age            2
head_length    0
skull_width    0
totlngth       0
tail_length    0
footlgth       0
earconch       0
eye            0
chest          0
belly          0
dtype: int64

In [49]:
ohe = pd.get_dummies(data=possum, columns=['Pop', 'gender', 'site'])

In [51]:
ohe.columns

Index(['age', 'head_length', 'skull_width', 'totlngth', 'tail_length',
       'footlgth', 'earconch', 'eye', 'chest', 'belly', 'Pop_Vic', 'Pop_other',
       'gender_f', 'gender_m', 'site_1', 'site_2', 'site_3', 'site_4',
       'site_5', 'site_6', 'site_7'],
      dtype='object')

In [62]:
fit2 = sm.ols('age~Pop_Vic+Pop_other+gender_f+gender_m', data=ohe).fit()

In [63]:
fit2.summary()

0,1,2,3
Dep. Variable:,age,R-squared:,0.01
Model:,OLS,Adj. R-squared:,-0.01
Method:,Least Squares,F-statistic:,0.4854
Date:,"Tue, 28 Mar 2023",Prob (F-statistic):,0.617
Time:,13:30:26,Log-Likelihood:,-209.7
No. Observations:,102,AIC:,425.4
Df Residuals:,99,BIC:,433.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.9339,0.097,19.993,0.000,1.742,2.126
Pop_Vic,1.1135,0.207,5.372,0.000,0.702,1.525
Pop_other,0.8204,0.197,4.155,0.000,0.429,1.212
gender_f,1.0588,0.209,5.066,0.000,0.644,1.474
gender_m,0.8751,0.197,4.446,0.000,0.485,1.266

0,1,2,3
Omnibus:,5.135,Durbin-Watson:,1.646
Prob(Omnibus):,0.077,Jarque-Bera (JB):,4.993
Skew:,0.492,Prob(JB):,0.0824
Kurtosis:,2.546,Cond. No.,2.76e+16


In [80]:
fit3 = sm.ols('head_length~skull_width+totlngth+Pop_Vic+Pop_other+gender_f+gender_m+site_1+ site_2', data=ohe).fit()

In [81]:
fit3.summary()

0,1,2,3
Dep. Variable:,head_length,R-squared:,0.665
Model:,OLS,Adj. R-squared:,0.648
Method:,Least Squares,F-statistic:,38.95
Date:,"Tue, 28 Mar 2023",Prob (F-statistic):,7.58e-22
Time:,13:38:09,Log-Likelihood:,-222.6
No. Observations:,104,AIC:,457.2
Df Residuals:,98,BIC:,473.1
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,13.8096,2.506,5.510,0.000,8.836,18.783
skull_width,0.5162,0.082,6.313,0.000,0.354,0.678
totlngth,0.3919,0.070,5.604,0.000,0.253,0.531
Pop_Vic,5.5759,0.991,5.627,0.000,3.609,7.542
Pop_other,8.2338,1.542,5.338,0.000,5.173,11.295
gender_f,6.3562,1.285,4.945,0.000,3.805,8.907
gender_m,7.4534,1.260,5.917,0.000,4.954,9.953
site_1,2.8337,0.774,3.660,0.000,1.297,4.370
site_2,2.7422,0.499,5.492,0.000,1.751,3.733

0,1,2,3
Omnibus:,1.869,Durbin-Watson:,2.004
Prob(Omnibus):,0.393,Jarque-Bera (JB):,1.438
Skew:,-0.0,Prob(JB):,0.487
Kurtosis:,3.576,Cond. No.,3.01e+18
