### **TASK 4: STATISTICAL ANALYSIS USING KAGGLE DATASET**

#### **Data Ingestion**

In [None]:
# Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

In [188]:
# importing the dataset directly from kagglehub
import kagglehub

path = kagglehub.dataset_download("abrambeyer/openintro-possum")
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/abrambeyer/openintro-possum?dataset_version_number=1...


100%|██████████| 2.15k/2.15k [00:00<00:00, 952kB/s]

Extracting files...





Path to dataset files: C:\Users\ncc\.cache\kagglehub\datasets\abrambeyer\openintro-possum\versions\1


In [None]:
# load the data and show the first five
data = pd.read_csv('possum.csv')
data.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


#### **Preliminary Data Analysis**

In [168]:
# Renaming some columns for easy accessibility
data.rename(columns={'hdlngth': 'headlength',
                     'skullw': 'skullweight',
                     'totlngth': 'totallength',
                     'taill': 'taillength',
                     'footlgth': 'footlength',
                     }, inplace=True)

In [169]:
# get the shape of the data to know how many rows and columns it contains
data.shape

(104, 14)

In [170]:
# showing the columns present in the dataset
data.columns

Index(['case', 'site', 'Pop', 'sex', 'age', 'headlength', 'skullweight',
       'totallength', 'taillength', 'footlength', 'earconch', 'eye', 'chest',
       'belly'],
      dtype='object')

In [171]:
# checking the populations/locations present
data['Pop'].unique()

array(['Vic', 'other'], dtype=object)

In [None]:
# get more information on the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   case         104 non-null    int64  
 1   site         104 non-null    int64  
 2   Pop          104 non-null    object 
 3   sex          104 non-null    object 
 4   age          102 non-null    float64
 5   headlength   104 non-null    float64
 6   skullweight  104 non-null    float64
 7   totallength  104 non-null    float64
 8   taillength   104 non-null    float64
 9   footlength   103 non-null    float64
 10  earconch     104 non-null    float64
 11  eye          104 non-null    float64
 12  chest        104 non-null    float64
 13  belly        104 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.5+ KB


In [None]:
# get the statistical summary of the data
data.describe()

Unnamed: 0,case,site,age,headlength,skullweight,totallength,taillength,footlength,earconch,eye,chest,belly
count,104.0,104.0,102.0,104.0,104.0,104.0,104.0,103.0,104.0,104.0,104.0,104.0
mean,52.5,3.625,3.833333,92.602885,56.883654,87.088462,37.009615,68.459223,48.130769,15.046154,27.0,32.586538
std,30.166206,2.349086,1.909244,3.573349,3.113426,4.310549,1.959518,4.395306,4.10938,1.050374,2.045597,2.761949
min,1.0,1.0,1.0,82.5,50.0,75.0,32.0,60.3,40.3,12.8,22.0,25.0
25%,26.75,1.0,2.25,90.675,54.975,84.0,35.875,64.6,44.8,14.4,25.5,31.0
50%,52.5,3.0,3.0,92.8,56.35,88.0,37.0,68.0,46.8,14.9,27.0,32.5
75%,78.25,6.0,5.0,94.725,58.1,90.0,38.0,72.5,52.0,15.725,28.0,34.125
max,104.0,7.0,9.0,103.1,68.6,96.5,43.0,77.9,56.2,17.8,32.0,40.0


#### **Data Cleaning**

In [174]:
# check for missing values
data.isnull().sum()

case           0
site           0
Pop            0
sex            0
age            2
headlength     0
skullweight    0
totallength    0
taillength     0
footlength     1
earconch       0
eye            0
chest          0
belly          0
dtype: int64

In [175]:
#removing rows with missing values
cd  = data.dropna(inplace=True)
cd = data.reset_index(drop=True)

In [176]:
cd.shape

(101, 14)

#### **Descriptive Data Analysis**

##### **Measures of Centre**

In [177]:
cd.head()

Unnamed: 0,case,site,Pop,sex,age,headlength,skullweight,totallength,taillength,footlength,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


**Getting the Measures of Center based on their Sex**

In [179]:
#mean of male possums
cd.groupby('sex')['headlength'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f,42.0,92.238095,2.537903,84.7,91.075,92.5,93.9,96.9
m,59.0,93.081356,4.06119,82.5,90.7,93.3,95.4,103.1


In [180]:
cd.groupby('sex')['skullweight'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f,42.0,56.578571,2.599112,51.5,55.1,56.35,57.675,67.7
m,59.0,57.232203,3.411969,50.0,55.05,56.6,59.0,68.6


In [186]:
cd.groupby('sex')['totallength'].mean()
cd.groupby('sex')['totallength'].median()
cd.groupby('sex')['totallength'].agg(pd.Series.mode())

TypeError: Series.mode() missing 1 required positional argument: 'self'

In [None]:
cd.groupby('sex')['taill'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f,42.0,37.119048,1.850542,32.0,36.0,37.75,38.375,41.0
m,59.0,37.0,2.067816,32.0,35.75,36.5,38.0,43.0


In [183]:
cd.groupby('sex')['footlength'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f,42.0,69.111905,4.911321,60.3,64.85,70.45,72.8,77.9
m,59.0,67.889831,3.987683,62.0,64.5,66.5,71.5,75.0


In [184]:
cd.groupby('sex')['earconch'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f,42.0,48.77381,4.123819,41.3,45.15,50.8,52.35,53.9
m,59.0,47.677966,3.986921,41.7,44.8,46.2,51.7,56.2


In [None]:
cd.groupby('sex')['eye'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f,42.0,14.785714,1.028276,13.0,14.05,14.75,15.4,17.4
m,59.0,15.238983,1.047825,12.8,14.5,15.0,15.95,17.8


In [None]:
cd.groupby('sex')['chest'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f,42.0,27.345238,1.862624,23.0,26.0,28.0,28.5,31.0
m,59.0,26.864407,2.118943,22.0,25.5,27.0,28.0,32.0


In [None]:
cd.groupby('sex')['belly'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f,42.0,32.940476,2.940886,25.0,31.5,33.0,34.0,40.0
m,59.0,32.423729,2.569405,27.0,30.75,32.0,35.0,38.0


**Getting the Measures of Center based on their Population**

In [None]:
cd.groupby('Pop')['skullw'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Vic,43.0,56.818605,2.499929,51.5,55.45,56.3,57.95,67.7
other,58.0,57.065517,3.501055,50.0,54.85,56.4,58.85,68.6


In [None]:
cd.groupby('Pop')['hdlngth'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Vic,43.0,92.897674,2.475257,84.7,91.7,93.3,94.55,96.3
other,58.0,92.606897,4.143728,82.5,90.025,92.4,95.025,103.1


In [None]:
cd.groupby('Pop')['skullw'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Vic,43.0,56.818605,2.499929,51.5,55.45,56.3,57.95,67.7
other,58.0,57.065517,3.501055,50.0,54.85,56.4,58.85,68.6


In [None]:
cd.groupby('Pop')['totlngth'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Vic,43.0,87.918605,4.588975,75.0,85.25,89.0,91.0,96.5
other,58.0,86.787931,3.850909,80.5,84.0,86.5,89.0,96.0


In [None]:
cd.groupby('Pop')['taill'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Vic,43.0,35.953488,1.768824,32.0,35.0,36.0,37.0,39.5
other,58.0,37.862069,1.713704,34.0,36.5,38.0,38.5,43.0


In [None]:
cd.groupby('Pop')['earconch'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Vic,43.0,52.476744,1.497114,49.4,51.5,52.2,53.4,56.2
other,58.0,44.913793,1.578968,41.3,43.825,44.9,46.0,48.0


In [None]:
cd.groupby('Pop')['footlgth'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Vic,43.0,72.437209,2.62216,62.7,71.1,72.8,73.75,77.9
other,58.0,65.403448,2.770329,60.3,63.275,65.2,66.65,73.2


In [None]:
cd.groupby('Pop')['eye'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Vic,43.0,14.869767,0.786958,13.0,14.4,14.9,15.35,16.4
other,58.0,15.184483,1.211176,12.8,14.425,15.0,16.0,17.8


In [None]:
cd.groupby('Pop')['chest'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Vic,43.0,27.627907,1.796468,23.0,27.0,28.0,28.5,31.0
other,58.0,26.646552,2.090095,22.0,25.125,26.0,28.0,32.0


In [None]:
cd.groupby('Pop')['belly'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Vic,43.0,32.790698,2.697513,25.0,31.5,33.0,34.25,40.0
other,58.0,32.525862,2.76796,27.0,30.625,32.5,34.0,39.0


**Interpretation of the Measures of Center**



#### **Measures of Spread**


In [None]:
# To get the Interquartile Range


### **Generating Visualizations**


In [None]:
fig, axes = plt.subplots()


for info in ['male', 'female']:


KeyError: 'Sex'