In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from statsmodels.discrete.discrete_model import Poisson
from sklearn.metrics import mean_squared_error as mse

In [2]:
# Load the BMI dataset from the Excel workbook (path is relative to the working directory).
df = pd.read_excel('bmi_data.xlsx')
# Display the loaded frame as the cell output.
df

Unnamed: 0,Gender,Height,Weight,BMI,Number
0,Male,198,50,12.753801,1
1,Female,198,50,12.753801,2
2,Female,196,50,13.015410,3
3,Female,190,50,13.850416,4
4,Male,190,50,13.850416,5
...,...,...,...,...,...
495,Female,140,146,74.489796,496
496,Male,140,146,74.489796,497
497,Male,145,160,76.099881,498
498,Male,140,152,77.551020,499


In [3]:
# Bin every subject into a (height, weight) grid and count subjects per cell:
# overall, and separately for even- and odd-numbered subjects.
height_number_of_bins = 10
weight_number_of_bins = 10

height_column = df['Height']
weight_column = df['Weight']
number_column = df['Number']

# Column minima are computed once here instead of once per row.
height_min = height_column.min()
weight_min = weight_column.min()

# Bin widths. The "+ 1" widens the covered range just past the maximum, so the
# largest observation maps to index number_of_bins - 1 instead of number_of_bins.
height_range_step = (height_column.max() - height_min + 1)/height_number_of_bins
weight_range_step = (weight_column.max() - weight_min + 1)/weight_number_of_bins

# Create grid of bins: first axis is the height bin, second axis is the weight bin
grid_dim  = (height_number_of_bins, weight_number_of_bins)
bins_grid = np.zeros(grid_dim)
bins_grid_even = np.zeros(grid_dim)
bins_grid_odd  = np.zeros(grid_dim)

# Bin index of every row, computed positionally on the underlying arrays so the
# result does not depend on df carrying a default 0..n-1 index.
grid_bin_height_idx = ((np.asarray(height_column) - height_min) // height_range_step).astype(int)
grid_bin_weight_idx = ((np.asarray(weight_column) - weight_min) // weight_range_step).astype(int)

# Subjects are split by the parity of their 'Number' value.
is_even = np.asarray(number_column) % 2 == 0

# np.add.at accumulates repeated (row, col) pairs; a plain fancy-indexed "+= 1"
# would only count each distinct cell once.
np.add.at(bins_grid, (grid_bin_height_idx, grid_bin_weight_idx), 1)
np.add.at(bins_grid_even, (grid_bin_height_idx[is_even], grid_bin_weight_idx[is_even]), 1)
np.add.at(bins_grid_odd, (grid_bin_height_idx[~is_even], grid_bin_weight_idx[~is_even]), 1)

bins_grid

array([[ 1.,  3.,  7.,  8.,  1.,  2.,  3.,  6.,  8.,  3.],
       [ 7.,  8.,  2.,  6.,  5.,  4.,  3.,  1.,  4.,  5.],
       [ 9.,  3.,  5.,  3.,  7.,  9.,  2.,  4.,  6.,  2.],
       [ 4.,  4.,  3.,  1.,  6.,  4.,  6.,  5.,  5.,  9.],
       [ 7.,  7.,  5.,  5.,  6.,  4.,  4.,  4., 10.,  8.],
       [ 2.,  3.,  4.,  4.,  6.,  7.,  3.,  4.,  6.,  3.],
       [10.,  3.,  9.,  4.,  4.,  7.,  9.,  2.,  5.,  9.],
       [ 5.,  4.,  8.,  8.,  8.,  3.,  7.,  8., 11.,  7.],
       [ 7.,  5.,  5.,  3.,  9.,  3.,  5.,  4.,  4.,  2.],
       [ 4.,  4.,  1.,  2.,  4.,  7.,  4.,  3.,  3.,  4.]])

In [4]:
# Value attached to each bin for the Poisson Regression rows:
# the column minimum plus a whole number of bin widths (the bin's lower edge).
height_start = df['Height'].min()
weight_start = df['Weight'].min()

height_bin_values = [height_start + height_range_step*k for k in range(height_number_of_bins)]
weight_bin_values = [weight_start + weight_range_step*k for k in range(weight_number_of_bins)]

print(height_bin_values)
print(weight_bin_values)

[140.0, 146.0, 152.0, 158.0, 164.0, 170.0, 176.0, 182.0, 188.0, 194.0]
[50.0, 61.1, 72.2, 83.3, 94.4, 105.5, 116.6, 127.7, 138.8, 149.89999999999998]


In [5]:
def _grid_to_rows(grid):
    """Flatten a bin-count grid into [height, weight, count] rows.

    Rows are emitted height-bin by height-bin, and within each height bin
    weight-bin by weight-bin, pairing each count with its bin values.
    """
    return [
        [height_bin_values[row], weight_bin_values[col], grid[row][col]]
        for row in range(height_number_of_bins)
        for col in range(weight_number_of_bins)
    ]


bmi_data      = _grid_to_rows(bins_grid)
bmi_data_even = _grid_to_rows(bins_grid_even)
bmi_data_odd  = _grid_to_rows(bins_grid_odd)

count_columns = ['Height', 'Weight', 'Count']

# One frame for all subjects, one for even-numbered and one for odd-numbered subjects
bmi_df      = pd.DataFrame(bmi_data, columns = count_columns)
bmi_df_even = pd.DataFrame(bmi_data_even, columns = count_columns)
bmi_df_odd  = pd.DataFrame(bmi_data_odd, columns = count_columns)

bmi_df

Unnamed: 0,Height,Weight,Count
0,140.0,50.0,1.0
1,140.0,61.1,3.0
2,140.0,72.2,7.0
3,140.0,83.3,8.0
4,140.0,94.4,1.0
...,...,...,...
95,194.0,105.5,7.0
96,194.0,116.6,4.0
97,194.0,127.7,3.0
98,194.0,138.8,3.0


In [6]:
# Add the quadratic and interaction terms (h^2, w^2, hw) to both the
# even-numbered and the odd-numbered data, in that order.
for split_df in (bmi_df_even, bmi_df_odd):
    split_df['h^2'] = split_df['Height']**2
    split_df['w^2'] = split_df['Weight']**2
    split_df['hw'] = split_df['Height'] * split_df['Weight']

In [7]:
# Fit a Poisson Regression of the even-numbered bin counts on the bin values
# and their quadratic/interaction terms.
# NOTE(review): the design matrix is exactly the five columns below; no constant
# column is among them and statsmodels does not add an intercept on its own --
# confirm that an intercept-free model is intended.
even_design = bmi_df_even[['Height','Weight','h^2','w^2','hw']]
even_counts = bmi_df_even['Count']
poi_fit_even = Poisson(even_counts, even_design).fit()

Optimization terminated successfully.
         Current function value: 1.754953
         Iterations 6


In [10]:
# Predict the odd-numbered counts with the model fitted on the even-numbered data
odd_design = bmi_df_odd[['Height','Weight','h^2','w^2','hw']]
bmi_df_odd['Poi_Predict'] = poi_fit_even.predict(odd_design)

# Side-by-side table: observed odd counts, model prediction, observed even counts
count_df = (
    bmi_df_odd[['Count', 'Poi_Predict']]
    .rename(columns={'Count':'Odd Counts', 'Poi_Predict':'Odd Predicted (w/ Even model fit)'})
    .assign(**{'Even Counts': bmi_df_even['Count']})
)
count_df

Unnamed: 0,Odd Counts,Odd Predicted (w/ Even model fit),Even Counts
0,0.0,2.630866,1.0
1,2.0,2.518908,1.0
2,5.0,2.429474,2.0
3,6.0,2.360469,2.0
4,0.0,2.310312,1.0
...,...,...,...
95,4.0,2.431182,3.0
96,1.0,2.523756,3.0
97,2.0,2.639146,1.0
98,1.0,2.780134,2.0


In [11]:
# Observed odd-numbered counts: the common reference for both comparisons
odd_observed = bmi_df_odd['Count']

# MSE of observed odd-numbered counts vs. counts predicted by the even-numbered-data fit
MSE_oo_po = mse(odd_observed, bmi_df_odd['Poi_Predict'])

# MSE of observed odd-numbered counts vs. observed even-numbered counts
MSE_oo_oe = mse(odd_observed, bmi_df_even['Count'])

MSE_oo_po, MSE_oo_oe

(3.2589993745018266, 5.08)

#### Based on the MSE results, the Poisson regression model fitted to the even-numbered subjects predicts the odd-numbered bin counts better (MSE ≈ 3.26) than using the raw even-numbered bin counts directly as the prediction (MSE = 5.08).