# Analysis of contingency tables

## Libraries and settings

In [None]:
# Libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2
from scipy.stats import chi2_contingency

# Silence every warning for the rest of the session
# (note: this also hides pandas/scipy warnings raised by later cells)
import warnings
warnings.simplefilter('ignore')

# Show the directory that relative file paths are resolved against
print(os.getcwd())

## Import the apartment data

In [None]:
# Columns to keep from the enriched apartment data set
columns = ['web-scraper-order',
           'address_raw',
           'lat',
           'lon',
           'bfs_number',
           'bfs_name',
           'rooms',
           'area',
           'luxurious',
           'price',
           'price_per_m2',
           'pop_dens',
           'frg_pct',
           'mean_taxable_income',
           'dist_supermarket']

# Read the semicolon-separated file and select the variables
df_orig = pd.read_csv("apartments_data_enriched_cleaned.csv", sep=";")[columns]

# Rename variable 'web-scraper-order' to 'apartment_id'
df_orig = df_orig.rename(columns={'web-scraper-order': 'apartment_id'})

# Remove rows with missing values, then exact duplicate rows.
# The explicit copy makes df an independent frame, so adding columns to it
# later (e.g. df["price_cat"] = ...) does not raise SettingWithCopyWarning.
df = df_orig.dropna().drop_duplicates().copy()

# Only the last expression of a cell is displayed, so show the final frame once
df.head(5)

## Create a new categorical variable based on the price_per_m2

### Plot histogram of price_per_m2

In [None]:
# Histogram of price_per_m2 with 20 equal-width bins
fig = plt.figure(figsize=(7, 4))
n, bins, patches = plt.hist(x=df['price_per_m2'],
                            bins=20,
                            color='#42AD12',
                            alpha=0.5,
                            rwidth=0.95)

# Grid on both axes; the horizontal grid lines are drawn slightly transparent
plt.grid(True)
plt.grid(axis='y', alpha=0.75)

# Plain (non-scientific) number format on the axes
plt.ticklabel_format(style='plain')

# Set labels
plt.xlabel('price_per_m2', fontsize=10, labelpad=10)
plt.ylabel('Frequency', fontsize=10, labelpad=10)
plt.title('Histogram of price_per_m2', fontsize=12, pad=10)

# Set fontsize of tick labels (set once; an earlier fontsize=14 setting
# was always overridden by these calls and has been removed)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

plt.show()

### Show quantiles of price_per_m2

In [None]:
# 10 %, 25 %, 50 %, 75 % and 90 % quantiles of price_per_m2, rounded to one decimal
quantile_levels = [0.10, 0.25, 0.50, 0.75, 0.90]
price_per_m2 = df['price_per_m2']
qt_price = price_per_m2.quantile(q=quantile_levels).round(1)
qt_price

### Create new categorical variable 'price_cat'

In [None]:
# Category labels: each class includes its lower bound and excludes its
# upper bound (e.g. '20 - 24' covers 20 <= price_per_m2 < 25)
labels = ['0 - 19', '20 - 24', '25 - 29', '30 - 34', '>= 35']

# Create new categorical variable.
# right=False makes the bins left-closed ([0, 20), [20, 25), ...) so they
# match the labels; with the default right-closed bins a value of exactly
# 20 would be labelled '0 - 19'.
# The open upper edge (np.inf) keeps very expensive apartments in '>= 35'
# instead of turning them into NaN.
df["price_cat"] = pd.cut(df['price_per_m2'],
                         bins=[0, 20, 25, 30, 35, np.inf],
                         right=False,
                         labels=labels)

# Check values
df[['price_per_m2', 'price_cat']].head(10)

## Create a new categorical variable based on the living area

### Plot histogram of area

In [None]:
# Histogram of the living area with 20 equal-width bins
fig = plt.figure(figsize=(7, 4))
n, bins, patches = plt.hist(x=df['area'],
                            bins=20,
                            color='blue',
                            alpha=0.5,
                            rwidth=0.95)

# Grid on both axes; the horizontal grid lines are drawn slightly transparent
plt.grid(True)
plt.grid(axis='y', alpha=0.75)

# Plain (non-scientific) number format on the axes
plt.ticklabel_format(style='plain')

# Set labels
plt.xlabel('area (m2)', fontsize=10, labelpad=10)
plt.ylabel('Frequency', fontsize=10, labelpad=10)
plt.title('Histogram of area', fontsize=12, pad=10)

# Set fontsize of tick labels (set once; an earlier fontsize=14 setting
# was always overridden by these calls and has been removed)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

plt.show()

### Show quantiles of area

In [None]:
# 10 %, 25 %, 50 %, 75 % and 90 % quantiles of the living area, rounded to one decimal
quantile_levels = [0.10, 0.25, 0.50, 0.75, 0.90]
area = df['area']
qt_area = area.quantile(q=quantile_levels).round(1)
qt_area

### Create new categorical variable 'area_cat'

In [None]:
# Category labels: each class includes its lower bound and excludes its
# upper bound (e.g. '50 - 74' covers 50 <= area < 75)
labels = ['0 - 49', '50 - 74', '75 - 99', '100 - 149', '>= 150']

# Create new categorical variable.
# right=False makes the bins left-closed ([0, 50), [50, 75), ...) so they
# match the labels; with the default right-closed bins an area of exactly
# 50 would be labelled '0 - 49'.
# The open upper edge (np.inf) keeps very large apartments in '>= 150'
# instead of turning them into NaN.
df["area_cat"] = pd.cut(df['area'],
                        bins=[0, 50, 75, 100, 150, np.inf],
                        right=False,
                        labels=labels)

# Check values
df[['area', 'area_cat']].head(10)

## Create a contingency table with the number of apartments per 'area_cat' versus 'price_cat'

In [None]:
# Contingency table (numbers in the table are apartments):
# rows are the area categories, columns are the price categories and each
# cell counts the apartment ids falling into that combination.
# fill_value=0 guarantees that combinations without any apartment show up
# as a count of 0 rather than NaN, which chi2_contingency cannot handle.
tab = pd.pivot_table(df[['apartment_id', 'area_cat', 'price_cat']],
                     index=['area_cat'],
                     columns=['price_cat'],
                     aggfunc='count',
                     fill_value=0)
tab

## Perform a Chi-square test on the contingency table
For details see: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html

In [None]:
# Confidence level (1 - alpha)
prob = 0.95

# Chi-squared test of independence on the contingency table
stat, p, dof, expected = chi2_contingency(tab)

# Critical value of the chi-squared distribution for the given
# confidence level and degrees of freedom
critical = chi2.ppf(prob, dof)

# Print results of the Chi-square test
print('Expected frequencies:', '\n', expected.round(4), '\n')
for label, value in (('Critical value:', critical.round(4)),
                     ('Test statistic:', stat.round(4)),
                     ('Degrees of freedom:', dof)):
    print(label, value, '\n')
print(f'p-value: {p:.4f}')

## Interpretation of the Chi-squared test result

<p>Because the test statistic is larger than the critical value and the p-value is smaller than our significance level alpha = 0.05, we can reject the null hypothesis, i.e. there is a statistically significant (at the 0.05 significance level) association between the price categories and the living-area categories in the contingency table.</p>

## Applying the Chi-squared test to animal behavior

For full study see: https://davida-rosenstrauch.medium.com/applying-the-chi-squared-test-to-animal-behavior-b6cca3f930b8

<img src="https://miro.medium.com/max/640/1*h5JgsNbUy2tuiLgSrkIlyA.png" width="500" align="left"/>

### Contingency table, spotted hyena example

In [None]:
# The numpy array shows how many times each hyena responded (see study above);
# rows are the response types, columns are the sexes
table = np.array([[0, 4],
                  [7, 6],
                  [9, 0],
                  [15, 10]])

# Create a data frame from 'table'.
# The column labels are passed as a flat list: a nested list
# ([['Female', 'Male']]) would create a MultiIndex, so that
# df_hy['Female'] would no longer return a plain Series.
df_hy = pd.DataFrame(table,
                     columns=['Female', 'Male'],
                     index=['Fear', 'Greeting', 'Aggression', 'No Response'])
df_hy

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Collect the footer rows first, then print them one per line:
# OS family, platform name and release, current timestamp and Python version,
# framed by two separator lines
separator = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
footer_rows = [
    (separator,),
    (os.name.upper(),),
    (platform.system(), '|', platform.release()),
    ('Datetime:', timestamp),
    ('Python Version:', python_version()),
    (separator,),
]
for row in footer_rows:
    print(*row)

## Chi-Quadrat Analyse

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Build the contingency table (the numbers in the table are the observed counts)
table = np.array([
    [0, 4],
    [7, 6],
    [9, 0],
    [15, 10],
])

# Run the chi-squared test
stat, p, dof, expected = chi2_contingency(table)

# Print the results of the chi-squared test
print('Expected frequencies:', '\n', expected.round(4), '\n')
for label, value in (('Test statistic:', stat.round(4)),
                     ('Degrees of freedom:', dof)):
    print(label, value, '\n')
print(f'p-value: {p:.4f}')

# Interpret the test result at the 0.05 significance level
alpha = 0.05
is_significant = p < alpha
print('Es besteht ein statistisch signifikanter Zusammenhang zwischen den Kategorien.'
      if is_significant else
      'Es besteht kein statistisch signifikanter Zusammenhang zwischen den Kategorien.')

Expected frequencies: 
 [[ 2.4314  1.5686]
 [ 7.902   5.098 ]
 [ 5.4706  3.5294]
 [15.1961  9.8039]] 

Test statistic: 12.2754 

Degrees of freedom: 3 

p-value: 0.0065
Es besteht ein statistisch signifikanter Zusammenhang zwischen den Kategorien.


## Calculates the expected frequencies of the contingency table

In [2]:
import numpy as np

def calculate_expected_frequencies(observed_table):
    """Return the expected cell frequencies of a two-way contingency table.

    Each expected frequency is (row total * column total) / grand total,
    i.e. the count expected if rows and columns were independent.

    Parameters
    ----------
    observed_table : array-like, two-dimensional
        Observed counts. NumPy arrays as well as nested lists are accepted.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``observed_table``.
    """
    # Accept any 2-D array-like input, not only ndarrays
    observed = np.asarray(observed_table)

    row_totals = observed.sum(axis=1)
    col_totals = observed.sum(axis=0)
    total = observed.sum()

    # The outer product yields row_totals[i] * col_totals[j] for every cell,
    # which replaces the explicit double loop over rows and columns
    return np.outer(row_totals, col_totals) / total

# Example call of the function with the contingency table
# (columns: Female, Male)
observed_table = np.array([
    [0, 4],     # Fear
    [7, 6],     # Greeting
    [9, 0],     # Aggression
    [15, 10],   # No Response
])

expected_frequencies = calculate_expected_frequencies(observed_table)
print('Expected frequencies:', '\n', expected_frequencies.round(4), '\n')

Expected frequencies: 
 [[ 2.4314  1.5686]
 [ 7.902   5.098 ]
 [ 5.4706  3.5294]
 [15.1961  9.8039]] 



## Compare calculated expected frequencies

Eigene Funktion:

Methode: Die erwarteten Frequenzen werden durch eine eigene Funktion berechnet, indem für jede Zelle die Formel (Zeilensumme × Spaltensumme) / Gesamtsumme verwendet wird. Ergebnis: expected_frequencies enthält die berechneten erwarteten Frequenzen für jede Zelle der Kontingenztafel.

Verwendung von chi2_contingency():

Methode: Die erwarteten Frequenzen werden durch die chi2_contingency() Funktion aus der scipy.stats Bibliothek berechnet. Ergebnis: expected enthält die erwarteten Frequenzen für jede Zelle der Kontingenztafel, die von chi2_contingency() berechnet werden.

Vergleich: Beide Methoden liefern identische erwartete Frequenzen (siehe die Ausgaben oben).

## Extend the Python function under c) to additionally provide the Chi-Square test statistic

In [4]:
import numpy as np

def calculate_expected_frequencies_and_chi2(observed_table):
    """Return the expected frequencies and the chi-squared test statistic.

    The expected frequency of a cell is (row total * column total) / grand
    total. The test statistic is the sum over all cells of
    (observed - expected)**2 / expected (Pearson's chi-squared statistic,
    without continuity correction).

    Parameters
    ----------
    observed_table : array-like, two-dimensional
        Observed counts. NumPy arrays as well as nested lists are accepted.

    Returns
    -------
    expected_table : numpy.ndarray
        Float array with the same shape as ``observed_table``.
    chi2_statistic : numpy.float64
        Pearson's chi-squared statistic of the table.
    """
    # Accept any 2-D array-like input, not only ndarrays
    observed = np.asarray(observed_table)

    row_totals = observed.sum(axis=1)
    col_totals = observed.sum(axis=0)
    total = observed.sum()

    # The outer product yields row_totals[i] * col_totals[j] for every cell,
    # which replaces the explicit double loop over rows and columns
    expected_table = np.outer(row_totals, col_totals) / total

    chi2_statistic = np.sum((observed - expected_table) ** 2 / expected_table)

    return expected_table, chi2_statistic

# Example call of the function with the contingency table
# (columns: Female, Male)
observed_table = np.array([
    [0, 4],     # Fear
    [7, 6],     # Greeting
    [9, 0],     # Aggression
    [15, 10],   # No Response
])

expected_frequencies, chi2_statistic = calculate_expected_frequencies_and_chi2(observed_table)

# Show the rounded expected frequencies followed by the unrounded statistic
print('Erwartete Frequenzen:', '\n', expected_frequencies.round(4), '\n')
print('Chi-Quadrat-Test-Statistik:', chi2_statistic)

Erwartete Frequenzen: 
 [[ 2.4314  1.5686]
 [ 7.902   5.098 ]
 [ 5.4706  3.5294]
 [15.1961  9.8039]] 

Chi-Quadrat-Test-Statistik: 12.27543424317618


## Compare your calculated Chi-square test statistic with the Chi-Square test statistic from the chi2_contingency() method in Python

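Vergleich der Chi-Quadrat-Teststatistik: Die selbst berechnete Teststatistik (12.27543424317618) stimmt mit dem Wert von chi2_contingency() (12.2754) überein; der einzige ersichtliche Unterschied ist die Rundung bei der Ausgabe.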

## In the Jupyter notebook, state in one sentence whether the results of the original study are correct or not

Die Resultate sollten richtig sein