## 0. Libraries 

In [1]:
import pandas as pd
import matplotlib
import numpy as np  
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency 

## 1. Load Data 

In [2]:
# Read the translated "question 1" survey results from the cleaned-data folder
# (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="data/clean/question_1_translated.csv")

## 2. Data Manipulation

In [3]:
# The rest of the notebook refers to the 'subcategory' column as 'group';
# rename it on the loaded frame itself.
df.rename(columns=dict(subcategory="group"), inplace=True)

In [4]:
def _rows_for_category(frame, category):
    """Return the rows of `frame` whose 'category' column equals `category`."""
    return frame[frame['category'] == category]


def _keep_yes_no(frame):
    """Return only the rows of `frame` whose 'answer' is 'Yes' or 'No'."""
    return frame[frame['answer'].isin(['Yes', 'No'])]


def _contingency_table(frame):
    """Pivot `frame` into a group-by-answer table of 'percentage' values.

    Rows are the 'group' labels, columns are the 'answer' values, and each cell
    is aggregated with pivot_table's default (the mean); missing combinations
    are filled with 0. The index is reset so 'group' is a regular column, which
    is the shape the later `set_index('group')` cell expects.
    """
    table = frame.pivot_table(index='group', columns='answer', values='percentage', fill_value=0)
    return table.reset_index()


def _mean_percentage_by_group(frame):
    """Return a DataFrame with the mean 'percentage' of every 'group' in `frame`."""
    # reset_index must be *called*: without the parentheses the bound method
    # itself would be stored instead of a DataFrame.
    return frame.groupby(["group"])["percentage"].mean().reset_index()


# One sub-frame per demographic category.
age_df = _rows_for_category(df, "Age range")
region_df = _rows_for_category(df, "Region")
race_df = _rows_for_category(df, "Color/race")
education_df = _rows_for_category(df, "Education")
income_df = _rows_for_category(df, "Gross family income")
religion_df = _rows_for_category(df, "Religion")
domicile_df = _rows_for_category(df, "Domicile situation")

# Keep only the 'Yes' / 'No' answers for the contingency tables.
filtered_age_df = _keep_yes_no(age_df)
filtered_region_df = _keep_yes_no(region_df)
filtered_race_df = _keep_yes_no(race_df)
filtered_education_df = _keep_yes_no(education_df)
filtered_income_df = _keep_yes_no(income_df)
filtered_religion_df = _keep_yes_no(religion_df)
filtered_domicile_df = _keep_yes_no(domicile_df)

# Group-by-answer contingency tables ('group' kept as a regular column).
age_contingency_table = _contingency_table(filtered_age_df)
region_contingency_table = _contingency_table(filtered_region_df)
race_contingency_table = _contingency_table(filtered_race_df)
education_contingency_table = _contingency_table(filtered_education_df)
income_contingency_table = _contingency_table(filtered_income_df)
religion_contingency_table = _contingency_table(filtered_religion_df)
domicile_contingency_table = _contingency_table(filtered_domicile_df)

# Mean percentage per group, computed on the unfiltered category frames.
# NOTE(review): the mean is taken over every row of a group, i.e. across all
# answers, not per answer - confirm this is the intended aggregation.
age_avg_df = _mean_percentage_by_group(age_df)
region_avg_df = _mean_percentage_by_group(region_df)
race_avg_df = _mean_percentage_by_group(race_df)
education_avg_df = _mean_percentage_by_group(education_df)
income_avg_df = _mean_percentage_by_group(income_df)
religion_avg_df = _mean_percentage_by_group(religion_df)
domicile_avg_df = _mean_percentage_by_group(domicile_df)



In [5]:
# Make the 'group' labels the row index of every contingency table, in place,
# leaving only the answer columns as data.
# The index-name check keeps this cell safe to re-run: once 'group' has become
# the index it is no longer a column, and calling set_index('group') again
# would raise KeyError.
for _table in (
    age_contingency_table,
    region_contingency_table,
    race_contingency_table,
    education_contingency_table,
    income_contingency_table,
    religion_contingency_table,
    domicile_contingency_table,
):
    if _table.index.name == 'group':
        continue  # already indexed by an earlier run of this cell
    _table.set_index('group', inplace=True)

In [10]:
# Chi-square test of independence for each demographic category. Only the
# p-value (second element of the result) is kept; the statistic, degrees of
# freedom and expected frequencies are discarded.
# NOTE(review): chi2_contingency is documented for observed frequencies
# (counts), while these tables were pivoted from the 'percentage' column -
# confirm the resulting p-values are meaningful.
age_p = chi2_contingency(age_contingency_table)[1]
region_p = chi2_contingency(region_contingency_table)[1]
race_p = chi2_contingency(race_contingency_table)[1]
education_p = chi2_contingency(education_contingency_table)[1]
income_p = chi2_contingency(income_contingency_table)[1]
religion_p = chi2_contingency(religion_contingency_table)[1]
domicile_p = chi2_contingency(domicile_contingency_table)[1]


## Question 1

'Você já sofreu algum tipo de violência doméstica ou familiar provocada por um homem?'

'Have you ever suffered any type of domestic or family violence caused by a man?'

In [115]:
# Number of rows available for each demographic category.
df["category"].value_counts()

category
Age range              145
Region                 145
Education               87
Color/race              60
Gross family income      9
Religion                 9
Domicile situation       6
Name: count, dtype: int64

#### Hypothesis Test 

In [12]:
# The two competing statements of the chi-square independence test.
null_hypothesis = "There is no relationship between group and response type (they are independent)."  # H₀
alt_hypothesis = "There is a relationship between group and response type (they are dependent)."  # H₁

# Show both hypotheses, one per line.
print(
    "Null Hypothesis (H₀): " + null_hypothesis,
    "Alternate Hypothesis (H₁): " + alt_hypothesis,
    sep="\n",
)

Null Hypothesis (H₀): There is no relationship between group and response type (they are independent).
Alternate Hypothesis (H₁): There is a relationship between group and response type (they are dependent).


In [20]:
# Map a short label for each demographic category to its group-by-answer
# contingency table (built and indexed by 'group' in the earlier cells).
# NOTE(review): chi2_contingency is documented for observed frequencies
# (counts), while these tables were pivoted from the 'percentage' column -
# confirm the test results below are meaningful.
contingency_tables = dict(
    Age=age_contingency_table,
    Region=region_contingency_table,
    Race=race_contingency_table,
    Education=education_contingency_table,
    Income=income_contingency_table,
    Religion=religion_contingency_table,
    Domicile=domicile_contingency_table,
)

# Run the chi-square test of independence on every table and report the
# statistic, the p-value and the decision at the 5% significance level.
for category, table in contingency_tables.items():
    # Only the first two elements (statistic, p-value) of the result are needed.
    chi2_stat, p_value = chi2_contingency(table)[:2]

    if p_value < 0.05:
        verdict = "Reject the null hypothesis; there is a significant relationship between the variables."
    else:
        verdict = "Fail to reject the null hypothesis; there is no significant relationship between the variables."

    # The trailing empty string yields the blank separator line between categories.
    print(
        f"Results for {category}:",
        f"Chi-Square Statistic: {chi2_stat:.4f}",
        f"P-value: {p_value:.4f}",
        verdict,
        "",
        sep="\n",
    )

Results for Age:
Chi-Square Statistic: 3.0495
P-value: 0.5496
Fail to reject the null hypothesis; there is no significant relationship between the variables.

Results for Region:
Chi-Square Statistic: 0.0527
P-value: 0.9997
Fail to reject the null hypothesis; there is no significant relationship between the variables.

Results for Race:
Chi-Square Statistic: 1.1439
P-value: 0.5644
Fail to reject the null hypothesis; there is no significant relationship between the variables.

Results for Education:
Chi-Square Statistic: 3.6723
P-value: 0.1594
Fail to reject the null hypothesis; there is no significant relationship between the variables.

Results for Income:
Chi-Square Statistic: 5.6299
P-value: 0.0599
Fail to reject the null hypothesis; there is no significant relationship between the variables.

Results for Religion:
Chi-Square Statistic: 2.8327
P-value: 0.2426
Fail to reject the null hypothesis; there is no significant relationship between the variables.

Results for Domicile:
Chi-Sq