<a href="https://colab.research.google.com/github/AbdulMominAlam/DSA/blob/main/Hypothesis_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1) Import necessary libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm

# 2) Load and Clean Datasets
# Read the two input CSVs from the /content directory.
macro_raw = pd.read_csv('/content/macroog.csv')
oecd = pd.read_csv('/content/oecd_cleaned.csv')

# Clean macro data
# Shorten the identifier column names before reshaping.
macro_raw = macro_raw.rename(
    columns={'Country Name': 'Country', 'Series Name': 'Indicator'}
)

# Year columns are the ones whose header carries the '[YR' marker.
year_cols = list(filter(lambda header: '[YR' in header, macro_raw.columns))

# Wide -> long: one row per (Country, Indicator, year column) observation.
macro_long = pd.melt(
    macro_raw,
    id_vars=['Country', 'Indicator'],
    value_vars=year_cols,
    var_name='Year',
    value_name='Value',
)

# Keep the first 4-digit run of each year header as an integer year.
macro_long['Year'] = (
    macro_long['Year'].str.extract(r'(\d{4})', expand=False).astype(int)
)
# Entries that cannot be parsed as numbers become NaN; rows with any missing
# field are then discarded.
macro_long['Value'] = pd.to_numeric(macro_long['Value'], errors='coerce')
macro_long = macro_long.dropna()

# Average duplicate (Country, Year, Indicator) rows so the pivot below has a
# unique value per cell.
macro_wide = (
    macro_long
    .groupby(['Country', 'Year', 'Indicator'], as_index=False)['Value']
    .mean()
)

# Long -> wide: one column per indicator, then keep only the three indicators
# used by the hypothesis tests, under short names.
macro_pivot = (
    macro_wide
    .pivot(index=['Country', 'Year'], columns='Indicator', values='Value')
    .reset_index()
    .rename(columns={
        'GDP growth (annual %)': 'GDP_Growth',
        'Inflation, consumer prices (annual %)': 'Inflation',
        'Unemployment, total (% of total labor force) (national estimate)': 'Unemployment',
    })
    .loc[:, ['Country', 'Year', 'GDP_Growth', 'Inflation', 'Unemployment']]
)

# Clean OECD data
# Rename to the column names shared with the macro table, coerce the year and
# spending columns to numbers (unparseable entries become NaN), and keep only
# complete rows of the three columns needed for the merge.
oecd = oecd.rename(
    columns={'TIME_PERIOD': 'Year', 'Value': 'R&D_Spend_MillionUSD'}
)
oecd = oecd.assign(**{
    name: pd.to_numeric(oecd[name], errors='coerce')
    for name in ('Year', 'R&D_Spend_MillionUSD')
})
oecd = oecd.loc[:, ['Country', 'Year', 'R&D_Spend_MillionUSD']].dropna()

# Merge datasets
# Inner join keeps only (Country, Year) pairs present in both tables; rows
# still missing any indicator after the join are dropped.
merged = (
    oecd
    .merge(macro_pivot, how='inner', on=['Country', 'Year'])
    .dropna()
)

# 3) Hypothesis Tests

# Test 1: Correlation between Unemployment and R&D Spending
# Pearson's r and its two-sided p-value over all merged country-year rows.
corr_unemployment, pval_unemployment = stats.pearsonr(
    merged.loc[:, 'Unemployment'],
    merged.loc[:, 'R&D_Spend_MillionUSD'],
)
print(
    "Test 1 - Pearson Correlation (Unemployment vs R&D): "
    f"r = {corr_unemployment:.4f}, p-value = {pval_unemployment:.4f}"
)

# Test 2: Two-sample t-test for High vs. Low Inflation groups
# Split R&D spending at the median inflation value: strictly above the median
# is "high", at or below is "low".
median_inflation = merged['Inflation'].median()
group_high = merged.loc[
    merged['Inflation'].gt(median_inflation), 'R&D_Spend_MillionUSD'
]
group_low = merged.loc[
    merged['Inflation'].le(median_inflation), 'R&D_Spend_MillionUSD'
]
# equal_var=False selects Welch's t-test (no equal-variance assumption).
t_stat, p_ttest = stats.ttest_ind(group_high, group_low, equal_var=False)
print(
    "Test 2 - Two-sample t-test (High vs Low Inflation): "
    f"t-statistic = {t_stat:.4f}, p-value = {p_ttest:.4f}"
)

# Test 3: Comparison of Coefficients for Inflation vs GDP Growth
# Fit one OLS regression of R&D spending on both indicators (plus intercept),
# then test H0: beta_Inflation == beta_GDP_Growth.
X = sm.add_constant(merged[['GDP_Growth', 'Inflation']])
y = merged['R&D_Spend_MillionUSD']
model = sm.OLS(y, X).fit()

coef_gdp = model.params['GDP_Growth']
coef_inflation = model.params['Inflation']
se_gdp = model.bse['GDP_Growth']
se_inflation = model.bse['Inflation']

# Calculate t-statistic for difference
# Both coefficients are estimated from the same regression, so they are
# correlated. The variance of their difference is
#     Var(b_inf) + Var(b_gdp) - 2 * Cov(b_inf, b_gdp);
# omitting the covariance term gives a wrong standard error.
cov_gdp_inflation = model.cov_params().loc['GDP_Growth', 'Inflation']
diff = coef_inflation - coef_gdp
se_diff = np.sqrt(se_gdp**2 + se_inflation**2 - 2 * cov_gdp_inflation)
t_diff = diff / se_diff
# Use the model's residual degrees of freedom rather than a hard-coded n - 3,
# and the survival function, which keeps precision for very small p-values
# where 1 - cdf would round to zero.
p_diff = 2 * stats.t.sf(abs(t_diff), df=model.df_resid)

print(f"Test 3 - Coefficient Comparison (Inflation vs GDP Growth): t-statistic = {t_diff:.4f}, p-value = {p_diff:.4f}")
