In [1]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
# Remote CSV inputs, listed in the order they are unpacked below.
# Later cells join these tables on their 'Country' column.
DATA_URLS = (
    'https://raw.githubusercontent.com/ChacoGolden/Stat-Data-Science/refs/heads/main/cross_sect_inc_waste_edu_incxedu.csv',
    'https://raw.githubusercontent.com/ChacoGolden/Stat-Data-Science/refs/heads/main/env_awareness.csv',
    'https://raw.githubusercontent.com/ChacoGolden/Stat-Data-Science/refs/heads/main/ave_infl_2.csv',
    'https://raw.githubusercontent.com/ChacoGolden/Stat-Data-Science/refs/heads/main/Urbanization.csv',
)

# One DataFrame per URL, read in sequence.
cross_sect, env_awareness, ave_infl, urban_perc = (pd.read_csv(url) for url in DATA_URLS)

In [3]:
# Country names that appear in cross_sect but have no counterpart in env_awareness
cross_sect_names = set(cross_sect['Country'])
awareness_names = set(env_awareness['Country'])
missing_countries = cross_sect_names.difference(awareness_names)

# Report the unmatched names
print("Countries in cross_sect but missing in env_awareness:")
print(missing_countries)

Countries in cross_sect but missing in env_awareness:
{'Türkiye', 'Norway'}


In [4]:
# Attach env_awareness to every cross_sect row (left join on 'Country'),
# then keep only the rows that ended up with a 'yes_perc' value.
merged_data = (
    cross_sect
    .merge(env_awareness, on='Country', how='left')
    .dropna(subset=['yes_perc'])
)

# Show the first rows of the result
print("Preview of the dataset after dropping missing values in 'yes_perc':")
print(merged_data.head())


Preview of the dataset after dropping missing values in 'yes_perc':
    Country  Income Average  Waste Average  Average_Education_Level  \
0   Belgium         24451.9     447.059027                   0.3627   
1  Bulgaria          4286.8     400.376196                   0.2510   
2   Czechia          9730.2     443.899034                   0.2167   
3   Denmark         30502.6     607.350966                   0.3279   
4   Germany         23197.7     452.967933                   0.2595   

   interaction  yes_perc  
0   8868.70413      0.63  
1   1075.98680      0.47  
2   2108.53434      0.51  
3  10001.80254      0.77  
4   6019.80315      0.68  


In [5]:
# Regressors for the levels specification
predictor_cols = ['Income Average', 'Average_Education_Level', 'interaction', 'yes_perc']

# Outcome (Y) and design matrix (X); add_constant supplies the intercept column
Y = merged_data['Waste Average']
X = sm.add_constant(merged_data[predictor_cols])

# OLS fit requesting the HC3 covariance estimator
model = sm.OLS(endog=Y, exog=X).fit(cov_type='HC3')

# Display the regression table
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:          Waste Average   R-squared:                       0.495
Model:                            OLS   Adj. R-squared:                  0.389
Method:                 Least Squares   F-statistic:                     3.817
Date:                Wed, 27 Nov 2024   Prob (F-statistic):             0.0193
Time:                        12:53:59   Log-Likelihood:                -131.25
No. Observations:                  24   AIC:                             272.5
Df Residuals:                      19   BIC:                             278.4
Df Model:                           4                                         
Covariance Type:                  HC3                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [6]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One VIF per column of the design matrix X (the constant column is included)
vif_data = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})

print(vif_data)


                  Variable        VIF
0                    const  86.940668
1           Income Average  28.993519
2  Average_Education_Level   4.662236
3              interaction  40.041426
4                 yes_perc   2.358811


In [7]:
# Single-regressor specification: Waste Average on yes_perc plus an intercept
X_simple = merged_data[['yes_perc']]
X_simple = sm.add_constant(X_simple)

# Same HC3 covariance request as the multi-regressor fit
model_simple = sm.OLS(endog=Y, exog=X_simple).fit(cov_type='HC3')
print(model_simple.summary())

                            OLS Regression Results                            
Dep. Variable:          Waste Average   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                 -0.019
Method:                 Least Squares   F-statistic:                    0.3625
Date:                Wed, 27 Nov 2024   Prob (F-statistic):              0.553
Time:                        12:54:17   Log-Likelihood:                -139.13
No. Observations:                  24   AIC:                             282.3
Df Residuals:                      22   BIC:                             284.6
Df Model:                           1                                         
Covariance Type:                  HC3                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        348.9923    100.912      3.458      0.0

In [8]:
import numpy as np

# Add log-transformed copies of the waste and yes_perc columns to merged_data
for log_col, raw_col in (('Log_Waste', 'Waste Average'), ('Log_yes_perc', 'yes_perc')):
    merged_data[log_col] = np.log(merged_data[raw_col])

# Log-waste regressed on log(yes_perc) and the income/education terms
log_spec_cols = ['Log_yes_perc', 'Income Average', 'Average_Education_Level', 'interaction']
Y_log = merged_data['Log_Waste']
X_log = sm.add_constant(merged_data[log_spec_cols])
model_log = sm.OLS(endog=Y_log, exog=X_log).fit(cov_type='HC3')
print(model_log.summary())


                            OLS Regression Results                            
Dep. Variable:              Log_Waste   R-squared:                       0.532
Model:                            OLS   Adj. R-squared:                  0.433
Method:                 Least Squares   F-statistic:                     3.126
Date:                Wed, 27 Nov 2024   Prob (F-statistic):             0.0391
Time:                        12:54:26   Log-Likelihood:                 12.250
No. Observations:                  24   AIC:                            -14.50
Df Residuals:                      19   BIC:                            -8.610
Df Model:                           4                                         
Covariance Type:                  HC3                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [9]:
# Log-scale specification with yes_perc left out of the regressors
baseline_cols = ['Income Average', 'Average_Education_Level', 'interaction']
Y_log = merged_data['Log_Waste']
X_no_yesperc = sm.add_constant(merged_data[baseline_cols])

# NOTE(review): no cov_type is passed, so the summary reports nonrobust standard
# errors, whereas the fits with yes_perc request HC3 — confirm this is intended.
model_no_yesperc = sm.OLS(endog=Y_log, exog=X_no_yesperc).fit()

# Display the regression table
print(model_no_yesperc.summary())


                            OLS Regression Results                            
Dep. Variable:              Log_Waste   R-squared:                       0.475
Model:                            OLS   Adj. R-squared:                  0.397
Method:                 Least Squares   F-statistic:                     6.039
Date:                Wed, 27 Nov 2024   Prob (F-statistic):            0.00423
Time:                        12:54:45   Log-Likelihood:                 10.881
No. Observations:                  24   AIC:                            -13.76
Df Residuals:                      20   BIC:                            -9.050
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [10]:
# Log-waste outcome with yes_perc entered in levels (not log-transformed)
yesperc_cols = ['yes_perc', 'Income Average', 'Average_Education_Level', 'interaction']
Y_log = merged_data['Log_Waste']
X_yesperc = sm.add_constant(merged_data[yesperc_cols])

# OLS fit requesting the HC3 covariance estimator
model_yesperc = sm.OLS(endog=Y_log, exog=X_yesperc).fit(cov_type='HC3')

# Display the regression table
print(model_yesperc.summary())


                            OLS Regression Results                            
Dep. Variable:              Log_Waste   R-squared:                       0.517
Model:                            OLS   Adj. R-squared:                  0.416
Method:                 Least Squares   F-statistic:                     3.012
Date:                Wed, 27 Nov 2024   Prob (F-statistic):             0.0441
Time:                        12:55:16   Log-Likelihood:                 11.880
No. Observations:                  24   AIC:                            -13.76
Df Residuals:                      19   BIC:                            -7.870
Df Model:                           4                                         
Covariance Type:                  HC3                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [11]:
# Merge the inflation data into merged_data (left join on 'Country').
#
# This cell reassigns merged_data from itself, so running it more than once
# would otherwise collide with the inflation column added by the previous run:
# pandas would emit 'Average_x'/'Average_y', or a later rename would create a
# second 'Average Inflation' column. Dropping any inflation column that is
# already present keeps the cell idempotent; on a first run nothing is dropped
# and the result is identical to a plain merge.
merged_data = pd.merge(
    merged_data.drop(columns=['Average', 'Average Inflation'], errors='ignore'),
    ave_infl,
    on='Country',
    how='left',
)

# Preview the updated dataset
print("Preview of merged_data with added inflation data:")
print(merged_data.head())

Preview of merged_data with added inflation data:
    Country  Income Average  Waste Average  Average_Education_Level  \
0   Belgium         24451.9     447.059027                   0.3627   
1  Bulgaria          4286.8     400.376196                   0.2510   
2   Czechia          9730.2     443.899034                   0.2167   
3   Denmark         30502.6     607.350966                   0.3279   
4   Germany         23197.7     452.967933                   0.2595   

   interaction  yes_perc  Log_Waste  Log_yes_perc  Average  
0   8868.70413      0.63   6.102691     -0.462035   0.0248  
1   1075.98680      0.47   5.992405     -0.755023   0.0279  
2   2108.53434      0.51   6.095597     -0.673345   0.0417  
3  10001.80254      0.77   6.409107     -0.261365   0.0172  
4   6019.80315      0.68   6.115821     -0.385662   0.0252  


In [15]:
# Give the merged inflation column ('Average') a descriptive name
merged_data = merged_data.rename({'Average': 'Average Inflation'}, axis='columns')

In [16]:
# Log-waste specification extended with average inflation as an extra regressor
inflation_spec_cols = [
    'yes_perc',
    'Income Average',
    'Average_Education_Level',
    'interaction',
    'Average Inflation',
]
Y_log = merged_data['Log_Waste']
X_with_inflation = sm.add_constant(merged_data[inflation_spec_cols])

# Fitted without a cov_type argument (the summary reports nonrobust standard errors)
model_with_inflation = sm.OLS(endog=Y_log, exog=X_with_inflation).fit()

# Display the regression table
print(model_with_inflation.summary())


                            OLS Regression Results                            
Dep. Variable:              Log_Waste   R-squared:                       0.519
Model:                            OLS   Adj. R-squared:                  0.385
Method:                 Least Squares   F-statistic:                     3.882
Date:                Wed, 27 Nov 2024   Prob (F-statistic):             0.0146
Time:                        12:58:42   Log-Likelihood:                 11.920
No. Observations:                  24   AIC:                            -11.84
Df Residuals:                      18   BIC:                            -4.773
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [17]:
# Pairwise correlation between income and average inflation
income_inflation = merged_data[['Income Average', 'Average Inflation']]
print(income_inflation.corr())


                   Income Average  Average Inflation
Income Average           1.000000          -0.609649
Average Inflation       -0.609649           1.000000


In [18]:
# Quadratic inflation term, stored as a new column on merged_data
merged_data['inflation_squared'] = merged_data['Average Inflation'].pow(2)

# Same regressors as the inflation specification, plus the squared term.
# Y_log is reused from the preceding regression cell.
squared_spec_cols = [
    'yes_perc',
    'Income Average',
    'Average_Education_Level',
    'interaction',
    'Average Inflation',
    'inflation_squared',
]
X_with_inflation_squared = sm.add_constant(merged_data[squared_spec_cols])
model_inflation_squared = sm.OLS(endog=Y_log, exog=X_with_inflation_squared).fit()
print(model_inflation_squared.summary())

                            OLS Regression Results                            
Dep. Variable:              Log_Waste   R-squared:                       0.549
Model:                            OLS   Adj. R-squared:                  0.390
Method:                 Least Squares   F-statistic:                     3.454
Date:                Wed, 27 Nov 2024   Prob (F-statistic):             0.0204
Time:                        12:59:33   Log-Likelihood:                 12.707
No. Observations:                  24   AIC:                            -11.41
Df Residuals:                      17   BIC:                            -3.167
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [23]:
# Step 1: Country names shared by merged_data and urban_perc
merged_names = set(merged_data['Country'])
urban_names = set(urban_perc['Country'])
common_countries = merged_names.intersection(urban_names)
print(f"Number of countries in common: {len(common_countries)}")

Number of countries in common: 24


In [24]:
# Step 2: Filter urban_perc to retain only countries in merged_data
urban_perc_filtered = urban_perc[urban_perc['Country'].isin(merged_data['Country'])]

# Step 3: Merge the filtered urban_perc dataset into merged_data (left join on 'Country').
#
# merged_data is reassigned from itself, so a second run of this cell would
# find the urbanization column(s) already present and pandas would suffix them
# ('..._x'/'..._y'), breaking later column lookups. Dropping any non-key column
# that urban_perc_filtered is about to supply keeps the cell idempotent; on a
# first run the list is empty and the merge is unchanged.
stale_urban_cols = [
    col for col in urban_perc_filtered.columns
    if col != 'Country' and col in merged_data.columns
]
merged_data = pd.merge(
    merged_data.drop(columns=stale_urban_cols),
    urban_perc_filtered,
    on='Country',
    how='left',
)

# Preview the updated merged_data
print("Preview of merged_data with Urban Percentage:")
print(merged_data.head())

Preview of merged_data with Urban Percentage:
    Country  Income Average  Waste Average  Average_Education_Level  \
0   Belgium         24451.9     447.059027                   0.3627   
1  Bulgaria          4286.8     400.376196                   0.2510   
2   Czechia          9730.2     443.899034                   0.2167   
3   Denmark         30502.6     607.350966                   0.3279   
4   Germany         23197.7     452.967933                   0.2595   

   interaction  yes_perc  Log_Waste  Log_yes_perc  Average Inflation  \
0   8868.70413      0.63   6.102691     -0.462035             0.0248   
1   1075.98680      0.47   5.992405     -0.755023             0.0279   
2   2108.53434      0.51   6.095597     -0.673345             0.0417   
3  10001.80254      0.77   6.409107     -0.261365             0.0172   
4   6019.80315      0.68   6.115821     -0.385662             0.0252   

   inflation_squared  Urban Percentage  
0           0.000615          0.979991  
1           

In [25]:
# Log-waste specification with 'Urban Percentage' added as a further control
urban_spec_cols = [
    'Income Average',
    'yes_perc',
    'Average_Education_Level',
    'interaction',
    'Average Inflation',
    'Urban Percentage',
]
Y_log = merged_data['Log_Waste']
X_with_urban = sm.add_constant(merged_data[urban_spec_cols])

# Fitted without a cov_type argument (the summary reports nonrobust standard errors)
model_with_urban = sm.OLS(endog=Y_log, exog=X_with_urban).fit()

# Display the regression table
print(model_with_urban.summary())


                            OLS Regression Results                            
Dep. Variable:              Log_Waste   R-squared:                       0.583
Model:                            OLS   Adj. R-squared:                  0.435
Method:                 Least Squares   F-statistic:                     3.955
Date:                Wed, 27 Nov 2024   Prob (F-statistic):             0.0117
Time:                        13:21:00   Log-Likelihood:                 13.626
No. Observations:                  24   AIC:                            -13.25
Df Residuals:                      17   BIC:                            -5.007
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     