# Linear Regression with Feature Selection

#### X1

In [1]:
# ──────────────────────────────────────────────────────────────
# Linear‑regression model with greedy forward feature selection
# -----------------------------------------------------------------
# 1)  Load the four CSV files saved earlier (same folder as notebook)
# 2)  Greedy forward‑stepwise selection:
#       • start with no predictors
#       • at each step add the feature that yields the largest *train* R² gain
#       • stop when no remaining feature improves train R² by more than 1e-4
# 3)  Fit the final LinearRegression on the chosen subset
# 4)  Report selected features + train / test R²
# -----------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# 1️⃣  Load train / test splits ------------------------------------------------
X1_train = pd.read_csv('X1_train.csv')
X1_test  = pd.read_csv('X1_test.csv')
y_train = pd.read_csv('y_train.csv')['gini_next']
y_test  = pd.read_csv('y_test.csv')['gini_next']

# 2️⃣  Greedy forward feature selection ---------------------------------------
# Minimum train-R² gain a step must deliver to be accepted. This is the
# stopping rule only; it is NOT used when ranking candidates within a step.
MIN_R2_GAIN = 1e-4

remaining = list(X1_train.columns)
selected  = []
best_R2   = -np.inf          # so the first step always accepts its best candidate

# NOTE(review): selection is driven by *train* R², which cannot decrease when a
# predictor is added to an OLS fit, so MIN_R2_GAIN is the only brake on model
# size. Consider a cross-validated score if the train/test gap matters.
while remaining:
    # Score every remaining candidate when appended to the current subset.
    step_scores = {}
    for feat in remaining:
        trial_feats = selected + [feat]
        model = LinearRegression().fit(X1_train[trial_feats], y_train)
        score = r2_score(y_train, model.predict(X1_train[trial_feats]))
        step_scores[feat] = score

    # True argmax over the candidates (ties go to the earliest column), so the
    # pick no longer depends on whether a better candidate happened to beat the
    # running leader by more than the tolerance.
    best_feat  = max(step_scores, key=step_scores.get)
    best_score = step_scores[best_feat]

    # Stop once the best available gain is not above the threshold. Written as
    # `not (a > b)` so a NaN score also stops the search instead of being kept.
    if not best_score > best_R2 + MIN_R2_GAIN:
        break

    selected.append(best_feat)
    remaining.remove(best_feat)
    best_R2 = best_score

# 3️⃣  Fit final model on the chosen subset ------------------------------------
final_model = LinearRegression().fit(X1_train[selected], y_train)

train_R2 = r2_score(y_train, final_model.predict(X1_train[selected]))
test_R2  = r2_score(y_test,  final_model.predict(X1_test[selected]))

# 4️⃣  Results -----------------------------------------------------------------
print("Selected features (in order added):")
for i, feat in enumerate(selected, 1):
    print(f"{i:2d}. {feat}")

print(f"\nFinal train R²: {train_R2:.3f}")
print(f"Final test  R²: {test_R2:.3f}")


Selected features (in order added):
 1. intentional_homicides
 2. expense%
 3. forest_land%
 4. individuals_using_internet%
 5. death_rate
 6. GDP_current_US
 7. research_and_development_expenditure%
 8. access_to_electricity%
 9. agricultural_land%
10. land_area
11. voice_and_accountability_estimate
12. military_expenditure%
13. real_interest_rate
14. inflation_annual%
15. population_density
16. government_expenditure_on_education%
17. government_health_expenditure%
18. CO2_emisions
19. rural_population
20. other_greenhouse_emisions
21. life_expectancy_at_birth
22. population
23. renewvable_energy_consumption%
24. birth_rate
25. political_stability_estimate
26. regulatory_quality_estimate
27. control_of_corruption_estimate
28. rule_of_law_estimate
29. central_goverment_debt%
30. tax_revenue%
31. trade_in_services%

Final train R²: 0.637
Final test  R²: 0.400


In [2]:
# (rows, columns) of the X1 training frame loaded above
X1_train.shape

(813, 32)

#### X2

In [3]:
# 1️⃣  Load train / test splits ------------------------------------------------
X2_train = pd.read_csv('X2_train.csv')
X2_test  = pd.read_csv('X2_test.csv')
y_train = pd.read_csv('y_train.csv')['gini_next']
y_test  = pd.read_csv('y_test.csv')['gini_next']

# 2️⃣  Greedy forward feature selection ---------------------------------------
# Minimum train-R² gain a step must deliver to be accepted. This is the
# stopping rule only; it is NOT used when ranking candidates within a step.
MIN_R2_GAIN = 1e-4

remaining = list(X2_train.columns)
selected  = []
best_R2   = -np.inf          # so the first step always accepts its best candidate

# NOTE(review): selection is driven by *train* R², which cannot decrease when a
# predictor is added to an OLS fit, so MIN_R2_GAIN is the only brake on model
# size. Consider a cross-validated score if the train/test gap matters.
while remaining:
    # Score every remaining candidate when appended to the current subset.
    step_scores = {}
    for feat in remaining:
        trial_feats = selected + [feat]
        model = LinearRegression().fit(X2_train[trial_feats], y_train)
        score = r2_score(y_train, model.predict(X2_train[trial_feats]))
        step_scores[feat] = score

    # True argmax over the candidates (ties go to the earliest column), so the
    # pick no longer depends on whether a better candidate happened to beat the
    # running leader by more than the tolerance.
    best_feat  = max(step_scores, key=step_scores.get)
    best_score = step_scores[best_feat]

    # Stop once the best available gain is not above the threshold. Written as
    # `not (a > b)` so a NaN score also stops the search instead of being kept.
    if not best_score > best_R2 + MIN_R2_GAIN:
        break

    selected.append(best_feat)
    remaining.remove(best_feat)
    best_R2 = best_score

# 3️⃣  Fit final model on the chosen subset ------------------------------------
final_model = LinearRegression().fit(X2_train[selected], y_train)

train_R2 = r2_score(y_train, final_model.predict(X2_train[selected]))
test_R2  = r2_score(y_test,  final_model.predict(X2_test[selected]))

# 4️⃣  Results -----------------------------------------------------------------
print("Selected features (in order added):")
for i, feat in enumerate(selected, 1):
    print(f"{i:2d}. {feat}")

print(f"\nFinal train R²: {train_R2:.3f}")
print(f"Final test  R²: {test_R2:.3f}")


Selected features (in order added):
 1. intentional_homicides_rollmean
 2. forest_land%_rollstd
 3. expense%_rollmean
 4. intentional_homicides_rollstd
 5. individuals_using_internet%_rollmean
 6. forest_land%_rollmean
 7. death_rate
 8. GDP_current_US_rollmean
 9. research_and_development_expenditure%_rollmean
10. access_to_electricity%
11. voice_and_accountability_estimate_rollmean
12. real_interest_rate
13. military_expenditure%_rollmean
14. GDP_current_US_rollstd
15. population_rollstd
16. agricultural_land%_rollmean
17. inflation_annual%_rollstd
18. other_greenhouse_emisions
19. CO2_emisions
20. birth_rate_rollstd
21. political_stability_estimate_rollmean
22. rule_of_law_estimate_rollstd
23. political_stability_estimate_rollstd
24. central_goverment_debt%
25. life_expectancy_at_birth_rollstd
26. government_health_expenditure%
27. life_expectancy_at_birth_rollmean
28. government_expenditure_on_education%
29. birth_rate
30. birth_rate_rollmean
31. life_expectancy_at_birth
32. death_

In [4]:
# (rows, columns) of the X2 training frame loaded above
X2_train.shape

(813, 96)

#### X3

In [5]:
# 1️⃣  Load train / test splits ------------------------------------------------
X3_train = pd.read_csv('X3_train.csv')
X3_test  = pd.read_csv('X3_test.csv')
y_train = pd.read_csv('y_train.csv')['gini_next']
y_test  = pd.read_csv('y_test.csv')['gini_next']

# 2️⃣  Greedy forward feature selection ---------------------------------------
# Minimum train-R² gain a step must deliver to be accepted. This is the
# stopping rule only; it is NOT used when ranking candidates within a step.
MIN_R2_GAIN = 1e-4

remaining = list(X3_train.columns)
selected  = []
best_R2   = -np.inf          # so the first step always accepts its best candidate

# NOTE(review): selection is driven by *train* R², which cannot decrease when a
# predictor is added to an OLS fit, so MIN_R2_GAIN is the only brake on model
# size. Consider a cross-validated score if the train/test gap matters.
while remaining:
    # Score every remaining candidate when appended to the current subset.
    step_scores = {}
    for feat in remaining:
        trial_feats = selected + [feat]
        model = LinearRegression().fit(X3_train[trial_feats], y_train)
        score = r2_score(y_train, model.predict(X3_train[trial_feats]))
        step_scores[feat] = score

    # True argmax over the candidates (ties go to the earliest column), so the
    # pick no longer depends on whether a better candidate happened to beat the
    # running leader by more than the tolerance.
    best_feat  = max(step_scores, key=step_scores.get)
    best_score = step_scores[best_feat]

    # Stop once the best available gain is not above the threshold. Written as
    # `not (a > b)` so a NaN score also stops the search instead of being kept.
    if not best_score > best_R2 + MIN_R2_GAIN:
        break

    selected.append(best_feat)
    remaining.remove(best_feat)
    best_R2 = best_score

# 3️⃣  Fit final model on the chosen subset ------------------------------------
final_model = LinearRegression().fit(X3_train[selected], y_train)

train_R2 = r2_score(y_train, final_model.predict(X3_train[selected]))
test_R2  = r2_score(y_test,  final_model.predict(X3_test[selected]))

# 4️⃣  Results -----------------------------------------------------------------
print("Selected features (in order added):")
for i, feat in enumerate(selected, 1):
    print(f"{i:2d}. {feat}")

print(f"\nFinal train R²: {train_R2:.3f}")
print(f"Final test  R²: {test_R2:.3f}")


Selected features (in order added):
 1. intentional_homicides_lag4
 2. birth_rate_lag4
 3. forest_land%_lag4
 4. other_greenhouse_emisions
 5. research_and_development_expenditure%_lag4
 6. voice_and_accountability_estimate_lag4
 7. expense%_lag4
 8. rural_population
 9. death_rate_lag4
10. individuals_using_internet%_lag4
11. political_stability_estimate_lag4
12. access_to_electricity%_lag1
13. agricultural_land%_lag4
14. CO2_emisions_lag1
15. control_of_corruption_estimate_lag4
16. regulatory_quality_estimate
17. government_expenditure_on_education%
18. military_expenditure%_lag2
19. population_lag4
20. life_expectancy_at_birth_lag4
21. CO2_emisions_lag4
22. birth_rate
23. government_health_expenditure%
24. real_interest_rate
25. inflation_annual%
26. forest_land%_lag2
27. tax_revenue%_lag4
28. access_to_electricity%_lag2
29. life_expectancy_at_birth
30. death_rate
31. expense%_lag1
32. government_health_expenditure%_lag2
33. inflation_annual%_lag2
34. population_density_lag2
35. pop

In [6]:
# (rows, columns) of the X3 training frame loaded above
X3_train.shape

(813, 160)