In [15]:
import pandas as pd

# Every input file is read with the same options: Latin-1 encoding and ';' as
# the field separator.
csv_format = {'encoding': 'latin-1', 'sep': ';'}

education_data = pd.read_csv('education_data.csv', **csv_format)
GDP_data = pd.read_csv('GDP_data.csv', **csv_format)
health_data = pd.read_csv('health_data.csv', **csv_format)
HPI_data = pd.read_csv('HPI_data.csv', **csv_format)
income_data = pd.read_csv('income_data.csv', **csv_format)
unemployment_data = pd.read_csv('unemployment_data.csv', **csv_format)

In [16]:
import warnings
warnings.filterwarnings("ignore")

In [17]:
def _join_on_fips(left, right):
    """Merge two tables on 'FIPS code'.

    Column names present in both tables keep their name for the left-hand
    copy and receive a '_y' suffix for the right-hand copy.
    """
    return left.merge(right, on='FIPS code', suffixes=('', '_y'))


# Chain the six tables together one join at a time.
part1 = _join_on_fips(education_data, GDP_data)
part2 = _join_on_fips(part1, health_data)
part3 = _join_on_fips(part2, HPI_data)
part4 = _join_on_fips(part3, income_data)
part4 = _join_on_fips(part4, unemployment_data)

# Remove every column whose name ends in '_y', i.e. the suffixed right-hand
# duplicates accumulated by the joins above.
suffixed_columns = part4.filter(regex='_y$').columns
part4 = part4.drop(columns=suffixed_columns)

In [18]:
# Lift pandas' display limits (row count, column count, line width) by setting
# each of them to None.
for option_name in ("display.max_rows", "display.max_columns", "display.width"):
    pd.set_option(option_name, None)

In [19]:
# Throw out the columns that are repeated and that get in the way of running
# the code, then discard every row that still contains a missing value.
redundant_columns = [
    'State', 'Area name',
    'State Abbreviation', 'Name',
    'State Abbreviation.1', 'Name.1',
    'State Abbreviation.2', 'Name.2',
    'State Abbreviation.3', 'Name.3',
    'State Abbreviation.4', 'Name.4',
    'GeoName', 'Area_name', 'County',
]

part4 = part4.drop(columns=redundant_columns).dropna()

In [33]:
# Target variable; every other remaining column of part4 becomes a predictor.
target_column = 'Med_HH_Income_Percent_of_State_Total_2020'

# NOTE(review): the merge key 'FIPS code' is never dropped, so it is still among
# the predictors in X -- confirm that this is intended.
# Both objects get a fresh 0..n-1 index (rows were removed by dropna earlier).
X = part4.drop(columns=[target_column]).reset_index(drop=True)
y = part4[target_column].reset_index(drop=True)

In [21]:
# print((part4.head(10)).to_string(index=False))

## 1. RandomForestRegressor

In [22]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Fit a random forest on all predictors and rank the columns by the forest's
# feature importances. random_state is fixed so the ranking is reproducible.
model = RandomForestRegressor(random_state=0)
model.fit(X, y)

importance = model.feature_importances_
indices = np.argsort(importance)[::-1]  # column positions, most important first

n_top_features = 10  # how many of the highest-ranked features to report

print("Selected features:")
# Slicing instead of range(10) keeps the report working (and simply shorter)
# when X has fewer than n_top_features columns.
for position in indices[:n_top_features]:
    print(importance[position], end='\t')
    print(X.columns[position])

Selected features:
0.6639427948574054	Median_Household_Income_2020
0.0271346219503002	Adult smoking raw value 2017
0.022485546580966246	Adult smoking raw value 2020
0.012577261040564619	Poor mental health days raw value 2020
0.008809795114338301	Median household income raw value 2016
0.007516037789665894	2004 HPI Change
0.007476492227683948	Poor physical health days raw value 2020
0.006551182765879246	2002 HPI Change
0.005108172451307501	Uninsured children raw value 2018
0.004977163352188068	Frequent physical distress raw value 2020


## 2. RFE - recursive feature elimination

In [24]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression


# Recursive feature elimination around a plain linear regression, stopping
# once n_features_to_select features are left.
# NOTE(review): RFE ranks features by the estimator's coefficients and X is
# passed in unscaled here, so the ranking depends on each column's units --
# consider standardizing X first; confirm against the intended methodology.
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=10)

rfe.fit(X, y)

# The count is fixed by n_features_to_select above; nothing is tuned here, so
# it is reported as the number selected rather than as an "optimal" number.
print(f"Number of selected features: {rfe.n_features_}")

selected_features = X.columns[rfe.support_]

print("Selected features:")
for feature in selected_features:
    print(feature)

Optimal number of features: 10
Selected features:
Primary care physicians raw value 2016
Dentists raw value 2016
Other primary care providers raw value 2016
Other primary care providers raw value 2017
Dentists raw value 2018
Other primary care providers raw value 2018
Dentists raw value 2019
Other primary care providers raw value 2019
Dentists raw value 2020
Mental health providers raw value 2020


## 3. Lasso - linear model trained with L1 prior as regularizer

In [26]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# L1-regularised linear regression; max_iter is raised to 15000 iterations.
lasso = Lasso(max_iter=15000)
lasso.fit(X, y)

# prefit=True makes SelectFromModel reuse the model fitted above instead of
# cloning it and running the very same fit on (X, y) a second time.
# Features whose absolute coefficient reaches the 0.1 threshold are kept.
sfm = SelectFromModel(lasso, threshold=0.1, prefit=True)

selected_feat = X.columns[sfm.get_support()]
print("Selected features:")
print(selected_feat)

Selected features:
Index(['Percent of adults with less than a high school diploma, 1970',
       'Percent of adults with a high school diploma only, 1980',
       'Percent of adults with a bachelor's degree or higher, 2008-12',
       'Percent of adults with a bachelor's degree or higher, 2017-21',
       '2001 HPI Change', '2002 HPI Change', '2004 HPI Change',
       '2005 HPI Change', '2006 HPI Change', 'Unemployment_rate_2013'],
      dtype='object')


## 4. SelectKBest - removes all but the highest scoring features

In [28]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler


# Standardise the predictors, then keep the ten columns with the highest
# f_regression score against y.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

selector = SelectKBest(f_regression, k=10).fit(X_scaled, y)

# The boolean support mask is positionally aligned with X's columns.
kept_mask = selector.get_support()
selected_features = X.columns[kept_mask]
print("Selected features:")
print(selected_features)

Selected features:
Index(['Percent of adults with a bachelor's degree or higher, 2000',
       'Percent of adults with a bachelor's degree or higher, 2008-12',
       'Percent of adults with a bachelor's degree or higher, 2017-21',
       'Median household income raw value 2016',
       'Children in poverty raw value 2016', 'Some college raw value 2016',
       'Premature death raw value 2016',
       'Premature age-adjusted mortality raw value 2016',
       'Unemployment_rate_2001', 'Median_Household_Income_2020'],
      dtype='object')


## 5. VarianceThreshold - removes all features whose variance doesn’t meet some threshold

In [34]:
from sklearn.feature_selection import VarianceThreshold
# Variance cutoff used to filter out low-variance columns.
variance_cutoff = .8 * (1 - .8)
sel = VarianceThreshold(threshold=variance_cutoff)
X_new = sel.fit_transform(X)

# Re-attach the names of the surviving columns to the filtered array.
kept_columns = X.columns[sel.get_support()]
X_new_df = pd.DataFrame(X_new, columns=kept_columns)
print(X_new_df.head())  # displays the dataset with the columns removed

   FIPS code  2003 Rural-urban Continuum Code  2003 Urban Influence Code  \
0     1001.0                              2.0                        2.0   
1     1003.0                              4.0                        5.0   
2     1005.0                              6.0                        6.0   
3     1009.0                              1.0                        1.0   
4     1013.0                              6.0                        6.0   

   2013 Rural-urban Continuum Code  2013 Urban Influence Code  \
0                              2.0                        2.0   
1                              3.0                        2.0   
2                              6.0                        6.0   
3                              1.0                        1.0   
4                              6.0                        6.0   

   Less than a high school diploma, 1970  High school diploma only, 1970  \
0                                 6611.0                          3757.0   


## 6. RidgeCV - ridge regression with built-in cross-validation

In [32]:
from sklearn.linear_model import RidgeCV

# Ridge regression; alpha is chosen among the listed values by 5-fold
# cross-validation.
ridgecv = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5)
ridgecv.fit(X, y)

n_top_features = 10  # how many of the strongest coefficients to report

# Rank by absolute coefficient so that strongly negative features are reported
# too; sorting on the signed value would only ever surface positive ones.
ranked = sorted(
    zip(ridgecv.coef_, X.columns),
    key=lambda pair: abs(pair[0]),
    reverse=True,
)
# Exactly-zero coefficients carry no information and are skipped.
nonzero = [(coef, feature) for coef, feature in ranked if coef != 0]

print("Selected features:")
# Slicing stops after the top entries instead of walking the whole ranking.
for coef, feature in nonzero[:n_top_features]:
    print("{:.3f}\t{}".format(coef, feature))

Selected features:
37.706	Adult smoking raw value 2016
37.254	Excessive drinking raw value 2020
34.698	Adult smoking raw value 2020
32.514	Adult smoking raw value 2017
24.650	Low birthweight raw value 2016
20.451	Children in poverty raw value 2018
20.389	Uninsured adults raw value 2016
19.768	Uninsured adults raw value 2018
19.356	Frequent physical distress raw value 2017
17.726	Physical inactivity raw value 2016
