In [2]:
import pandas as pd
import numpy as np
# Location of the Pima Indians diabetes CSV.
data = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'

# Column labels supplied on load via `names=`; the last one ('class') is the target.
features = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]

df = pd.read_csv(data, names=features)
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(768, 9)

In [9]:
## Preparing the Data
# Convert the frame to a NumPy array and split it column-wise: columns 0-7
# become the predictor matrix X, column 8 (the 'class' label) becomes Y.
# Note: `data` held the CSV URL until now; from here on it is the value array.
data = df.values
X, Y = data[:, :8], data[:, 8]

In [10]:
## Filter Method
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [12]:
# Filter step: score every column of X against Y with the chi-squared
# statistic and keep the 4 highest-scoring columns.
chi_best = SelectKBest(chi2, k=4)
k_best = chi_best.fit(X, Y)  # k_best is the fitted selector used below

# Print the per-feature scores with three decimal places.
# set_printoptions adjusts NumPy's print settings for the rest of the session.
np.set_printoptions(precision=3)
print("Summarize scores: \n", k_best.scores_)

# Reduce X to the selected columns and preview the first five rows.
k_features = k_best.transform(X)
print("Summarize selected features: \n", k_features[:5])

Summarize scores: 
 [ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]
Summarize selected features: 
 [[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]


# Wrapper Method

### Recursive Feature Elimination (RFE)

In [13]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import warnings
# Silence every warning category for the remainder of the session.
# NOTE(review): this is a blanket filter, so any warnings raised by the model
# fits further down are hidden as well - consider narrowing it to a category.
warnings.simplefilter('ignore')

In [15]:
# Wrapper step: recursive feature elimination around a logistic regression,
# stopping once 3 features remain.
# NOTE(review): warnings are silenced earlier in the notebook, so a solver
# convergence warning (if one occurs) would not be shown here - confirm the fit.
model_lr = LogisticRegression()
recur_fe = RFE(estimator=model_lr, n_features_to_select=3)
Feature = recur_fe.fit(X, Y)

# Report how many features were kept, the boolean keep-mask, and the ranking.
for template, value in (
    ("Number of Features: %s", Feature.n_features_),
    ("Selected Features are: %s", Feature.support_),
    ("Feature Ranking is as follows: %s", Feature.ranking_),
):
    print(template % (value,))

Number of Features: 3
Selected Features are: [ True False False False False  True  True False]
Feature Ranking is as follows: [1 2 4 5 6 1 1 3]


# Embedded Method: Ridge Regression / L2 Regularization

In [16]:
from sklearn.linear_model import Ridge

In [17]:
# Fit a ridge (L2-penalised) linear model to X and Y; alpha is the penalty strength.
# The bare fit() call is the cell's last expression, so the fitted estimator's
# repr is what the cell displays.
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X,Y)

Ridge()

In [18]:
# Helper that formats model coefficients as a readable linear expression

def print_coefs(coef, names = None, sort = False):
    """Format coefficients as a string such as ``"0.5 * X0 + -0.25 * X1"``.

    Despite its name, this function returns the string; it does not print.

    Parameters
    ----------
    coef : sequence of numbers
        Coefficient values; each one is rounded to 3 decimals in the output.
    names : sequence, optional
        One label per coefficient (list, NumPy array, pandas Index, ...).
        When omitted, labels ``X0, X1, ...`` are generated. Coefficients and
        labels are paired with ``zip``, so surplus entries on either side are
        ignored.
    sort : bool, default False
        If true, terms are ordered by decreasing absolute coefficient.

    Returns
    -------
    str
        The terms ``"<coef> * <name>"`` joined by ``" + "``.
    """
    # Identity check, not `== None`: for array-like names `==` is evaluated
    # element-wise and the result cannot be used as a single truth value.
    if names is None:
        names = ["X%s" % x for x in range(len(coef))]
    lst = zip(coef, names)
    if sort:
        # Negated magnitude gives largest-first ordering.
        lst = sorted(lst, key = lambda x: -np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coefs, 3), name) for  coefs, name in lst)

In [19]:
# Render the fitted ridge coefficients as a linear expression and show it.
ridge_formula = print_coefs(ridge_reg.coef_)
print("Ridge model:", ridge_formula)

Ridge model: 0.021 * X0 + 0.006 * X1 + -0.002 * X2 + 0.0 * X3 + -0.0 * X4 + 0.013 * X5 + 0.145 * X6 + 0.003 * X7


### Feature Splitting in Feature Engineering

In [20]:
# Load the football incidents table from a path relative to the working directory.
# This rebinds `df`, replacing the diabetes frame loaded earlier in the notebook.
df = pd.read_csv('data/football_deaths.csv')

In [27]:
# Preview the last rows of the frame.
# NOTE(review): this cell's execution count (27) is later than that of the
# name-splitting cell below (25), so the player_name values displayed here are
# already reduced to first names; on a fresh top-to-bottom run they would not be.
df.tail()

Unnamed: 0,row_id,incident_date,player_name,player_country,team_country,player_age,player_team_name,incident_description,heart_related,cardiac_related,collapsed,lightning,collision
224,225,2021-09-04,Jens,Belgium,Netherlands,27.0,FCC Filosoof,"Drama on Dutch football field, amateur player ...",0,0,1,0,0
225,226,2021-09-25,Guillermo,Venezuela,Venezuela,31.0,Camaguán FC,In the quarterfinals of the third division tou...,0,1,1,0,0
226,227,2021-10-01,Bruno,Germany,Germany,15.0,FC An der Fahner Höhe,Young goalkeeper and amateur angler Bruno Stei...,0,0,0,0,0
227,228,2021-10-08,Benoît,France,France,49.0,SC Massay,With minutes to go before the end of the game...,0,0,1,0,0
228,229,2021-10-15,Christophe,France,France,54.0,AS Saint Yves,"Christophe Ramassamy, a -year-old footballer, ...",1,0,1,0,0


In [25]:
import pandas as pd
import numpy as np
# Feature splitting: keep only the first space-separated token of each name.
# Indexing through the .str accessor leaves missing names as NaN, whereas
# indexing inside a mapped lambda raises TypeError on a NaN entry.
df['player_name'] = df['player_name'].str.split(" ").str[0]

In [26]:
# Display the column to check the result of the first-name split.
df['player_name']

0         William
1           James
2            John
3          Walter
4           Teddy
          ...    
224          Jens
225     Guillermo
226         Bruno
227        Benoît
228    Christophe
Name: player_name, Length: 229, dtype: object