# Feature Selection

In [1]:
import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

In [6]:
# Read the hotel-review helpfulness data (path is relative to the working directory).
hotel_rev_path = './HotelRevHelpfulness.csv'
hotel_rev = pd.read_csv(hotel_rev_path)

# Report (rows, columns), then show the first rows as the cell output.
print(hotel_rev.shape)
hotel_rev.head()

(486, 25)


Unnamed: 0,hotelId,aveHelpfulnessRatioUser,stdevHelpfulnessRatioUser,pcReviewsExceedMinHelpfulnessSupport,numReviewsUser,numReviewsHotel,ratingUser,numberSubRatingsUser,subRatingMeanUser,subRatingStdevUser,...,completeness_2,completeness_3,numberTermsEntry,percentageAlphaCharsEntry,fractionUpperCaseCharsEntry,fractionYouVsIEntry,numberTermsSummaryQuote,percentageAlphaCharsSummaryQuote,fractionUpperCaseCharsSummaryQuote,reviewHelpfulness
0,17420,1.0,0.0,0.666667,3,16,5,4,4.0,0.0,...,0,1,182,0.788474,0.025703,0.5,6,0.815789,0.096774,1
1,1397,0.772487,0.377321,0.5,12,233,5,0,0.0,0.0,...,0,0,158,0.791888,0.012594,0.5,1,1.0,0.083333,1
2,1348,0.715473,0.300437,0.833333,12,302,4,7,3.714286,0.755929,...,0,3,59,0.799639,0.024831,0.333333,4,0.828571,0.034483,0
3,5940,0.52125,0.481675,0.222222,36,6,1,4,1.0,0.0,...,0,0,95,0.782212,0.029155,0.5,2,0.8,0.0625,0
4,38,0.603175,0.246926,1.0,2,271,3,0,0.0,0.0,...,0,0,43,0.805128,0.028662,0.0,1,1.0,0.142857,0


In [7]:
# Split the frame into class labels and predictors.
# NOTE: both removals below mutate hotel_rev in place, so running this cell a
# second time on the same frame raises KeyError; reload the CSV first.
y = hotel_rev.pop('reviewHelpfulness').values
del hotel_rev['hotelId']  # dropped before X is built, so it is not used as a feature

# Every remaining column becomes one column of the feature matrix.
X = hotel_rev.values
hotel_rev.shape

(486, 23)

## Filter-based Feature Selection
### Feature Scoring - two methods  
1. Chi square statistic
2. Information Gain

In [8]:
# Chi-square statistic of every feature column in X against the labels y.
# The call yields two arrays: the scores and their p-values.
# The chi square scores for the 23 features
chi2_score, pval = chi2(X=X, y=y)
chi2_score

array([1.17643855e+00, 1.25630861e+00, 2.97935332e+00, 4.99241616e+02,
       2.09687468e+01, 4.63515335e+00, 1.60567118e+01, 2.34205853e+01,
       1.22221503e-01, 2.01609093e-01, 5.11397162e-01, 4.38593706e-02,
       4.29417810e-02, 1.10399662e+01, 7.01736499e+00, 1.50630136e+01,
       1.35095136e+03, 4.73150717e-03, 3.15477433e-03, 1.96131502e+00,
       2.26290946e-01, 5.28034618e-03, 9.95572153e-03])

In [9]:
# Mutual information ("information gain") of every feature column in X with
# the labels y. The estimator uses random numbers internally, so random_state
# is pinned; without it the scores differ from one run to the next and the
# comparison with the chi-square ranking is not reproducible.
i_scores = mutual_info_classif(X, y, random_state=42)
i_scores
# The i-gain scores for the 23 features

array([0.02466917, 0.01936157, 0.02568831, 0.0721619 , 0.0352665 ,
       0.        , 0.        , 0.06203968, 0.        , 0.04951721,
       0.0919241 , 0.0454919 , 0.05844492, 0.        , 0.01125881,
       0.03669793, 0.06337678, 0.        , 0.        , 0.02972636,
       0.        , 0.        , 0.02003942])

In [10]:
from scipy import stats

# Spearman rank correlation between the two score vectors: measures how
# similarly chi-square and information gain order the features.
rank_agreement = stats.spearmanr(chi2_score, i_scores)
rank_agreement

SpearmanrResult(correlation=0.3699115191605428, pvalue=0.0823272587249335)

In [11]:
# Pair every remaining column name of hotel_rev with its information-gain
# score (i_scores is in the same column order as X).
mi = dict(zip(hotel_rev.columns, i_scores))

# One "name :  score" line per feature, in column order.
for feature, score in mi.items():
    print(feature, ": ", score)

aveHelpfulnessRatioUser :  0.024669169780794364
stdevHelpfulnessRatioUser :  0.019361573490448425
pcReviewsExceedMinHelpfulnessSupport :  0.025688305598533967
numReviewsUser :  0.07216189792531846
numReviewsHotel :  0.03526649605775245
ratingUser :  0.0
numberSubRatingsUser :  0.0
subRatingMeanUser :  0.06203967966476287
subRatingStdevUser :  0.0
aveRatingUser :  0.049517207623075654
stdevRatingUser :  0.09192410222116343
aveRatingHotel :  0.04549189551656596
stdevRatingHotel :  0.05844491929607343
completeness_1 :  0.0
completeness_2 :  0.011258809851718121
completeness_3 :  0.036697928667075
numberTermsEntry :  0.0633767760182622
percentageAlphaCharsEntry :  0.0
fractionUpperCaseCharsEntry :  0.0
fractionYouVsIEntry :  0.02972636494876868
numberTermsSummaryQuote :  0.0
percentageAlphaCharsSummaryQuote :  0.0
fractionUpperCaseCharsSummaryQuote :  0.02003942068603992


#### Image Segmentation Data

In [None]:
import pandas as pd

# Read the image-segmentation data (path is relative to the working directory).
seg_data_path = 'segmentation-all.csv'
seg_data = pd.read_csv(seg_data_path)

# Report (rows, columns), then show the first rows as the cell output.
print(seg_data.shape)
seg_data.head()

In [None]:
# Take the 'Class' column out of seg_data (pop removes it in place) and use it
# as the label vector; the columns left behind form the feature matrix.
class_labels = seg_data.pop('Class')
y = class_labels.values
X = seg_data.values

### Feature Selection

In [None]:
# Tools for the filter experiment: hold-out split, k-best selection scored by
# mutual information, accuracy metric, and a Gaussian Naive Bayes classifier.
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
# Single classifier instance that is refitted for each feature subset.
# NOTE: despite the `mnb` name this is GaussianNB, not a multinomial model.
mnb = GaussianNB()

In [None]:
# Hold out half of the rows for testing; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=1/2)

# Score the features on the training half only, so the held-out half plays no
# part in the ranking. random_state is pinned because the estimator uses random
# numbers internally; unseeded, the scores (and possibly the ranking) would
# change on every run.
i_scores = mutual_info_classif(X_train, y_train, random_state=42)

# Feature name -> information-gain score, in the column order of seg_data.
mi = dict(zip(seg_data.columns, i_scores))

# One row per feature, highest information gain first. sort_values returns a
# new frame, so the cell never relies on a previously mutated df.
df = (
    pd.DataFrame.from_dict(mi, orient='index', columns=['I-Gain'])
      .sort_values(by='I-Gain', ascending=False)
)
df.head(10)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

n = len(df.index)
rr = range(1,n)
fig, ax = plt.subplots()
ax.bar(df.index, df["I-Gain"], label='I-Gain',width=.35)
ax.set_xticklabels(list(df.index), rotation = 90)
ax.set_xlabel('Features')
ax.set_ylabel('I-Gain')
ax.legend()

plt.show()

## Select *k* Best Features
We rank the features using information gain (strictly speaking, mutual information) and select the _k_ best to build a classifier.  
We iterate through increasing values of *k*.  
`SelectKBest` is a _transform_ that transforms the training data.


In [None]:
MI_SEED = 42  # fixed seed so every SelectKBest fit below scores the features identically


def seeded_mutual_info(features, target):
    """Score features with mutual_info_classif using a fixed random_state.

    Without the seed each SelectKBest fit re-draws the estimator's internal
    random numbers, so the subset chosen for k+1 need not contain the subset
    chosen for k.
    """
    return mutual_info_classif(features, target, random_state=MI_SEED)


# For k = 1 .. number of features: keep the k best-scoring features (ranked on
# the training half), fit Naive Bayes on them and measure hold-out accuracy.
acc_scores = []
for kk in range(1, X_train.shape[1] + 1):
    FS_trans = SelectKBest(seeded_mutual_info, k=kk).fit(X_train, y_train)
    X_tR_new = FS_trans.transform(X_train)
    X_tS_new = FS_trans.transform(X_test)
    seg_NB = mnb.fit(X_tR_new, y_train)  # fit returns the (refitted) estimator itself
    y_dash = seg_NB.predict(X_tS_new)
    acc_scores.append(accuracy_score(y_test, y_dash))

# acc_scores[i] is the accuracy with the i+1 best features. It is assigned to
# df positionally, so row i reads "accuracy when using the top i+1 features"
# only insofar as df's row order matches the ranking SelectKBest used.
df['Accuracy'] = acc_scores
df.head(10)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

n = len(df.index)
rr = range(1,n)
fig, ax = plt.subplots()
ax2 = ax.twinx()
ax.bar(df.index, df["I-Gain"], label='I-Gain',width=.35)
ax2.plot(df.index, df["Accuracy"], color='red', label='Accuracy')
ax.set_xticklabels(list(df.index), rotation = 90)
ax.set_xlabel('Features')
ax.set_ylabel('I-Gain')
ax2.set_ylabel('Accuracy')
ax.legend()

plt.show()

---
## Wrapper
Forward Sequential Search on Image Segmentation data.  
`scikit-learn` does not support Wrapper feature selection so we use `MLxtend`.  
http://rasbt.github.io/mlxtend/  
So you will probably need to install some libraries:  
`pip install mlxtend`  
`pip install joblib`

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# Re-read the segmentation data so this section starts from the complete
# frame, including the 'Class' column.
seg_data = pd.read_csv('segmentation-all.csv')

# k-nearest-neighbour classifier (k = 4) used as the estimator in this section.
knn = KNeighborsClassifier(n_neighbors=4)

print(seg_data.shape)
seg_data.head()

In [None]:
# Remove the 'Class' column from seg_data (in place) and keep it as the labels.
y = seg_data.pop('Class').values

# What is left are the features: remember their names, then take the values.
# Both come from the same frame, so names and columns of X share one order.
feature_names = seg_data.columns
X = seg_data.values

Run a forward sequential wrapper search to select 7 features using 10-fold cross-validation.  

In [None]:
# Forward (non-floating) sequential search: grow the feature subset up to 7
# features, scoring each candidate subset by 10-fold cross-validated accuracy
# of the knn classifier; n_jobs=-1 lets the evaluations run in parallel.
sfs_forward = SFS(knn,
                  k_features=7,
                  forward=True,
                  floating=False,
                  verbose=1,
                  scoring='accuracy',
                  cv=10, n_jobs=-1)

# Fit on the feature DataFrame itself (seg_data no longer contains 'Class')
# rather than on X plus custom_feature_names: the selector then takes the
# feature names from the column labels.
# NOTE(review): newer mlxtend releases reportedly no longer accept the
# `custom_feature_names` argument of fit -- confirm against the installed
# version; passing the DataFrame is expected to work either way.
sfs_forward = sfs_forward.fit(seg_data, y)

In [None]:
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

# Plot the per-step metrics recorded by the fitted selector, using the
# 'std_dev' variant of the plot.
metric_dict = sfs_forward.get_metric_dict()
fig1 = plot_sfs(metric_dict, ylabel='Accuracy', kind='std_dev')

plt.title('Sequential Forward Selection (w. StdDev)')
plt.ylim([0.5, 1])
plt.grid()
plt.show()

# Names of the features in the selected subset.
print(sfs_forward.k_feature_names_)