In [None]:
## IMPORT LIBRARIES

In [None]:
import os
import pandas as pd
#importing libraries
from sklearn.datasets import load_boston
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from scipy import stats

## Reading Csv
Feature Subset Selection must be performed on three files. Unfortunately, you have to change the file name manually and re-run the notebook for each of them: I tried to automate the whole process, but some of the libraries used for feature selection are not very stable, so you will sometimes have to restart the kernel and re-run only that specific part. The names of the three files are listed below. For the file-level CSVs the process is automatic, as you can see at the end of this notebook.

In [None]:
#Neo4j-Unified_class.csv
#mct-1.7b1-Unified_class.csv
#Elasticsearch-0.90.11-Unified_class.csv

In [None]:
# Load the class-level metrics CSV for the project under analysis (one of the
# three file names listed in the previous cell) and show its column names.
a = pd.read_csv('../data/mct-1.7b1-Unified_class.csv')
a.columns

In [None]:
# Identifier / source-location columns; they are not used as features.
pp = [
    'ID', 'Type', 'Name', 'LongName', 'Parent', 'Component', 'Path',
    'Line', 'Column', 'EndLine', 'EndColumn',
]

# Candidate features on which Feature Subset Selection (FSS) is performed.
# The order matters: the correlation filters keep the earlier column of a pair.
fea = [
    'CC', 'CCL', 'CCO', 'CI', 'CLC', 'CLLC', 'LDC', 'LLDC', 'LCOM5', 'NL',
    'NLE', 'WMC', 'CBO', 'CBOI', 'NII', 'NOI', 'RFC', 'AD', 'CD', 'CLOC',
    'DLOC', 'PDA', 'PUA', 'TCD', 'TCLOC', 'DIT', 'NOA', 'NOD', 'NOP', 'LLOC',
    'LOC', 'NA', 'NG', 'NLA', 'NLG', 'NLM', 'NLPA', 'NLPM', 'NLS', 'NM',
    'NOS', 'NPA', 'NPM', 'NS', 'TLLOC', 'TLOC', 'TNA', 'TNG', 'TNLA', 'TNLG',
    'TNLM', 'TNLPA', 'TNLPM', 'TNLS', 'TNM', 'TNOS', 'TNPA', 'TNPM', 'TNS',
    'bug',
]

# Target column, kept as a one-element list so it can be concatenated with `fea`.
target = ['NOC']

In [None]:
# Fraction of missing values per column, largest first.
a.isna().mean().sort_values(ascending=False)

### Descriptive Statistics

In [None]:
# Summary statistics of the candidate features, transposed so that each feature is a row.
a[fea].describe().T

## Normalize Target

In [None]:
def NormalizeData(data):
    """Min-max scale ``data`` to the range [0, 100].

    Parameters
    ----------
    data : array-like supporting arithmetic (e.g. ``numpy.ndarray`` or ``pandas.Series``)
        Values to rescale.

    Returns
    -------
    Same container type as ``data``, with the minimum mapped to 0 and the
    maximum mapped to 100. If every value is identical, all zeros are returned
    instead of the NaN/inf that a division by a zero range would produce.
    """
    lowest = np.min(data)
    span = np.max(data) - lowest
    if span == 0:
        # Constant input: avoid 0/0. Multiplying by 0.0 keeps the container type
        # and yields float zeros, like the regular branch.
        return (data - lowest) * 0.0
    return (data - lowest) / span * 100

In [None]:
# Rescale the NOC column with NormalizeData, then derive a 0/1 indicator from it.
a['NOC'] = NormalizeData(a['NOC'])
tgt = a['NOC'].gt(0) * 1  # 1 where the rescaled NOC is positive, 0 otherwise
tgt.value_counts() #tgt represent the dependent variable (target)

## Clean and save the Original dataset

In [None]:
# Display the candidate-feature columns of the loaded data.
a[fea]

In [None]:
# Write the candidate features plus the target column to disk, then read the
# file back to check the written format.
totcols = fea + target
totdf = a.loc[:, totcols]
totdf.to_csv('./Datasets/OriginalDataset_MCT_class.csv', sep=';', index=False)
pd.read_csv('./Datasets/OriginalDataset_MCT_class.csv', sep=';')

## Features Subset Selection

Feature selection is a process where you automatically select those features in your data that contribute most to the prediction variable or output in which you are interested.

Having irrelevant features in your data can decrease the accuracy of many models, especially linear algorithms like linear and logistic regression.

Three benefits of performing feature selection before modeling your data are:

- Reduces Overfitting: Less redundant data means less opportunity to make decisions based on noise.
- Improves Accuracy: Less misleading data means modeling accuracy improves.
- Reduces Training Time: Less data means that algorithms train faster.

We have to choose the optimal subset using the following algorithms:
- Linear Correlation
- Rank Correlation
- One R
- Relief (Filter Method)
- Consistency 

### Linear Correlation

In [None]:
# Pairwise linear (Pearson) correlation between the candidate features.
cor = a[fea].corr(method='pearson')
cor

In [None]:
# Heatmap of the Pearson correlation matrix.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(cor, annot=False, cmap=plt.cm.Reds, ax=ax)
plt.show()

In [None]:
# Greedy correlation filter over the upper triangle of the Pearson matrix:
# feature j is dropped when any earlier feature i (i < j) is strongly correlated
# with it. The absolute value is used so that strong NEGATIVE correlations are
# filtered as well, consistent with the rank-correlation filter further down.
strong = np.triu(cor.abs().values >= 0.55, k=1)  # strong[i, j] can only be True for i < j
cols = ~strong.any(axis=0)                        # True = keep feature j
selected_columns_lcorr = a[fea].columns[cols]
data = a[selected_columns_lcorr]

In [None]:
# Number of features kept by the linear-correlation filter.
len(selected_columns_lcorr)

In [None]:
# Names of the features kept by the linear-correlation filter.
selected_columns_lcorr

In [None]:
# Target column name; same value as assigned in the column-definition cell, repeated here.
target = ['NOC']

In [None]:
# Write the linearly-uncorrelated features plus the target column to disk.
totcols = list(selected_columns_lcorr) + target
totdf = a.loc[:, totcols]
totdf.to_csv('./Datasets/LinearCorr_MCT_class.csv', sep=';', index=False)
pd.read_csv('./Datasets/LinearCorr_MCT_class.csv', sep=';') #just to check the current format

### Rank Correlation

In [None]:
# Rank correlation: pairwise Kendall correlation between the candidate features.
kcorr = a[fea].corr(method = 'kendall')

In [None]:
# Heatmap of the Kendall rank-correlation matrix.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(kcorr, annot=False, cmap=plt.cm.Reds, ax=ax)
plt.show()

In [None]:
# Greedy rank-correlation filter: feature j is dropped when any earlier feature i
# (i < j) has an absolute Kendall correlation of at least 0.55 with it.
strong_r = np.triu(kcorr.abs().values >= 0.55, k=1)  # only pairs with i < j
cols_r = ~strong_r.any(axis=0)                        # True = keep feature j
selected_columns_rankcorr = a[fea].columns[cols_r]
data = a[selected_columns_rankcorr]

In [None]:
# Names of the features kept by the rank-correlation filter.
selected_columns_rankcorr 

In [None]:
# Number of features kept by the rank-correlation filter.
len(selected_columns_rankcorr)

In [None]:
# Write the rank-uncorrelated features plus the target column to disk.
# The file is labelled MCT because the data loaded at the top of the notebook is
# the MCT project; the previous 'NEO4j' label would have mislabelled this output.
# Update the label together with the input file name when switching project.
totcols = [*selected_columns_rankcorr, *target]
totdf = a[totcols]
totdf.to_csv('./Datasets/RankCorr_MCT_class.csv', sep =';', index = False)
pd.read_csv('./Datasets/RankCorr_MCT_class.csv', sep =';') #just to check the current format

### OneR
"OneR" stands for One Rule (by Robert Holte [1]), which is a classic algorithm for supervised learning. Note that this algorithm is not known for its good prediction performance; thus, it is rather recommended for teaching purposes and for lower-bound performance baselines in real-world applications.

The name "OneRule" can be a bit misleading, because it is technically about "one feature" and not about "one rule." I.e., OneR returns a feature for which one or more decision rules are defined. Essentially, as a simple classifier, it finds exactly one feature (and one or more feature values for that feature) to classify data instances.

The basic procedure is as follows:

For each feature among all features (columns) in the dataset:
For each feature value for the given feature:
- Obtain the training examples with that feature value.
- Obtain the class labels (and class label counts) corresponding to the training examples identified in the previous step.

- Regard the class label with the highest frequency (count) as the majority class.
- Record the number of errors as the number of training examples that have the given feature value but are not the majority class.
- Compute the error of the feature by summing the errors for all possible feature values for that feature.
- Return the best feature, which is defined as the feature with the lowest error.

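source : http://rasbt.github.io/mlxtend/user_guide/classifier/OneRClassifier/

Note: the cells below do not run a OneR classifier. They rank the features by the feature importances of an XGBoost classifier and keep the 15 highest-ranked ones as the subset for this section.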

In [None]:
# xgboost for feature importance on a classification problem
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from matplotlib import pyplot

In [None]:
# Display of the rank-correlation selection from the previous section; the
# feature-importance cells below work on a[fea], not on this selection.
selected_columns_rankcorr 

In [None]:
# define the model
model = XGBClassifier()
# fit the model
model.fit(a[fea].values, a['NOC'].values)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.figure(figsize = (19,9))
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.show()

In [None]:
# Map each candidate feature name to its importance score (same order as a[fea]).
bb = dict(zip(a[fea].columns, importance))

In [None]:
# Order the (feature, score) pairs from most to least important, collect the
# feature names in that order, and print them one per line.
sort_orders = sorted(bb.items(), key=lambda pair: pair[1], reverse=True)
oneR_fea = [pair[0] for pair in sort_orders]
for feature_name in oneR_fea:
    print(feature_name)

In [None]:
# (feature, importance) pairs, sorted by decreasing importance.
sort_orders

In [None]:
# Keep the 15 highest-ranked features (same subset size as n_features=15 in the
# Relief and FRFS cells).
oneR_fea = oneR_fea[0:15]
oneR_fea

In [None]:
# Target column name; same value as assigned in the column-definition cell, repeated here.
target = ['NOC']

In [None]:
# Selected features plus target; the length is the number of columns that will be saved.
totcols = [*oneR_fea, *target]
len(totcols)

In [None]:
# Write the top-ranked features plus the target column to disk.
# The file is labelled MCT because the data loaded at the top of the notebook is
# the MCT project; the previous 'NEO4j' label would have mislabelled this output.
# Update the label together with the input file name when switching project.
totcols = [*oneR_fea, *target]
totdf = a[totcols]
totdf.to_csv('./Datasets/OneR_MCT_class.csv', sep =';', index = False)
pd.read_csv('./Datasets/OneR_MCT_class.csv', sep =';') #just to check the current format

### Relief
Relief is an algorithm developed by Kira and Rendell in 1992 that takes a filter-method approach to feature selection that is notably sensitive to feature interactions.
It was originally designed for application to binary classification problems with discrete or numerical features. Relief calculates a feature score for each feature which can then be applied to rank and select top scoring features for feature selection. Alternatively, these scores may be applied as feature weights to guide downstream modeling. Relief feature scoring is based on the identification of feature value differences between nearest neighbor instance pairs.
https://www.sciencedirect.com/science/article/pii/S1532046418301400


Criticism: Relief (Kira & Rendell, 1992) does not use a feature-set evaluation function, and it does not even perform a search in the feature-set space.

In [None]:
#!pip install sklearn_relief

In [None]:
import sklearn_relief as sr
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Fit RReliefF on the candidate features against the NOC column and keep 15 features.
# fit_transform returns a plain array without column names, which is why the
# feature names have to be recovered separately in a later cell.
# NOTE(review): no random seed is set here; if RReliefF samples instances
# randomly the selection may differ between runs - confirm.
relief_vars = []
r = sr.RReliefF(n_features = 15)
ff = r.fit_transform(a[fea].values, a['NOC'].values)
relief_df = pd.DataFrame(ff)

In [None]:
# Values of the columns returned by RReliefF (columns are unnamed at this point).
relief_df

In [None]:
# Recover the names of the features RReliefF kept. Each column of relief_df is
# matched by value against the candidate feature columns of `a`.
#   * only the candidate features are searched (not ID/location/target columns);
#   * NaN is treated as equal to NaN, so missing values do not prevent a match;
#   * a name already assigned is skipped, so two identical feature columns are
#     reported as two different names instead of the same name twice.
relief_vars = []
candidates = a[fea].values.astype(float)
for k in range(relief_df.shape[1]):
    kept = relief_df.iloc[:, k].values.astype(float)
    same = (candidates == kept[:, None]) | (np.isnan(candidates) & np.isnan(kept)[:, None])
    matches = [name for name, hit in zip(fea, same.all(axis=0))
               if hit and name not in relief_vars]
    if not matches:
        # Previously this surfaced as a cryptic "empty sequence" error.
        raise ValueError(
            'Relief output column %d does not match any candidate feature column' % k
        )
    relief_vars.append(matches[0])

In [None]:
# Number of feature names recovered for the Relief selection.
len(relief_vars)

In [None]:
# Write the Relief-selected features plus the target column to disk.
# The file is labelled MCT because the data loaded at the top of the notebook is
# the MCT project; the previous 'NEO4J' label would have mislabelled this output.
# Update the label together with the input file name when switching project.
target = ['NOC']
totcols = [*relief_vars, *target]
totdf = a[totcols]
totdf.to_csv('./Datasets/Relief_MCT_class.csv', sep =';', index = False)
pd.read_csv('./Datasets/Relief_MCT_class.csv', sep =';') #just to check the current format

### Consistency

Fuzzy Rough Feature Selection (FRFS) greedily selects features that induce the greatest increase in the size of the positive region, until it matches the size of the positive region with all features, or until the required number of features is selected.

The positive region is defined as the union of the lower approximations of the decision classes in X. Its size is the sum of its membership values.

The similarity relation $R_B$ for a given subset of attributes $B$ is obtained by aggregating with a t-norm the per-attribute similarities $R_a$ associated with the attributes $a$ in $B$. These are in turn defined, for any $x, y \in X$, as the complement of the difference between the attribute values $a(x)$ and $a(y)$ after rescaling by the sample standard deviation $\sigma_a$ (1).

$$R_a(x, y) = \max\left(1 - \frac{|a(x) - a(y)|}{\sigma_a},\ 0\right) \tag{1}$$
Paper: https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.645.151&rep=rep1&type=pdf

Library: https://github.com/oulenz/fuzzy-rough-learn

In [None]:
#!pip install fuzzy-rough-learn

In [None]:
from frlearn.base import select_class
from frlearn.feature_preprocessors import FRFS

In [None]:
# Number of rows in the loaded data (the next cell only uses the first 3000).
len(a)

In [None]:
# Fuzzy Rough Feature Selection on a subsample, followed by recovery of the
# selected feature names.
df_p = a.head(3000) #reduce df's dimension because it is computationally expensive
X_orig = df_p[fea].values
y = df_p['NOC'].values
preprocessor = FRFS(n_features=15)
model = preprocessor(X_orig, y)
X = model(X_orig)
cons_df = pd.DataFrame(X)

# The transformed array has no column names, so each of its columns is matched by
# value against the candidate feature columns of the subsample.
#   * NaN is treated as equal to NaN, so missing values do not prevent a match;
#   * a name already assigned is skipped, so two identical feature columns are
#     reported as two different names instead of the same name twice.
candidates = X_orig.astype(float)
cons_vars = []
for k in range(cons_df.shape[1]):
    kept = cons_df.iloc[:, k].values.astype(float)
    same = (candidates == kept[:, None]) | (np.isnan(candidates) & np.isnan(kept)[:, None])
    matches = [name for name, hit in zip(fea, same.all(axis=0))
               if hit and name not in cons_vars]
    if not matches:
        # Previously this surfaced as a cryptic "empty sequence" error.
        raise ValueError(
            'FRFS output column %d does not match any candidate feature column' % k
        )
    cons_vars.append(matches[0])

In [None]:
# Names of the features selected by FRFS.
cons_vars

In [None]:
# Target column name; same value as assigned in the column-definition cell, repeated here.
target = ['NOC']

In [None]:
# Write the FRFS-selected features plus the target column to disk. The features
# were selected on the first 3000 rows, but all rows of `a` are saved.
# The file is labelled MCT because the data loaded at the top of the notebook is
# the MCT project; the previous 'NEO4j' label would have mislabelled this output.
# Update the label together with the input file name when switching project.
totcols = [*cons_vars, *target]
totdf = a[totcols]
totdf.to_csv('./Datasets/Consistency_MCT_class.csv', sep =';', index = False)
pd.read_csv('./Datasets/Consistency_MCT_class.csv', sep =';') #just to check the current format

## Processing the file-level csv
These files contain fewer than 10 variables (some of them repeated), so it does not make sense to run Feature Subset Selection on them. Instead, I clean and fix each dataset so that it can be used directly in the next notebook.

In [None]:
# NOTE(review): this re-defines NormalizeData from the "Normalize Target"
# section; it is kept so that this section can be run on its own.
def NormalizeData(data):
    """Min-max scale ``data`` to the range [0, 100].

    Parameters
    ----------
    data : array-like supporting arithmetic (e.g. ``numpy.ndarray`` or ``pandas.Series``)
        Values to rescale.

    Returns
    -------
    Same container type as ``data``, with the minimum mapped to 0 and the
    maximum mapped to 100. If every value is identical, all zeros are returned
    instead of the NaN/inf that a division by a zero range would produce.
    """
    lowest = np.min(data)
    span = np.max(data) - lowest
    if span == 0:
        # Constant input: avoid 0/0. Multiplying by 0.0 keeps the container type
        # and yields float zeros, like the regular branch.
        return (data - lowest) * 0.0
    return (data - lowest) / span * 100

In [None]:
# Columns kept for every file-level dataset. They are the same for all files, so
# they are defined once, outside the loop.
fea = ['McCC', 'CLOC', 'PDA', 'PUA',
       'LLOC', 'LOC', 'Number of previous fixes',
       'Number of developer commits', 'Number of committers',
       'bug']
target = ['Number of previous modifications']
totcols = [*fea, *target]

# Process every file-level CSV in ../data/ (sorted for a reproducible order):
# rescale the target, keep the columns above and save the result.
for file in sorted(os.listdir('../data/')):
    if not file.endswith('file.csv'):
        continue
    print('Processing the file:  ',file)
    df = pd.read_csv(os.path.join('../data/', file))
    # Target normalization
    df['Number of previous modifications'] = NormalizeData(df['Number of previous modifications'])
    totdf = df[totcols]
    totdf.to_csv('./Datasets/OriginalDataset_' + file, sep =';', index = False)
    # Reported only after the file has actually been written.
    print('DONE')

In [None]:
# Sanity check: read back one of the file-level datasets written by the loop above
# (requires the Elasticsearch file-level CSV to be present in ../data/).
pd.read_csv('./Datasets/OriginalDataset_Elasticsearch-0.90.11-Unified_file.csv', sep =';')