In [35]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #Matplotlib is a plotting library for the Python programming language
from sklearn.model_selection import train_test_split #For Spliting Dataset
#The goal of ensemble methods is to combine the predictions of several base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix
#In multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.
from sklearn.metrics import accuracy_score


In [36]:

# Read the raw records from the CSV export.
data = pd.read_csv('Update (1).csv')

# Peek at the first five rows. This is not the last expression of the cell,
# so the notebook does not render it.
data.head()

# Independent copy holding only the object-dtype columns of the raw frame.
obj_data = data.select_dtypes(include=['object']).copy()
obj_data.head()   # likewise not rendered

# Input matrix: the first 18 columns of the frame as a NumPy array.
X = data.iloc[:, :18].values
print("\nX before making numerical: \n",X)


# Replace the values of selected columns by integer codes.
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()

# Positions inside X of the columns to encode. According to the column names
# assigned below these are: Invoice Date, Date of birth, Gender, Age,
# Delivery Date, patient name and Address.
# NOTE(review): 'Age' (index 7) prints as a plain number before encoding;
# confirm that turning it into a label code is really intended.
# The single encoder is refitted for every column, so after the loop it only
# remembers the classes of the last column processed.
for col_idx in (2, 3, 5, 7, 8, 12, 15):
    X[:, col_idx] = labelencoder_X.fit_transform(X[:, col_idx])

# Show the fully numerical matrix.
print("\nX after making numerical: \n",X,"\n")

# Going through .values dropped the column names, so attach them again.
column_names = [
    'id', 'diagnosis', 'Invoice Date', 'Date of birth', 'Invoice No',
    'Gender', 'Test Name', 'Age', 'Delivery Date', 'Department', 'Sample',
    'Contact number', 'patient name', 'Unit', 'Reference Value', 'Address',
    'Test Attribute', 'Result',
]
df = pd.DataFrame(X, columns=column_names)



X before making numerical: 
 [[111 'M' '1/1/2018' '22/1/1990' 900 'M' 555 28 '8/1/2018' 909 776 112233
  'akkas' 123 111 'ctg' 7765 1122]
 [112 'B' '2/1/2018' '20/9/1989' 901 'F' 501 29 '9/1/2018' 991 667 990077
  'nusrat' 321 123 'dhaka' 4788 1456]
 [113 'M' '3/1/2018' '1/11/1993' 902 'M' 502 19 '10/1/2018' 992 555
  123456 'kalam' 132 124 'Korimpur' 6754 5532]
 [114 'M' '3/1/2018' '2/11/1990' 903 'M' 503 23 '10/1/2018' 993 777
  880965 'robiul' 120 246 'birampur' 7754 5322]
 [115 'M' '4/1/2018' '22/9/1988' 904 'M' 504 24 '11/1/2018' 994 445
  235765 'jamal' 123 467 'syria' 6754 6543]
 [116 'B' '4/1/2018' '31/12/1982' 905 'F' 505 26 '11/1/2018' 995 336
  345865 'kamal' 564 756 'agrabad' 8534 6433]
 [117 'B' '22/2/2018' '15/9/1993' 906 'M' 506 40 '28/2/2018' 996 365
  349065 'nizam' 546 456 'gec' 7643 7898]
 [118 'B' '25/2/2018' '26/5/1997' 907 'M' 507 41 '28/2/2018' 997 334
  357852 'amin' 789 797 'halisohor' 7543 7946]
 [119 'M' '3/3/2018' '11/2/1978' 908 'M' 508 43 '13/3/2018' 998 

In [37]:
# Separate the class label from the features.
#   * 'id' is only a record identifier and carries no signal for classification.
#   * 'diagnosis' is the class label itself.
# Both are therefore removed from the feature frame.

# All column names of the encoded frame.
col = df.columns
print(col)

# y holds the labels, x holds the features.
y = df.diagnosis
# Deliberately NOT named `list`: that would shadow the built-in `list` type
# for every cell executed afterwards in the same kernel.
non_feature_cols = ['id', 'diagnosis']
x = df.drop(columns=non_feature_cols)
x.head()


Index(['id', 'diagnosis', 'Invoice Date', 'Date of birth', 'Invoice No',
       'Gender', 'Test Name', 'Age', 'Delivery Date', 'Department', 'Sample',
       'Contact number', 'patient name', 'Unit', 'Reference Value', 'Address',
       'Test Attribute', 'Result'],
      dtype='object')


Unnamed: 0,Invoice Date,Date of birth,Invoice No,Gender,Test Name,Age,Delivery Date,Department,Sample,Contact number,patient name,Unit,Reference Value,Address,Test Attribute,Result
0,0,8,900,1,555,8,7,909,776,112233,1,123,111,4,7765,1122
1,4,7,901,0,501,9,8,991,667,990077,10,321,123,5,4788,1456
2,7,0,902,1,502,4,0,992,555,123456,6,132,124,0,6754,5532
3,7,6,903,1,503,5,0,993,777,880965,13,120,246,3,7754,5322
4,9,9,904,1,504,6,1,994,445,235765,5,123,467,13,6754,6543


In [38]:
# 1) Hand-picked feature selection followed by a random forest fit.
# The columns below are removed manually (the author cites domain knowledge).
drop_list1 = [
    'Invoice No', 'Invoice Date', 'Test Name', 'Delivery Date',
    'Department', 'Unit', 'Reference Value', 'Test Attribute',
]
# drop() returns a new frame, so x stays untouched for the later cells.
x_1 = x.drop(columns=drop_list1)

# p collects one list of feature names per selection method; the first entry
# is the set of columns that survived the manual drop.
p = [x_1.columns.tolist()]
# First five rows (not rendered: this is not the cell's last expression).
x_1.head()

# Keep aliases of the labels and of the full feature frame.
# Note that `data` no longer refers to the raw CSV frame after this line.
data_dia, data = y, x


# Hold out 30 % of the rows for testing, train on the remaining 70 %.
x_train, x_test, y_train, y_test = train_test_split(
    x_1, y, test_size=0.3, random_state=42
)

# Random forest with a fixed seed and otherwise default settings.
clf_rf = RandomForestClassifier(random_state=43)
# Train on the training split; the value returned by fit() is kept as clr_rf.
clr_rf = clf_rf.fit(x_train, y_train)


In [39]:
# 2) Univariate feature selection (chi-squared scores)

# SelectKBest keeps the k features with the highest score for a given
# scoring function; chi2 is the scoring function used here.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Score every training feature against the label and keep the 5 best.
select_feature = SelectKBest(chi2, k=5).fit(x_train, y_train)
# One score per candidate feature, in the order of x_train.columns.
print('Score list:', select_feature.scores_)
# Names of all candidate features (same order as the scores above).
print('Feature list:', x_train.columns)

# Record only the features SelectKBest actually kept. Appending every
# training column instead would make this method a no-op when the per-method
# lists in p are intersected later.
p.append(x_train.columns[select_feature.get_support()].tolist())

# Feature name -> chi-squared score, for all candidates (not just the
# selected ones), so later cells can look up the score of any feature.
scores = dict(zip(x_train.columns.tolist(), select_feature.scores_))
# Display the mapping as the cell result.
scores

Score list: [6.75000000e-01 7.14285714e-01 2.70750000e+00 9.65217160e+01
 2.28528451e+05 2.75625000e+00 1.36290323e-01 1.70545910e+04]
Feature list: Index(['Date of birth', 'Gender', 'Age', 'Sample', 'Contact number',
       'patient name', 'Address', 'Result'],
      dtype='object')


{'Date of birth': 0.675,
 'Gender': 0.7142857142857137,
 'Age': 2.7075000000000022,
 'Sample': 96.52171603677228,
 'Contact number': 228528.45051245467,
 'patient name': 2.75625,
 'Address': 0.13629032258064525,
 'Result': 17054.590964434476}

In [40]:
# 3) Recursive feature elimination (RFE) with random forest, repeated 100
#    times; each run votes for the 5 features it keeps.
from sklearn.feature_selection import RFE

# Vote counter per feature. The keys come from x, but RFE is fitted on
# x_train only, so columns that are absent from x_train stay at 0.
counter1 = dict.fromkeys(x.columns, 0)

for i in range(100):
    # Seed each forest with the iteration index: the 100 runs still differ
    # from one another, but the vote counts are now identical every time the
    # cell is re-executed.
    clf_rf_1 = RandomForestClassifier(random_state=i)
    # RFE repeatedly fits the estimator and removes the least important
    # feature (step=1) until n_features_to_select features remain.
    rfe = RFE(estimator=clf_rf_1, n_features_to_select=5, step=1)
    rfe = rfe.fit(x_train, y_train)
    # Names of the features that survived this run.
    selected_feature = x_train.columns[rfe.support_].values.tolist()

    # One vote for every surviving feature.
    for f in selected_feature:
        counter1[f] += 1

# Sort by number of votes, most-voted feature first (descending order).
s1 = sorted(counter1.items(), key=lambda kv: kv[1], reverse=True)
print(s1)
# Record the 5 most-voted feature names as this method's result. The
# comprehension variable is not called `x`, to avoid confusion with the
# feature frame `x`.
p.append([name for name, _votes in s1[:5]])

[('Contact number', 89), ('patient name', 79), ('Result', 73), ('Age', 72), ('Sample', 58), ('Address', 58), ('Date of birth', 41), ('Gender', 30), ('Invoice Date', 0), ('Invoice No', 0), ('Test Name', 0), ('Delivery Date', 0), ('Department', 0), ('Unit', 0), ('Reference Value', 0), ('Test Attribute', 0)]


In [41]:
# 4) Recursive feature elimination with cross-validation (RFECV) and random
#    forest, repeated 100 times. Unlike plain RFE, RFECV also decides how many
#    features to keep, so the number of votes cast per run can vary.
from sklearn.feature_selection import RFECV

# Vote counter per feature. The keys come from x, but RFECV is fitted on
# x_train only, so columns that are absent from x_train stay at 0.
counter2 = dict.fromkeys(x.columns, 0)

for i in range(100):
    # Seed each forest with the iteration index: the 100 runs still differ
    # from one another, but the vote counts are now identical every time the
    # cell is re-executed.
    clf_rf_2 = RandomForestClassifier(random_state=i)
    # Remove one feature per step; candidate subsets are compared by
    # accuracy under 5-fold cross-validation.
    rfecv = RFECV(estimator=clf_rf_2, step=1, cv=5, scoring='accuracy')
    rfecv = rfecv.fit(x_train, y_train)
    # Names of the features that survived this run.
    selected_feature = x_train.columns[rfecv.support_].values.tolist()

    # One vote for every surviving feature.
    for f in selected_feature:
        counter2[f] += 1

# Sort by number of votes, most-voted feature first (descending order).
s2 = sorted(counter2.items(), key=lambda kv: kv[1], reverse=True)
print(s2)
# Record the 5 most-voted feature names as this method's result. The
# comprehension variable is not called `x`, to avoid confusion with the
# feature frame `x`.
p.append([name for name, _votes in s2[:5]])









[('Contact number', 90), ('patient name', 86), ('Result', 80), ('Address', 78), ('Date of birth', 76), ('Sample', 76), ('Age', 72), ('Gender', 63), ('Invoice Date', 0), ('Invoice No', 0), ('Test Name', 0), ('Delivery Date', 0), ('Department', 0), ('Unit', 0), ('Reference Value', 0), ('Test Attribute', 0)]


In [42]:
# Features agreed on by every selection method: take the first method's list
# as a set and intersect it with the lists of all remaining methods at once.
result = set(p[0]).intersection(*p[1:])
print("Effective Feature: ",result)

Effective Feature:  {'Result', 'Contact number', 'patient name'}


In [43]:
# Pair every common feature with its score from the `scores` mapping and
# order the pairs from the highest score to the lowest.
output = sorted(
    ((feature, scores[feature]) for feature in result),
    key=lambda pair: pair[1],
    reverse=True,
)
# Display the ranked (feature, score) pairs as the cell result.
output

[('Contact number', 228528.45051245467),
 ('Result', 17054.590964434476),
 ('patient name', 2.75625)]

In [44]:
# Print the ranked features as a numbered list, counting from 1.
for rank, (feature, _score) in enumerate(output, start=1):
    print(str(rank) + ' :' + feature)

1 :Contact number
2 :Result
3 :patient name
