In [3]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score

#importing the dataset
data = pd.read_csv('Update (1).csv')

#Before making anything like feature selection,feature extraction and classification, firstly we start with basic data analysis. Lets look at features of data.
data.head()  # head method show only first 5 rows

#. Pandas has a helpful select_dtypes function which we can use to build a new dataframe containing only the object columns.
obj_data = data.select_dtypes(include=['object']).copy()
obj_data.head()

#Input   [Encoding Categorical Values]
X = data.iloc[:,0:18].values 
print("\nX before making numerical: \n",X)


#taking careof categorical data
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X[:,2]= labelencoder_X.fit_transform(X[:,2])
X[:,3]= labelencoder_X.fit_transform(X[:,3])
X[:,5]= labelencoder_X.fit_transform(X[:,5])
X[:,7]= labelencoder_X.fit_transform(X[:,7])
X[:,8]= labelencoder_X.fit_transform(X[:,8])
X[:,12]= labelencoder_X.fit_transform(X[:,12])
X[:,15]= labelencoder_X.fit_transform(X[:,15])
print("\nX after making numerical: \n",X,"\n")

df = pd.DataFrame(X, columns = ['id', 'diagnosis', 'Invoice Date', 'Date of birth', 'Invoice No', 'Gender', 'Test Name', 'Age',	'Delivery Date', 'Department', 'Sample', 'Contact number', 'patient name', 'Unit', 'Reference Value', 'Address', 'Test Attribute', 'Result'])


# 1) There is an id that cannot be used for classificaiton 2) Diagnosis is our class label)
#Therefore, drop these unnecessary features.
# feature names as a list
col = df.columns       # .columns gives columns names in data 
print(col)

# y includes our labels and x includes our features
y = df.diagnosis
list = ['id','diagnosis']
x = df.drop(list,axis = 1 )
x.head()


#1) Feature selection with correlation and random forest classification

drop_list1 = ['Invoice No','Invoice Date','Test Name','Delivery Date','Department','Unit','Reference Value','Test Attribute']
x_1 = x.drop(drop_list1,axis = 1 )        # do not modify x, we will use it later 
p = []
p.append(x_1.columns.tolist())
x_1.head()



data_dia = y
data = x
data_n_2 = (data - data.mean()) / (data.std())              # standardization
data = pd.concat([y,data_n_2.iloc[:,0:18]],axis=1)
data = pd.melt(data,id_vars="diagnosis",
                    var_name="features",
                    value_name='value')


# split data train 70 % and test 30 %
x_train, x_test, y_train, y_test = train_test_split(x_1, y, test_size=0.3, random_state=42)

#random forest classifier with n_estimators
clf_rf = RandomForestClassifier(random_state=43)      
clr_rf = clf_rf.fit(x_train,y_train)



#2) Univariate feature selection and random forest classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# find best scored features
select_feature = SelectKBest(chi2, k=5).fit(x_train, y_train)

print('Score list:', select_feature.scores_)
print('Feature list:', x_train.columns)
p.append(x_train.columns.values.tolist())

scores = {}
for u, v in zip(select_feature.scores_, x_train.columns.values.tolist()):
    scores[v] = u
scores




#3) Recursive feature elimination (RFE) with random forest

from sklearn.feature_selection import RFE
# Create the RFE object and rank each pixel
clf_rf_1 = RandomForestClassifier()      
rfe = RFE(estimator=clf_rf_1, n_features_to_select=5, step=1)
rfe = rfe.fit(x_train, y_train)


print('Chosen best 5 feature by rfe:',x_train.columns[rfe.support_])


p.append(x_train.columns[rfe.support_].values.tolist())




#4) Recursive feature elimination with cross validation and random forest classification

#Now we will not only find best features but we also find how many features do we need.
from sklearn.feature_selection import RFECV

clf_rf_2 = RandomForestClassifier() 
rfecv = RFECV(estimator=clf_rf_2, step=1, cv=5,scoring='accuracy')   #5-fold cross-validation
rfecv = rfecv.fit(x_train, y_train)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train.columns[rfecv.support_])

p.append(x_train.columns[rfecv.support_].values.tolist())


p



result = set(p[0])
for s in p[1:]:
    result.intersection_update(s)
print("Effective Feature: ",result)

output = []
for s in result:
    output.append((s, scores[s]))
output = sorted(output, reverse=True, key=lambda tup: tup[1])
output

for i, v in enumerate(output):
    print(str(i + 1) + ' :' + v[0])


X before making numerical: 
 [[111 'M' '1/1/2018' '22/1/1990' 900 'M' 555 28 '8/1/2018' 909 776 112233
  'akkas' 123 111 'ctg' 7765 1122]
 [112 'B' '2/1/2018' '20/9/1989' 901 'F' 501 29 '9/1/2018' 991 667 990077
  'nusrat' 321 123 'dhaka' 4788 1456]
 [113 'M' '3/1/2018' '1/11/1993' 902 'M' 502 19 '10/1/2018' 992 555 123456
  'kalam' 132 124 'Korimpur' 6754 5532]
 [114 'M' '3/1/2018' '2/11/1990' 903 'M' 503 23 '10/1/2018' 993 777 880965
  'robiul' 120 246 'birampur' 7754 5322]
 [115 'M' '4/1/2018' '22/9/1988' 904 'M' 504 24 '11/1/2018' 994 445 235765
  'jamal' 123 467 'syria' 6754 6543]
 [116 'B' '4/1/2018' '31/12/1982' 905 'F' 505 26 '11/1/2018' 995 336 345865
  'kamal' 564 756 'agrabad' 8534 6433]
 [117 'B' '22/2/2018' '15/9/1993' 906 'M' 506 40 '28/2/2018' 996 365 349065
  'nizam' 546 456 'gec' 7643 7898]
 [118 'B' '25/2/2018' '26/5/1997' 907 'M' 507 41 '28/2/2018' 997 334 357852
  'amin' 789 797 'halisohor' 7543 7946]
 [119 'M' '3/3/2018' '11/2/1978' 908 'M' 508 43 '13/3/2018' 998 



Optimal number of features : 8
Best features : Index(['Date of birth', 'Gender', 'Age', 'Sample', 'Contact number',
       'patient name', 'Address', 'Result'],
      dtype='object')
Effective Feature:  {'patient name', 'Gender', 'Result', 'Sample', 'Contact number'}
1 :Contact number
2 :Result
3 :Sample
4 :patient name
5 :Gender
