In [3]:
import pandas as pd

masses_data = pd.read_csv('mammographic_masses.data.txt')
masses_data.head()

Unnamed: 0,5,67,3,5.1,3.1,1
0,4,43,1,1,?,1
1,5,58,4,5,3,1
2,4,28,1,1,3,0
3,5,74,1,5,?,1
4,4,65,1,?,3,0


In [4]:
masses_data = pd.read_csv('mammographic_masses.data.txt', na_values = ['?'], names = ['BI-RADS', 'Age', 'Shape', 'Margin', 
                                                                                     'Density', 'severity'])
masses_data.head()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [5]:
masses_data.loc[(masses_data['Age'].isnull()) |
              (masses_data['Shape'].isnull()) |
              (masses_data['Margin'].isnull()) |
              (masses_data['Density'].isnull())]

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,severity
1,4.0,43.0,1.0,1.0,,1
4,5.0,74.0,1.0,5.0,,1
5,4.0,65.0,1.0,,3.0,0
6,4.0,70.0,,,3.0,0
7,5.0,42.0,1.0,,3.0,0
...,...,...,...,...,...,...
778,4.0,60.0,,4.0,3.0,0
819,4.0,35.0,3.0,,2.0,0
824,6.0,40.0,,3.0,4.0,1
884,5.0,,4.0,4.0,3.0,1


In [6]:
masses_data.dropna(inplace = True)
masses_data.head()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,severity
0,5.0,67.0,3.0,5.0,3.0,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
8,5.0,57.0,1.0,5.0,3.0,1
10,5.0,76.0,1.0,4.0,3.0,1


In [7]:
masses_data.describe()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [8]:
all_features = masses_data[['Age', 'Shape', 'Margin','Density']].values
all_classes = masses_data['severity'].values

features = ['Age', 'Shape', 'Margin', 'Density']

all_features

array([[67.,  3.,  5.,  3.],
       [58.,  4.,  5.,  3.],
       [28.,  1.,  1.,  3.],
       ...,
       [64.,  4.,  5.,  3.],
       [66.,  4.,  5.,  3.],
       [62.,  3.,  3.,  3.]])

In [9]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
all_features_scaled = scaler.fit_transform(all_features)
all_features_scaled

array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ...,
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]])

In [10]:
# Naive Bayes (MultinomialNB) with train_test)split#


scaler_2 = preprocessing.MinMaxScaler()
all_features_min_max = scaler_2.fit_transform(all_features)

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
np.random.seed(1234)

X_train, X_test, y_train, y_test = train_test_split(all_features_min_max, all_classes, test_size = 0.25, random_state=1)

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
nb = MultinomialNB()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
ac = accuracy_score(y_test, predictions)
print(ac)

0.7548076923076923


In [13]:
# Naive Bayes (MultinomialNB) with K-Fold Validation(K = 10)#

from sklearn.model_selection import cross_val_score
model = MultinomialNB()
scores = cross_val_score(model, all_features_min_max, all_classes, cv =10)
scores.mean()

0.7855421686746988

In [14]:
# SVC with linear kernel K-Fold Validation(K = 10)#

from sklearn import svm
from sklearn.model_selection import cross_val_score

model = svm.SVC(kernel = 'linear', C=1.0)

scores = cross_val_score(model, all_features_scaled, all_classes, cv = 10)
scores.mean()

0.7975903614457832

In [15]:
# SVC with linear kernel train_test_split#
# Here all_features and all_features_scaled produce similar results#

from sklearn import svm
from sklearn.model_selection import cross_val_score

X_train, X_test, y_train, y_test = train_test_split(all_features_scaled, all_classes, test_size = 0.25, random_state=1)

model = svm.SVC(kernel = 'linear', C=1.0).fit(X_train, y_train)
predictions = model.predict(X_test)
ac = accuracy_score(y_test, predictions)
print(ac)

0.7692307692307693


In [16]:
# SVC with rbf kernel K-Fold Validation(K = 10)#

from sklearn import svm
from sklearn.model_selection import cross_val_score

model = svm.SVC(kernel = 'rbf', C=1.0)

scores = cross_val_score(model, all_features_scaled, all_classes, cv = 10)
scores.mean()

0.8012048192771084

In [17]:
# SVC with sigmoid kernel K-Fold Validation(K =10)#

from sklearn import svm
from sklearn.model_selection import cross_val_score

model = svm.SVC(kernel = 'sigmoid', C=1.0)

scores = cross_val_score(model, all_features_scaled, all_classes, cv = 10)
scores.mean()

0.7457831325301204

In [18]:
# SVC with poly kernel K-Fold Validation(K = 10)#

from sklearn import svm
from sklearn.model_selection import cross_val_score

model = svm.SVC(kernel = 'poly', C=1.0)

scores = cross_val_score(model, all_features_scaled, all_classes, cv = 10)
scores.mean()

0.7903614457831326

In [19]:
# Decision tree with K-Fold Validation(K = 10)#
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
scores = cross_val_score(model, all_features_scaled, all_classes, cv =10)
scores.mean()

0.7385542168674698

In [20]:
#Random Forest with K-Fold Validation(K = 10)#

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
scores = cross_val_score(model, all_features_scaled, all_classes, cv = 10)
scores.mean()

0.7662650602409639

In [21]:
# KNN(10 Neighbors) with train_test_split#

from sklearn import neighbors

X_train, X_test, y_train, y_test = train_test_split(all_features_scaled, all_classes, test_size = 0.25, random_state=1)
model = neighbors.KNeighborsClassifier(n_neighbors = 10).fit(X_train, y_train)
predictions = model.predict(X_test)
ac = accuracy_score(y_test, predictions)
print(ac)

0.7740384615384616


In [22]:
# KNN(10 Neighbors) with K-Fold Validation(K = 10)#

model = neighbors.KNeighborsClassifier(n_neighbors = 10)
scores = cross_val_score(model, all_features_scaled, all_classes, cv =10)
scores.mean()

0.7915662650602409

In [23]:
# KNN(Neighbors between 1 to 50) with K-Fold Validation(K = 10)#
import numpy as np
X =[]
for i in range(1, 50):
    model = neighbors.KNeighborsClassifier(n_neighbors = i)
    scores = cross_val_score(model, all_features_scaled, all_classes, cv =10)
    X.append([scores.mean()])
    print(i, scores.mean())
print(max(X))    

1 0.7325301204819278
2 0.6903614457831325
3 0.7542168674698796
4 0.7349397590361446
5 0.7710843373493976
6 0.7686746987951807
7 0.7951807228915662
8 0.7771084337349398
9 0.7903614457831326
10 0.7915662650602409
11 0.7891566265060241
12 0.783132530120482
13 0.7879518072289157
14 0.7867469879518072
15 0.7867469879518072
16 0.7831325301204819
17 0.7783132530120482
18 0.7783132530120482
19 0.7843373493975904
20 0.7855421686746988
21 0.7879518072289156
22 0.7855421686746988
23 0.7783132530120481
24 0.7783132530120482
25 0.7867469879518072
26 0.7855421686746988
27 0.7855421686746988
28 0.7867469879518072
29 0.7855421686746988
30 0.7903614457831325
31 0.7867469879518072
32 0.789156626506024
33 0.7879518072289156
34 0.7867469879518072
35 0.7831325301204819
36 0.7867469879518072
37 0.7843373493975904
38 0.7867469879518072
39 0.7819277108433734
40 0.7843373493975904
41 0.780722891566265
42 0.7819277108433734
43 0.780722891566265
44 0.7819277108433734
45 0.7831325301204819
46 0.7843373493975904
4

In [24]:
max_score = X[0]
position = 1

for i in range(1, 49):
    if(max_score > X[i]):
        max_score = max_score
        position = position
    else:
        max_score = X[i]
        position = i
        
print('MAX value of mean', max_score, 'Number of neighbors', position)        

MAX value of mean [0.7951807228915662] Number of neighbors 6


In [36]:
# Neural Network using K-Fold Validation #
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import cross_val_score

def model_creation():
    model = Sequential()
    model.add(Dense(4, input_dim = 4, kernel_initializer = 'normal', activation = 'relu'))
    #model.add(Dense(5, kernel_initializer = 'normal', activation = 'relu'))
    model.add(Dense(1, kernel_initializer = 'normal', activation = 'sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [37]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
estimator = KerasClassifier(build_fn = model_creation, epochs = 75, verbose = 0)
scores = cross_val_score(estimator, all_features_scaled, all_classes, cv =10)
scores.mean()

























0.8012048184871674

In [25]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
scores = cross_val_score(model, all_features_scaled, all_classes, cv =10)
scores.mean()

0.8072289156626505