In [756]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import svm

In [775]:
tumor = pd.read_csv('https://raw.githubusercontent.com/AndrewH707/BrainTumorDetection/main/primary-tumor.csv')

In [776]:
tumor.isnull().sum()

age                  0
sex                  1
histologic-type     65
degree-of-diffe    139
bone                 0
bone-marrow          0
lung                 0
pleura               0
peritoneum           0
liver                0
brain                0
skin                 1
neck                 0
supraclavicular      0
axillar              1
mediastinum          0
abdominal            0
class                0
dtype: int64

In [777]:
column1 = tumor['degree-of-diffe'].fillna('base') #instead of dropping na, we can fill with a base
tumor['degree-of-diffe'] = column1
column2 = tumor['histologic-type'].fillna('unknown')
tumor['histologic-type'] = column2
column3 = tumor['axillar'].fillna(0)
tumor['axillar'] = column3
column4 = tumor['skin'].fillna(0)
tumor['skin'] = column3

In [778]:
tumor['bone'].replace({True: 1, False: 0}, inplace=True)
tumor['bone-marrow'].replace({True: 1, False: 0}, inplace=True)
tumor['lung'].replace({True: 1, False: 0}, inplace=True)
tumor['pleura'].replace({True: 1, False: 0}, inplace=True)
tumor['peritoneum'].replace({True: 1, False: 0}, inplace=True)
tumor['liver'].replace({True: 1, False: 0}, inplace=True)
tumor['brain'].replace({True: 1, False: 0}, inplace=True)
tumor['skin'].replace({True: 1, False: 0}, inplace=True)
tumor['neck'].replace({True: 1, False: 0}, inplace=True)
tumor['supraclavicular'].replace({True: 1, False: 0}, inplace=True)
tumor['axillar'].replace({True: 1, False: 0}, inplace=True)
tumor['mediastinum'].replace({True: 1, False: 0}, inplace=True)
tumor['abdominal'].replace({True: 1, False: 0}, inplace=True)

In [779]:
(tumor.groupby('class').count()).get('age')

class
bladder               2
breast               18
cervix uteri          2
colon                14
corpus uteri          6
duoden and sm.int     1
esophagus             9
gallbladder          15
head and neck        14
kidney               23
liver                 7
lung                 81
ovary                19
pancreas             28
prostate              8
rectum                6
salivary glands       2
stomach              38
testis                1
thyroid              14
vagina                1
Name: age, dtype: int64

In [780]:
tumor['class'].replace({'bladder': 0, 'breast': 1, 'cervix uteri': 2, 'colon': 3, 'corpus uteri': 4, 'duoden and sm.int': 5, 'esophagus': 6, 'gallbladder': 7, 'head and neck': 8, 'kidney': 9, 'liver': 10, 'lung': 11, 'ovary': 12, 'pancreas': 13, 'prostate': 14, 'rectum': 15, 'salivary glands': 16, 'stomach': 17, 'testis': 18, 'thyroid': 19, 'vagina': 20}, inplace=True)

tumor['age'].replace({'<30': 2, '30-59': 1, '>=60': 0}, inplace=True)

tumor['histologic-type'].replace({'unknown': 0, 'adefalse': 1, 'epidermoid': 2, 'anaplastic': 3}, inplace=True)

tumor['sex'].replace({'male': 0, 'female': 1}, inplace=True)

tumor['degree-of-diffe'].replace({'base': 0, 'poorly': 3, 'fairly': 2, 'well': 1}, inplace=True)

In [781]:
tumor = tumor.dropna()

In [782]:
tumor.isnull().sum()

age                0
sex                0
histologic-type    0
degree-of-diffe    0
bone               0
bone-marrow        0
lung               0
pleura             0
peritoneum         0
liver              0
brain              0
skin               0
neck               0
supraclavicular    0
axillar            0
mediastinum        0
abdominal          0
class              0
dtype: int64

In [783]:
tumor_features = ['age', 'sex', 'histologic-type', 'degree-of-diffe', 'bone', 'bone-marrow', 'lung', 'pleura', 'peritoneum', 'liver', 'brain', 'skin', 'neck', 'supraclavicular', 'axillar', 'mediastinum', 'abdominal']

In [784]:
X = tumor.loc[:,tumor_features].values
Y = tumor.loc[:,['class']]

In [785]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size = 0.3, random_state = 1)
clf_ob1 = svm.SVC(kernel= 'linear', C=1).fit(X_train, np.ravel(Y_train))
print(clf_ob1.score(X_test, Y_test))

0.3763440860215054


In [786]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size = 0.2, random_state = 1)
clf_ob2 = svm.SVC(kernel= 'linear', C=1).fit(X_train, np.ravel(Y_train))
print(clf_ob2.score(X_test, Y_test))

0.41935483870967744


In [787]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size = 0.4, random_state = 1)
clf_ob3 = svm.SVC(kernel= 'linear', C=1).fit(X_train, np.ravel(Y_train))
print(clf_ob3.score(X_test, Y_test))

0.3548387096774194


In [788]:
# Remove classes that are too little
tumor_kfold = tumor[tumor['class'] != 20]
tumor_kfold = tumor_kfold[tumor_kfold['class'] != 18]
tumor_kfold = tumor_kfold[tumor_kfold['class'] != 16]
tumor_kfold = tumor_kfold[tumor_kfold['class'] != 5]
tumor_kfold = tumor_kfold[tumor_kfold['class'] != 2]
tumor_kfold = tumor_kfold[tumor_kfold['class'] != 0]


In [789]:
tumor_kfold.groupby('class').count().get('age')

class
1     18
3     14
4      6
6      9
7     15
8     14
9     23
10     7
11    81
12    19
13    28
14     8
15     6
17    38
19    13
Name: age, dtype: int64

In [790]:
X = tumor_kfold.loc[:,tumor_features].values
Y = tumor_kfold.loc[:,['class']]

In [791]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size = 0.2, random_state = 1)
clf_ob4 = svm.SVC(kernel= 'linear', C=1).fit(X_train, np.ravel(Y_train))
print(clf_ob4.score(X_test, Y_test))

0.4166666666666667


In [792]:
scores_res = model_selection.cross_val_score(clf_ob4, X, np.ravel(Y), cv=5)
scores_res

array([0.43333333, 0.36666667, 0.45      , 0.35      , 0.37288136])