In [2]:
import numpy as np
import pandas as pd

In [13]:
df = pd.read_csv('data.csv')

## EDA

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [9]:
df['Unnamed: 32'].nunique() 

0

In [11]:
# Seems like this column only has Nan values, So lets just drop it. Also id column has no role in the analysis so lets just 
# drop it too.    

In [14]:
df.drop(['Unnamed: 32','id'],axis=1,inplace = True)

In [18]:
df.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


## Training Model

In [19]:
X = df.drop('diagnosis',axis=1)

In [60]:
# Lets get the dummy variables for the diagnosis feature.

In [65]:
Y=pd.get_dummies(df['diagnosis'])

In [66]:
Y.head(1) 

Unnamed: 0,B,M
0,0,1


In [67]:
Y.drop('B',axis=1,inplace=True)  #Malignant cells are cancerous,hence 1 target variable here means the cells are cancerous.

In [91]:
Y.head(2)

Unnamed: 0,M
0,1
1,1


In [96]:
Y=Y.values

In [113]:
Y=np.ravel(Y)

### Splitting the data into training and testing data.

In [116]:
from sklearn.model_selection import train_test_split

In [117]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=.3)

### Trainging the SVM model.

In [118]:
from sklearn.svm import SVC

In [119]:
model = SVC()

In [120]:
model.fit(X_train,y_train)

SVC()

In [121]:
predictions = model.predict(X_test)

### Testing the performance of model.

In [122]:
from sklearn.metrics import classification_report,confusion_matrix

In [123]:
print(confusion_matrix(predictions,y_test))

[[100  15]
 [  2  54]]


In [124]:
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

           0       0.98      0.87      0.92       115
           1       0.78      0.96      0.86        56

    accuracy                           0.90       171
   macro avg       0.88      0.92      0.89       171
weighted avg       0.92      0.90      0.90       171



### Lets do some hyperparameter tuning.

In [125]:
from sklearn.model_selection import GridSearchCV

In [126]:
param_grid = {'C':[0.1,1,10,100,1000],'gamma':[1,0.1,.01,.001,.0001]}

In [127]:
grid = GridSearchCV(SVC(),param_grid,verbose=3)

In [128]:
grid.fit(X_train,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ....................C=0.1, gamma=1;, score=0.637 total time=   0.0s
[CV 2/5] END ....................C=0.1, gamma=1;, score=0.637 total time=   0.0s
[CV 3/5] END ....................C=0.1, gamma=1;, score=0.637 total time=   0.0s
[CV 4/5] END ....................C=0.1, gamma=1;, score=0.646 total time=   0.0s
[CV 5/5] END ....................C=0.1, gamma=1;, score=0.646 total time=   0.0s
[CV 1/5] END ..................C=0.1, gamma=0.1;, score=0.637 total time=   0.0s
[CV 2/5] END ..................C=0.1, gamma=0.1;, score=0.637 total time=   0.0s
[CV 3/5] END ..................C=0.1, gamma=0.1;, score=0.637 total time=   0.0s
[CV 4/5] END ..................C=0.1, gamma=0.1;, score=0.646 total time=   0.0s
[CV 5/5] END ..................C=0.1, gamma=0.1;, score=0.646 total time=   0.0s
[CV 1/5] END .................C=0.1, gamma=0.01;, score=0.637 total time=   0.0s
[CV 2/5] END .................C=0.1, gamma=0.01

[CV 4/5] END ...................C=1000, gamma=1;, score=0.646 total time=   0.0s
[CV 5/5] END ...................C=1000, gamma=1;, score=0.646 total time=   0.0s
[CV 1/5] END .................C=1000, gamma=0.1;, score=0.637 total time=   0.0s
[CV 2/5] END .................C=1000, gamma=0.1;, score=0.637 total time=   0.0s
[CV 3/5] END .................C=1000, gamma=0.1;, score=0.637 total time=   0.0s
[CV 4/5] END .................C=1000, gamma=0.1;, score=0.646 total time=   0.0s
[CV 5/5] END .................C=1000, gamma=0.1;, score=0.646 total time=   0.0s
[CV 1/5] END ................C=1000, gamma=0.01;, score=0.650 total time=   0.0s
[CV 2/5] END ................C=1000, gamma=0.01;, score=0.637 total time=   0.0s
[CV 3/5] END ................C=1000, gamma=0.01;, score=0.650 total time=   0.0s
[CV 4/5] END ................C=1000, gamma=0.01;, score=0.671 total time=   0.0s
[CV 5/5] END ................C=1000, gamma=0.01;, score=0.646 total time=   0.0s
[CV 1/5] END ...............

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
             verbose=3)

In [129]:
predictions2=grid.predict(X_test)

### Testing performance after tuning

In [130]:
print(confusion_matrix(predictions2,y_test))

[[96  7]
 [ 6 62]]


In [131]:
print(classification_report(predictions2,y_test))

              precision    recall  f1-score   support

           0       0.94      0.93      0.94       103
           1       0.90      0.91      0.91        68

    accuracy                           0.92       171
   macro avg       0.92      0.92      0.92       171
weighted avg       0.92      0.92      0.92       171

