<h2> Import Libraries </h2>

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

<h2> Data Collection and Preprocessing </h2>

In [13]:
# Read the raw table from the CSV file in the notebook's working directory.
csv_path = 'data.csv'
dataset = pd.read_csv(csv_path)

In [14]:
# Show the table's dimensions as a (rows, columns) tuple.
dataset.shape

(569, 32)

In [15]:
# Preview the first rows of the table; 5 is head()'s default row count.
preview_rows = 5
dataset.head(preview_rows)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [16]:
# Count missing values per column (isna is the same check as isnull).
missing_mask = dataset.isna()
missing_mask.sum()

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [17]:
# Class balance: number of rows for each diagnosis label.
diagnosis_labels = dataset['diagnosis']
diagnosis_labels.value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64

In [21]:
# Encode the diagnosis labels numerically: 'M' -> 1 and 'B' -> 0.
#
# A per-value lookup is used instead of DataFrame.replace(..., inplace=True):
# replacing strings with ints relies on pandas silently downcasting the object
# column to an integer dtype, which pandas 2.2 flags with a FutureWarning.
# Casting explicitly keeps the column int64 regardless of that behaviour.
#
# Values that are not keys of the mapping pass through unchanged, so running
# the cell again on an already-encoded column is a no-op; astype raises
# ValueError if any label other than 'M'/'B' (or 1/0) is present.
diagnosis_codes = {'M': 1, 'B': 0}
dataset['diagnosis'] = (
    dataset['diagnosis']
    .map(lambda label: diagnosis_codes.get(label, label))
    .astype('int64')
)

<h4> Separating features and target values </h4>

In [22]:
# Feature table: every column except 'id' and the 'diagnosis' label.
# Target: the 'diagnosis' column on its own.
non_feature_columns = ['id', 'diagnosis']
X = dataset.drop(columns=non_feature_columns)
y = dataset['diagnosis']

In [23]:
# Plain-text dump of the feature table followed by the target series,
# one after the other on separate lines.
print(X, y, sep='\n')

     radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     compactness_mean  concavity_mean  concave points_mean  sym

<h4> Standardisation </h4>

In [24]:
# Standardise the features with StandardScaler.
#
# The scaler's statistics are learned from the training rows only, so the
# held-out rows do not leak into the preprocessing. The row selection below
# repeats the arguments of the train_test_split call in the "Train test Split"
# cell; with the same number of samples, stratify labels and random_state it
# selects the same rows, so the two calls must be kept in sync.
train_rows, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, stratify=y, random_state=2
)

# Work on a plain ndarray so the scaler is fitted and applied without column
# names; the prediction cell also hands it an ndarray.
X_values = np.asarray(X)
scalar = StandardScaler()
scalar.fit(X_values[train_rows])

# X is rebound to the scaled NumPy array; the later cells split and train on it.
# NOTE(review): because X is overwritten, re-running this cell would fit the
# scaler on already-scaled values -- re-run from the feature/target cell.
X = scalar.transform(X_values)

<h4> Train test Split </h4>

In [25]:
# Hold out 20% of the rows, stratified on y, with a fixed seed so the
# partition is repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=2)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, **split_options)

In [26]:
# Sanity check: shapes of the full, training and test feature arrays.
print(*(features.shape for features in (X, Xtrain, Xtest)))

(569, 30) (455, 30) (114, 30)


<h2> Model Training </h2>

In [27]:
# Logistic regression with default settings, fitted on the training split.
# fit() returns the estimator itself, so `model` is the fitted classifier.
model = LogisticRegression().fit(Xtrain, ytrain)
model  # keep the fitted estimator as the cell's displayed value

<h2> Model Evaluation </h2>

In [28]:
# Predicted labels for the rows the model was fitted on, then for the
# held-out rows.
trainingPrediction, testPrediction = (
    model.predict(features) for features in (Xtrain, Xtest)
)

<h4> Accuracy </h4>

In [32]:
# Accuracy (share of correctly labelled rows) on each split.
accuracy_inputs = (
    ('Training: ', ytrain, trainingPrediction),
    ('Testing: ', ytest, testPrediction),
)
for split_label, actual, predicted in accuracy_inputs:
    print(split_label, metrics.accuracy_score(actual, predicted))

Training:  0.989010989010989
Testing:  0.9649122807017544


<h4> Precision </h4>

In [31]:
# Precision on each split. precision_score treats label 1 as the positive
# class by default, i.e. the rows encoded from 'M'.
train_precision = metrics.precision_score(ytrain, trainingPrediction)
test_precision = metrics.precision_score(ytest, testPrediction)
print('Training: ', train_precision)
print('Testing: ', test_precision)

Training:  0.9940119760479041
Testing:  0.9523809523809523


<h2> Predictions </h2>

In [33]:
# Classify a single hand-entered sample.
#
# The 30 values are assumed to follow the column order of the feature table
# the scaler was fitted on -- TODO confirm against the data source.
# The tuple is deliberately not called `input`: that name would shadow
# Python's builtin input() for the rest of the kernel session.
sample_features = (9.042,18.9,60.07,244.5,0.09968,0.1972,0.1975,0.04908,0.233,0.08743,0.4653,1.911,3.769,24.2,0.009845,0.0659,0.1027,0.02527,0.03491,0.007877,10.06,23.4,68.62,297.1,0.1221,0.3748,0.4609,0.1145,0.3135,0.1055)

# Shape the sample as one row with one column per feature.
sample_row = np.asarray(sample_features).reshape(1, -1)

# If the scaler recorded column names when it was fitted (i.e. it was fitted
# on a DataFrame), hand it a DataFrame with the same names; otherwise pass the
# bare array. This keeps the input consistent with how the scaler was fitted
# and avoids scikit-learn's feature-name mismatch warning.
feature_names = getattr(scalar, 'feature_names_in_', None)
if feature_names is not None:
    sample_row = pd.DataFrame(sample_row, columns=feature_names)

# Apply the scaling learned earlier before asking the model.
stdInput = scalar.transform(sample_row)

p = model.predict(stdInput)

# Label 1 was encoded from 'M' and label 0 from 'B'.
if p[0] == 1:
    print("Malignant Tumor")
else:
    print("Benign Tumor")

Benign Tumor


