**Machine Learning can be boiled down to the following steps**:
0. Define Problem
1. Gather Data
2. Clean Data
3. Explore and Prepare Data
4. Form Hypothesis
5. Create Model
6. Train and Test Model
7. Repeat steps 5 and 6 until satisfied

**Step 0: Define Problem: Detect Breast Cancer in Medical Diagnostic Data**

**Step 1: Gather Data**
1. Go to https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
2. Go to the data folder
3. Copy link from breast-cancer-wisconsin.data
4. Paste link in the Pandas dataframe below

In [None]:
import pandas as pd
# header=None: the file has no header row, so pandas assigns integer column
# labels (0..10, as the preview below shows).
DATA_URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
)
df = pd.read_csv(DATA_URL, header=None)

In [None]:
# Preview the first five rows to check that the file loaded as expected.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [None]:
# Summary statistics per column. describe() only reports numeric columns here,
# so a column that is absent from the output was not parsed as numeric.
df.describe()

Unnamed: 0,0,1,2,3,4,5,7,8,9,10
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


**Step 2: Clean Data**
The data has missing values, marked with `?`:
* Find Missing values
* Delete rows with missing values


In [None]:
# Count, per column, how many cells hold the '?' missing-value marker.
(df == '?').sum()

0      0
1      0
2      0
3      0
4      0
5      0
6     16
7      0
8      0
9      0
10     0
dtype: int64

In [None]:
# Remove every row whose column 6 holds the '?' missing-value marker.
rows_with_missing = df.index[df[6] == '?']
df.drop(rows_with_missing, inplace=True)

# Because of the '?' markers, column 6 was read as strings rather than numbers,
# which is why describe() leaves it out. With the markers gone, convert it to a
# numeric dtype so it is included in the summaries like every other column.
# (Safe to re-run: on an already-numeric column both steps are no-ops.)
df[6] = pd.to_numeric(df[6])

In [None]:
# Re-check: every column should now report zero '?' markers.
(df == '?').sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
dtype: int64

**Step 3: Explore Data**

In [None]:
# Column 10 is used as the class label below; count how many rows each label has.
df[10].value_counts()

2    444
4    239
Name: 10, dtype: int64

In [None]:
# Drop column 0 so it does not appear in the summaries or the model input
# (its values look like sample identifiers — TODO confirm against the dataset docs).
# errors="ignore" keeps this cell safe to re-run once the column is already gone;
# without it a second run raises KeyError.
df = df.drop(columns=0, errors="ignore")
df.describe()

Unnamed: 0,1,2,3,4,5,7,8,9,10
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,4.442167,3.150805,3.215227,2.830161,3.234261,3.445095,2.869693,1.603221,2.699854
std,2.820761,3.065145,2.988581,2.864562,2.223085,2.449697,3.052666,1.732674,0.954592
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [None]:
# Rows whose label (column 10) equals 2.
is_class_2 = df[10] == 2
class_2 = df[is_class_2]

In [None]:
# Rows whose label (column 10) equals 4.
is_class_4 = df[10] == 4
class_4 = df[is_class_4]

In [None]:
# Summary statistics for the rows labelled 2, for comparison with the rows labelled 4.
class_2.describe()

Unnamed: 0,1,2,3,4,5,7,8,9,10
count,444.0,444.0,444.0,444.0,444.0,444.0,444.0,444.0,444.0
mean,2.963964,1.306306,1.414414,1.346847,2.108108,2.083333,1.261261,1.065315,2.0
std,1.672661,0.855657,0.957031,0.917088,0.877112,1.062299,0.954606,0.509738,0.0
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0
50%,3.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
75%,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
max,8.0,9.0,8.0,10.0,10.0,7.0,8.0,8.0,2.0


In [None]:
# Summary statistics for the rows labelled 4, for comparison with the rows labelled 2.
class_4.describe()

Unnamed: 0,1,2,3,4,5,7,8,9,10
count,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0
mean,7.188285,6.577406,6.560669,5.585774,5.32636,5.974895,5.857741,2.60251,4.0
std,2.437907,2.724244,2.569104,3.196631,2.443087,2.282422,3.348876,2.564495,0.0
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0
25%,5.0,4.0,4.0,3.0,3.0,4.0,3.0,1.0,4.0
50%,8.0,6.0,6.0,5.0,5.0,7.0,6.0,1.0,4.0
75%,10.0,10.0,9.0,8.0,6.5,7.0,9.5,3.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [None]:
# Down-sample both classes to the same number of rows so the model is trained
# on a balanced set. A fixed random_state makes the sampled rows reproducible
# across runs; 17 is the same seed the train/test split uses.
SAMPLES_PER_CLASS = 200
SAMPLE_SEED = 17
class_2_balanced = class_2.sample(n=SAMPLES_PER_CLASS, random_state=SAMPLE_SEED)
class_4_balanced = class_4.sample(n=SAMPLES_PER_CLASS, random_state=SAMPLE_SEED)

In [None]:
# Check that the down-sampled label-2 rows still resemble the full label-2 group.
class_2_balanced.describe()

Unnamed: 0,1,2,3,4,5,7,8,9,10
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,2.91,1.285,1.38,1.325,2.035,2.1,1.3,1.1,2.0
std,1.722421,0.675319,0.81789,0.80786,0.75938,1.022363,1.022363,0.679935,0.0
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0
50%,3.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
75%,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
max,8.0,4.0,6.0,6.0,8.0,7.0,8.0,8.0,2.0


In [None]:
# Check that the down-sampled label-4 rows still resemble the full label-4 group.
class_4_balanced.describe()

Unnamed: 0,1,2,3,4,5,7,8,9,10
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,7.195,6.575,6.52,5.535,5.365,6.005,5.895,2.715,4.0
std,2.42412,2.698292,2.569555,3.107581,2.46213,2.240552,3.378022,2.677615,0.0
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0
25%,5.0,4.0,4.0,3.0,3.0,4.0,3.0,1.0,4.0
50%,8.0,6.0,6.0,5.0,5.0,7.0,6.0,1.0,4.0
75%,10.0,10.0,9.0,8.0,6.25,7.0,10.0,3.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [None]:
# Stack the two equally sized samples row-wise into one balanced data set.
balanced_parts = [class_2_balanced, class_4_balanced]
classes = pd.concat(balanced_parts, axis=0)

In [None]:
# Summary statistics for the combined, balanced data set.
classes.describe()

Unnamed: 0,1,2,3,4,5,7,8,9,10
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,5.0525,3.93,3.95,3.43,3.7,4.0525,3.5975,1.9075,3.0
std,3.002045,3.297314,3.201269,3.095813,2.467838,2.616651,3.391792,2.111893,1.001252
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,3.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,5.0,3.0,3.0,2.0,2.0,3.0,1.0,1.0,3.0
75%,8.0,6.0,6.0,5.0,5.0,7.0,6.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


**Step 4: Form Hypothesis**
The data shows a clear numerical separation between the two classes (compare the per-class summary statistics above),
so Logistic Regression should work well.

**Step 5: Create Model** Creating Logistic Regression Model

In [None]:
# Column 10 holds the class label: it becomes the target Y, and all remaining
# columns become the feature matrix X.
TARGET_COLUMN = 10
X = classes.drop(columns=TARGET_COLUMN)
Y = classes[TARGET_COLUMN]

# Print a short preview of the target first, then the features.
for preview in (Y.head(), X.head()):
    print(preview)

647    2
535    2
634    2
597    2
508    2
Name: 10, dtype: int64
     1  2  3  4  5  6  7  8  9
647  1  1  1  3  2  1  1  1  1
535  1  1  3  2  2  1  3  1  1
634  3  1  1  1  2  1  1  1  1
597  5  1  3  1  2  1  3  1  1
508  5  1  1  1  2  1  1  1  1


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a quarter of the rows for testing (0.25 is also scikit-learn's default);
# the fixed seed keeps the split identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=17,
)

In [None]:
# Print summary statistics for each split, features first and then targets,
# to check that the train and test sets look alike.
for split in (X_train, X_test, Y_train, Y_test):
    print(split.describe())

                1           2           3  ...           7           8           9
count  300.000000  300.000000  300.000000  ...  300.000000  300.000000  300.000000
mean     4.983333    3.846667    3.853333  ...    4.073333    3.620000    1.956667
std      2.966320    3.221451    3.123729  ...    2.699180    3.408959    2.231154
min      1.000000    1.000000    1.000000  ...    1.000000    1.000000    1.000000
25%      3.000000    1.000000    1.000000  ...    2.000000    1.000000    1.000000
50%      5.000000    3.000000    3.000000  ...    3.000000    1.000000    1.000000
75%      8.000000    6.000000    6.000000  ...    7.000000    6.000000    1.000000
max     10.000000   10.000000   10.000000  ...   10.000000   10.000000   10.000000

[8 rows x 8 columns]
                1           2          3  ...           7           8           9
count  100.000000  100.000000  100.00000  ...  100.000000  100.000000  100.000000
mean     5.260000    4.180000    4.24000  ...    3.990000    3.5300

**Step 6: Train and Test Your Model**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Build a logistic-regression classifier with scikit-learn's default settings
# and fit it on the training split. fit() is left as the last expression so the
# notebook echoes the fitted estimator and its parameters.
log = LogisticRegression()
log.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Mean accuracy on the held-out test split (rows the model was not fitted on).
log.score(X_test,Y_test)

0.97

In [None]:
# Mean accuracy on the training split; a value far above the test accuracy
# would point to over-fitting.
log.score(X_train, Y_train)

0.99