In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

# load data and get the df

In [2]:
data = load_breast_cancer(return_X_y=False) #If True, returns (data, target) instead of a Bunch object.

In [3]:
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [4]:
print(data.feature_names)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [5]:
print(data.target) 

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 0 1 0 1 1 0 

In [127]:
print(data.target_names) #malignant bad #benign good

['malignant' 'benign']


In [7]:
print(type(data))

<class 'sklearn.utils.Bunch'>


In [8]:
def sklearn_to_df(sklearn_dataset):
    df = pd.DataFrame(sklearn_dataset.data, columns=sklearn_dataset.feature_names)
    df['target'] = pd.Series(sklearn_dataset.target)
    return df

In [9]:
df = sklearn_to_df(data)

In [10]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [11]:
df5 = df[['mean radius', 'mean texture','mean perimeter','mean area','mean smoothness']]

In [12]:
display(df5.head())
print(len(df5))

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness
0,17.99,10.38,122.8,1001.0,0.1184
1,20.57,17.77,132.9,1326.0,0.08474
2,19.69,21.25,130.0,1203.0,0.1096
3,11.42,20.38,77.58,386.1,0.1425
4,20.29,14.34,135.1,1297.0,0.1003


569


# correlation

In [13]:
corr_matrix = df5.corr()
corr_matrix

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness
mean radius,1.0,0.323782,0.997855,0.987357,0.170581
mean texture,0.323782,1.0,0.329533,0.321086,-0.023389
mean perimeter,0.997855,0.329533,1.0,0.986507,0.207278
mean area,0.987357,0.321086,0.986507,1.0,0.177028
mean smoothness,0.170581,-0.023389,0.207278,0.177028,1.0


# change data to discrete values

In [14]:
df5.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness
0,17.99,10.38,122.8,1001.0,0.1184
1,20.57,17.77,132.9,1326.0,0.08474
2,19.69,21.25,130.0,1203.0,0.1096
3,11.42,20.38,77.58,386.1,0.1425
4,20.29,14.34,135.1,1297.0,0.1003


In [15]:
def get_min_max_mean(df):
    min_df = df.min()
    max_df = df.max()
    mean_df = round(df.mean(),4)
    median_df = np.median(df)
    return f'Min: {min_df} Max: {max_df} Mean: {mean_df} Median: {median_df}'

In [16]:
print(get_min_max_mean(df5['mean radius']))
print(get_min_max_mean(df5['mean texture']))
print(get_min_max_mean(df5['mean perimeter']))
print(get_min_max_mean(df5['mean area']))
print(get_min_max_mean(df5['mean smoothness']))

Min: 6.981 Max: 28.11 Mean: 14.1273 Median: 13.37
Min: 9.71 Max: 39.28 Mean: 19.2896 Median: 18.84
Min: 43.79 Max: 188.5 Mean: 91.969 Median: 86.24
Min: 143.5 Max: 2501.0 Mean: 654.8891 Median: 551.1
Min: 0.05263 Max: 0.1634 Mean: 0.0964 Median: 0.09587


In [17]:
def discret_of_col(df,col_name): #change continous to discrete > mean = 1
    col = df[col_name]
    value = col.mean()
    for i in range(len(col)):
        if df.at[i, col_name] >= value:
            df.at[i, col_name] = int(1)
        else:
            df.at[i, col_name] = int(0)
            
#all in one solution inspired by alessio
#for col in df.columns:
#    for row in df.index:
#        if df.loc[row,col] >=df[col].mean():
#              df.loc[row,col] = 1
#        else:
#              df.loc[row,col] = 0

In [18]:
discret_of_col(df5,'mean radius')
discret_of_col(df5,'mean texture')
discret_of_col(df5,'mean perimeter')
discret_of_col(df5,'mean area')
discret_of_col(df5,'mean smoothness')

In [19]:
print(df5)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness
0            1.0           0.0             1.0        1.0              1.0
1            1.0           0.0             1.0        1.0              0.0
2            1.0           1.0             1.0        1.0              1.0
3            0.0           1.0             0.0        0.0              1.0
4            1.0           0.0             1.0        1.0              1.0
..           ...           ...             ...        ...              ...
564          1.0           1.0             1.0        1.0              1.0
565          1.0           1.0             1.0        1.0              1.0
566          1.0           1.0             1.0        1.0              0.0
567          1.0           1.0             1.0        1.0              1.0
568          0.0           1.0             0.0        0.0              0.0

[569 rows x 5 columns]


# for each combinition of values what are the chances that target is 1 ??


In [31]:
target_list = df.loc[:,'target']
print(type(target_list))
df5['target'] = target_list # getting SettingWithCopyWarning why ??

<class 'pandas.core.series.Series'>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['target'] = target_list


In [33]:
print(df5.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          1.0           0.0             1.0        1.0              1.0   
1          1.0           0.0             1.0        1.0              0.0   
2          1.0           1.0             1.0        1.0              1.0   
3          0.0           1.0             0.0        0.0              1.0   
4          1.0           0.0             1.0        1.0              1.0   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [51]:
# count the ones in each column fpr prob->
df_count = df5.apply(pd.value_counts)
display(df_count)
print(len(df5))

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,target
0.0,343,306,343,365,289,212
1.0,226,263,226,204,280,357


569


In [55]:
def event_probability(event_outcomes, sample_space): # how many events are there #samples 
    probability = event_outcomes/ sample_space
    return round(probability,3)

In [58]:
samples = len(df5)

prob_radius_0 = event_probability(df_count.loc[0,'mean radius'],samples)
prob_radius_1 = 1 -prob_radius_0

prob_texture_0 = event_probability(df_count.loc[0,'mean texture'],samples)
prob_texture_1 = 1 -prob_texture_0

prob_perimeter_0 = event_probability(df_count.loc[0,'mean perimeter'],samples)
prob_perimeter_1 = 1 -prob_perimeter_0

prob_area_0 = event_probability(df_count.loc[0,'mean area'],samples)
prob_area_1 = 1 -prob_area_0

prob_smoothness_0 = event_probability(df_count.loc[0,'mean smoothness'],samples)
prob_smoothness_1 = 1 -prob_smoothness_0

prob_target_0 = event_probability(df_count.loc[0,'target'],samples)
prob_target_1 = 1 -prob_target_0



In [59]:
print(prob_target_1)

0.627


In [87]:
#get all combinations of a 5 column matrix with 0 and 1 as a value
def cartesian_coord(*arrays):
    grid = np.meshgrid(*arrays)        
    coord_list = [entry.ravel() for entry in grid]
    points = np.vstack(coord_list).T
    return points

a = np.arange(2)  # 0,1
prob_list = cartesian_coord(*5*[a])
print(prob_list)

[[0 0 0 0 0]
 [0 0 0 0 1]
 [0 0 0 1 0]
 [0 0 0 1 1]
 [0 0 1 0 0]
 [0 0 1 0 1]
 [0 0 1 1 0]
 [0 0 1 1 1]
 [1 0 0 0 0]
 [1 0 0 0 1]
 [1 0 0 1 0]
 [1 0 0 1 1]
 [1 0 1 0 0]
 [1 0 1 0 1]
 [1 0 1 1 0]
 [1 0 1 1 1]
 [0 1 0 0 0]
 [0 1 0 0 1]
 [0 1 0 1 0]
 [0 1 0 1 1]
 [0 1 1 0 0]
 [0 1 1 0 1]
 [0 1 1 1 0]
 [0 1 1 1 1]
 [1 1 0 0 0]
 [1 1 0 0 1]
 [1 1 0 1 0]
 [1 1 0 1 1]
 [1 1 1 0 0]
 [1 1 1 0 1]
 [1 1 1 1 0]
 [1 1 1 1 1]]


In [99]:
df_prob = pd.DataFrame(prob_list, columns=['mean radius', 'mean texture','mean perimeter','mean area','mean smoothness'])
print(df_prob)
#df_prob['target_prob'] = prob_target_1 #wrong

    mean radius  mean texture  mean perimeter  mean area  mean smoothness
0             0             0               0          0                0
1             0             0               0          0                1
2             0             0               0          1                0
3             0             0               0          1                1
4             0             0               1          0                0
5             0             0               1          0                1
6             0             0               1          1                0
7             0             0               1          1                1
8             1             0               0          0                0
9             1             0               0          0                1
10            1             0               0          1                0
11            1             0               0          1                1
12            1             0         

In [195]:
target_prob_list = []
for row in df_prob.index:
    list_holder = df5[(df5['mean radius'] == df_prob.iloc[row,0])& 
                    (df5['mean texture'] == df_prob.iloc[row,1]) &
                    (df5['mean perimeter'] == df_prob.iloc[row,2]) &
                    (df5['mean area'] == df_prob.iloc[row,3]) &
                    (df5['mean smoothness'] == df_prob.iloc[row,4])]


    sample_count = len(list_holder)

    event_1_or_0 = list_holder['target'].value_counts()
    
    if len(event_1_or_0) != 0:
        if event_1_or_0.index.any() == 1:
            event_prob = event_1_or_0[1]/sample_count
        else:
            event_prob = 0
    else:
        event_prob = 0
    
    target_prob_list.append(event_prob)

In [197]:
df_prob['target_prob_1'] = target_prob_list

In [198]:
print(df_prob)

    mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0             0             0               0          0                0   
1             0             0               0          0                1   
2             0             0               0          1                0   
3             0             0               0          1                1   
4             0             0               1          0                0   
5             0             0               1          0                1   
6             0             0               1          1                0   
7             0             0               1          1                1   
8             1             0               0          0                0   
9             1             0               0          0                1   
10            1             0               0          1                0   
11            1             0               0          1                1   