In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Use a 10 pt base font size for every matplotlib figure drawn in this notebook.
plt.rcParams['font.size'] = 10

**To find the geometrical features that distinguish malignant from benign tumours, we need to find the features in the dataframe whose median values differ significantly between the two diagnoses.**

In [None]:
# Read the raw table, then separate the diagnosis labels from the feature columns.
data = pd.read_csv('Cancergeometry.csv')
y = data['diagnosis']  # label column
x = data.drop(columns=['id', 'diagnosis'])  # every column except 'id' and 'diagnosis'
x.head()

In [None]:
# Alias the labels and the feature table; note that `data` is rebound here
# from the raw table to the feature columns only.
data_dia = y  # diagnosis labels (M or B)
data = x  # tumour feature columns

# Standardise each column (subtract its mean, divide by its standard deviation)
# so that all features share a comparable scale and fit in the same plot.
data_n_2 = data.sub(data.mean()).div(data.std())

# Violin Plot

In [None]:
fig, ax = plt.subplots(figsize=(20, 8))  # one wide axes for all features

# Place the diagnosis column beside the standardised features, then reshape to
# long format (one row per diagnosis / feature / value), which is the layout
# seaborn needs to draw one violin per feature.
data = pd.concat([y, data_n_2], axis=1).melt(
    id_vars="diagnosis",
    var_name="features",
    value_name="value",
)

# Violins are split by diagnosis, with quartile lines drawn inside each half.
sns.violinplot(
    data=data,
    x="features",
    y="value",
    hue="diagnosis",
    split=True,
    inner="quart",
    ax=ax,
)
plt.xticks(rotation=90);  # vertical feature names are easier to read

# Box Plot

In [None]:
# Box plot of the long-format `data` frame: boxes grouped by feature and
# coloured by diagnosis.
fig, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=data, x="features", y="value", hue="diagnosis", ax=ax)
plt.xticks(rotation=90);  # rotate the feature names so they do not overlap

**The features that show a significant difference in median between the B and M labels are:** 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'radius_se', 'perimeter_se', 'area_se', 'compactness_se', 'concavity_se', 'concave points_se', 'fractal_dimension_se', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean'

# Pair Plot

In [None]:
# Grid of pairwise plots for all feature columns in `x`.
sns.pairplot(data=x)

# Correlation Heatmap

In [None]:
fig, ax = plt.subplots(figsize=(20, 8))  # figure and axes for the heatmap
corr = x.corr()  # pairwise correlation between the feature columns

# Boolean mask covering the upper triangle (diagonal included). It is built
# from an all-True array rather than from the correlation values themselves,
# so a coefficient that happens to be exactly 0 is still hidden.
matrix = np.triu(np.ones_like(corr, dtype=bool))

# Draw only the lower triangle, annotated with the coefficients, on the axes
# created above.
sns.heatmap(
    corr,
    mask=matrix,
    annot=True,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    ax=ax,
);

A very strong correlation is observed between the following feature pairs:
- `radius_mean` and `perimeter_mean`
- `radius_mean` and `area_mean`
- `radius_mean` and `radius_worst`
- `radius_mean` and `perimeter_worst`
- `perimeter_mean` and `area_mean`
- `perimeter_mean` and `perimeter_worst`
- `area_mean` and `radius_worst`
- `area_mean` and `perimeter_worst`
- `area_mean` and `area_worst`
- `radius_worst` and `perimeter_worst`
- `radius_worst` and `area_worst`
- `perimeter_worst` and `area_worst`