In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# ML Models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression , Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [67]:
# Read data set into pandas data frame.
df = pd.read_csv("Cancer.csv")

In [68]:
# Display first 5 rows of the data
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [69]:
# Check for nulls and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [70]:
# Remove unnamed column as it doesn't give value.
df.drop(['Unnamed: 32'], axis = 1, inplace = True) 

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [72]:
# Check Duplicates
df.duplicated().sum()

0

In [73]:
# Check Correlation
df.corr()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,1.0,0.074626,0.09977,0.073159,0.096893,-0.012968,9.6e-05,0.05008,0.044158,-0.022114,...,0.082405,0.06472,0.079986,0.107187,0.010338,-0.002968,0.023203,0.035174,-0.044224,-0.029866
radius_mean,0.074626,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,0.147741,...,0.969539,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066
texture_mean,0.09977,0.323782,1.0,0.329533,0.321086,-0.023389,0.236702,0.302418,0.293464,0.071401,...,0.352573,0.912045,0.35804,0.343546,0.077503,0.27783,0.301025,0.295316,0.105008,0.119205
perimeter_mean,0.073159,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,0.183027,...,0.969476,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019
area_mean,0.096893,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,0.151293,...,0.962746,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738
smoothness_mean,-0.012968,0.170581,-0.023389,0.207278,0.177028,1.0,0.659123,0.521984,0.553695,0.557775,...,0.21312,0.036072,0.238853,0.206718,0.805324,0.472468,0.434926,0.503053,0.394309,0.499316
compactness_mean,9.6e-05,0.506124,0.236702,0.556936,0.498502,0.659123,1.0,0.883121,0.831135,0.602641,...,0.535315,0.248133,0.59021,0.509604,0.565541,0.865809,0.816275,0.815573,0.510223,0.687382
concavity_mean,0.05008,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,0.500667,...,0.688236,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493
concave points_mean,0.044158,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,0.462497,...,0.830318,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661
symmetry_mean,-0.022114,0.147741,0.071401,0.183027,0.151293,0.557775,0.602641,0.500667,0.462497,1.0,...,0.185728,0.090651,0.219169,0.177193,0.426675,0.4732,0.433721,0.430297,0.699826,0.438413


In [74]:
# Drop outliers
mean = df.mean()
std_dev = df.std()
maxx = mean + 3 * std_dev
minn = mean - 3 * std_dev
outliers = df[(df > maxx) | (df < minn)].stack()
df = df.drop(outliers.index.get_level_values(0))

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 487 entries, 1 to 566
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       487 non-null    int64  
 1   diagnosis                487 non-null    object 
 2   radius_mean              487 non-null    float64
 3   texture_mean             487 non-null    float64
 4   perimeter_mean           487 non-null    float64
 5   area_mean                487 non-null    float64
 6   smoothness_mean          487 non-null    float64
 7   compactness_mean         487 non-null    float64
 8   concavity_mean           487 non-null    float64
 9   concave points_mean      487 non-null    float64
 10  symmetry_mean            487 non-null    float64
 11  fractal_dimension_mean   487 non-null    float64
 12  radius_se                487 non-null    float64
 13  texture_se               487 non-null    float64
 14  perimeter_se             4

In [76]:
# Split the data into labels and features
X = df.drop(columns='diagnosis')
y = df[['diagnosis']]
y.head()

Unnamed: 0,diagnosis
1,M
2,M
4,M
5,M
6,M


In [77]:
# Normalize data Using Z-score for better results
for column in X:
    X[column] = (X[column] -X[column].mean()) / X[column].std()    

    
# view normalized data   
display(X)

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
1,-0.422611,2.222574,-0.275188,2.084027,2.478382,-0.810290,-0.397167,0.205598,0.829361,0.151269,...,2.232985,-0.302549,1.954625,2.533921,-0.334161,-0.360026,0.000952,1.369769,-0.181461,0.513141
2,2.571686,1.931377,0.599998,1.943677,2.046082,1.129111,1.578582,1.995202,2.608723,1.267057,...,1.887173,0.070503,1.730729,1.984206,0.641266,1.527909,1.219008,2.352410,1.552452,0.413144
4,2.573749,2.129920,-1.137800,2.190499,2.376458,0.403591,0.919675,2.004920,1.881320,0.138244,...,1.636337,-1.488573,1.720067,1.685980,0.309811,-0.214006,0.924994,0.964645,-0.957000,-0.336837
5,-0.422565,-0.464384,-0.795773,-0.351776,-0.505192,2.548946,1.824153,1.353860,1.159774,1.345206,...,-0.085417,-0.242720,-0.014240,-0.168810,2.284340,2.324667,1.715447,1.164621,2.299863,2.970021
6,-0.422545,1.454872,0.280605,1.440352,1.473197,-0.038742,0.341004,0.623442,0.947410,0.073121,...,1.719137,0.445314,1.755606,1.754972,0.631796,0.203418,0.798988,1.493892,0.447409,0.142317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,-0.419641,0.065066,2.083791,0.074598,-0.071837,0.324798,0.428534,-0.479148,-0.006844,-1.042668,...,-0.126817,1.414898,-0.127966,-0.246482,-0.319956,-0.044180,-0.634911,-0.030064,-1.186045,0.109679
563,-0.419611,2.338391,1.565721,2.572833,2.552190,1.152515,3.127379,3.938664,3.209754,1.614384,...,2.062514,0.753258,2.676069,2.229018,0.466068,1.481088,2.441148,2.545491,0.178181,1.187429
564,-0.419600,2.550171,0.886697,2.524436,3.016122,1.238329,0.508770,2.748294,2.947766,-0.222107,...,2.345009,0.223595,2.214061,2.691937,0.480274,-0.164011,0.987414,1.983489,-1.567787,-0.727798
565,-0.419591,2.076975,2.360430,2.001753,2.249931,0.208559,0.204846,1.130362,1.684367,-0.109226,...,1.916396,2.308815,1.819577,2.033168,-0.675087,-0.315585,0.467056,0.969817,-0.539093,-1.059734


In [78]:
# Change string values to binary so the classifier could understand them
y['diagnosis'] = y['diagnosis'].replace(['M','B'],['1','0'])

In [79]:
# Splitting the data into training and testing
x_train,x_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state= 42)

In [80]:
# Linear Regression Model
cl1 = LinearRegression().fit(x_train,y_train)
cl1_pred = cl1.predict(x_test)

In [81]:
# Ridge Model
cl2 = Ridge(alpha = 1).fit(x_train,y_train)
cl2_pred = cl2.predict(x_test)

In [82]:
# Grid Search Model  using decision tree regressor
params = [{'max_depth':list(range(5,20)),'min_samples_split':list(range(2,15)),'min_samples_leaf':[2,3,4]}]
cl3 = GridSearchCV(estimator = DecisionTreeRegressor(), param_grid=params, cv=10, n_jobs=-1).fit(x_train,y_train)
cl3_pred = cl3.predict(x_test)

In [83]:
# Decision Tree Model
cl4 = DecisionTreeRegressor().fit(x_train,y_train)
cl4_pred = cl4.predict(x_test)

In [84]:
# Random Forest Model
cl5 = RandomForestClassifier().fit(x_train,y_train)
cl5_pred = cl5.predict(x_test)

In [85]:
print("Linear regression r2 score: ",r2_score(y_test,cl1_pred))
print("Linear regression MSE: ",mean_squared_error(y_test,cl1_pred))
print("--------------------------")
print("Ridge r2 score: ",r2_score(y_test,cl2_pred))
print("Ridge MSE: ",mean_squared_error(y_test,cl2_pred))
print("--------------------------")
print("Grid Search r2 score: ",r2_score(y_test,cl3_pred))
print("Grid Search MSE: ",mean_squared_error(y_test,cl3_pred))
print("--------------------------")
print("Decision Tree r2 score: ",r2_score(y_test,cl4_pred))
print("Decision Tree MSE: ",mean_squared_error(y_test,cl4_pred))
print("--------------------------")
print("Random Forest r2 score: ",r2_score(y_test,cl5_pred))
print("Random Forest MSE: ",mean_squared_error(y_test,cl5_pred))

Linear regression r2 score:  0.7155135044341661
Linear regression MSE:  0.06815945713213076
--------------------------
Ridge r2 score:  0.7179051100697802
Ridge MSE:  0.06758645790602205
--------------------------
Grid Search r2 score:  0.6250171423052779
Grid Search MSE:  0.08984126984126983
--------------------------
Decision Tree r2 score:  0.5315080399826162
Decision Tree MSE:  0.11224489795918367
--------------------------
Random Forest r2 score:  0.8722294654498044
Random Forest MSE:  0.030612244897959183


### The best model as we can see is the random forest model with 87% r2 score and a 3% error.