<a href="https://colab.research.google.com/github/0xmeach/Portfolio/blob/main/ML_Breast_Cancer_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Description: This program predicts whether a breast tumor is malignant or benign from diagnostic data.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#Load the data
#The Colab upload prompt is only needed when data.csv is not already in the
#working directory; skipping it otherwise lets "Restart & Run All" finish
#without waiting for user input.
import os

uploaded = {}
if not os.path.exists('data.csv'):
  try:
    from google.colab import files
  except ImportError:
    #Not running on Colab, so there is no upload widget. Fall through and let
    #read_csv below report the missing file.
    pass
  else:
    uploaded = files.upload()

df = pd.read_csv('data.csv')
df.head(7)

In [None]:
#Count the number of rows and columns in the data set
#(shape is a (rows, columns) tuple)
df.shape

In [None]:
#Count the number of empty values (NaN, NAN, na) in each column
#isna() flags every missing cell; sum() then totals the flags per column
df.isna().sum()

In [None]:
#Drop every column that contains at least one missing value
#(how='any' is dropna's default; it is spelled out so the rule is visible)
df = df.dropna(axis='columns', how='any')

In [None]:
#Get the new count of rows and columns now that columns with missing values are gone
df.shape

In [None]:
#Count how many rows are labelled Malignant (M) versus Benign (B)
df['diagnosis'].value_counts()

In [None]:
#Visualize the count of each diagnosis
#The column is passed by keyword: recent seaborn releases treat the first
#positional argument as `data`, so a bare Series no longer means "x".
#The extra label kwarg is dropped because countplot already labels the
#y-axis "count". The trailing semicolon hides the Axes repr.
sns.countplot(x='diagnosis', data=df);

In [None]:
#Look at the data types to see which columns need to be encoded
#(columns reported as object hold non-numeric values)
df.dtypes

In [None]:
#Encode the categorical data values (turn 'M' and 'B' into 1s and 0s)
from sklearn.preprocessing import LabelEncoder
labelencoder_Y = LabelEncoder()

#Replace the whole column by name instead of assigning through df.iloc[:,1].
#An iloc assignment writes into the existing object-dtype column, so the
#encoded integers would stay typed as object and scikit-learn would later
#reject Y as an unknown label type. Column replacement takes the integer
#dtype of the encoder output.
target_column = df.columns[1]
df[target_column] = labelencoder_Y.fit_transform(df[target_column].values)



In [None]:
#Create a pair plot of the columns at positions 1-4, colored by diagnosis
sns.pairplot(
    data=df.iloc[:, 1:5],
    hue='diagnosis',
)

In [None]:
#Show the first 5 rows of the data after the diagnosis column has been encoded
df.head(5)

In [None]:
# Get the pairwise correlation of the columns at positions 1 through 11
df.iloc[:,1:12].corr()

In [None]:
# Visualize the correlation as an annotated heat map (cells shown as whole percentages)
plt.figure(figsize=(10, 10))
sns.heatmap(
    data=df.iloc[:, 1:12].corr(),
    annot=True,
    fmt='.0%',
)

In [None]:
#Split the data set into independent (X) and dependent (Y) data sets
#(Y is the target value: column 1, which holds the label-encoded diagnosis)

#Take every column after the target as a feature. The earlier hard-coded
#slice 2:31 stops at column index 30, so a column at index 31 was silently
#left out of the feature matrix.
#NOTE(review): assumes all columns from position 2 onward are features -
#confirm against the layout of data.csv.
X = df.iloc[:,2:].values
Y = df.iloc[:,1].values

print(X)

In [None]:
#Hold out 25% of the rows for testing and train on the remaining 75%
#(random_state pins the shuffle so the split is the same on every run)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
#Scale the data (Feature Scaling)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

#Learn the per-feature mean and standard deviation from the training set only
X_train = sc.fit_transform(X_train)

#Apply the training statistics to the test set. Calling fit_transform here
#would refit the scaler on the test data, so train and test would be scaled
#differently and test-set information would leak into preprocessing.
X_test = sc.transform(X_test)


In [None]:
#Create a function for the models
def models(X_train, Y_train):
  """Fit three classifiers on the training data and print their training accuracy.

  Args:
    X_train: feature matrix handed to each estimator's fit() and score().
    Y_train: target labels aligned with the rows of X_train.

  Returns:
    A tuple of the fitted estimators in the order
    (logistic regression, decision tree, random forest).
  """
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.linear_model import LogisticRegression
  from sklearn.tree import DecisionTreeClassifier

  #Each entry pairs the report caption with its (not yet fitted) estimator
  candidates = (
    ('[0]Logistic Regression Training Accuracy:',
     LogisticRegression(random_state=0)),
    ('[1]Decision Tree Classifier Training Accuracy:',
     DecisionTreeClassifier(criterion='entropy', random_state=0)),
    ('[2]Random Forest Classifier Training Accuracy:',
     RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
  )

  #Fit all estimators before reporting so the accuracy lines print together
  for _, estimator in candidates:
    estimator.fit(X_train, Y_train)

  #Print the models accuracy on the training data
  for caption, estimator in candidates:
    print(caption, estimator.score(X_train, Y_train))

  return tuple(estimator for _, estimator in candidates)

In [None]:
#Getting all of the models
#model is the tuple returned by models(): (logistic regression, decision tree, random forest)
model = models(X_train, Y_train)

In [None]:
#Test model accuracy on the test data using a confusion matrix
from sklearn.metrics import confusion_matrix

for i in range( len(model) ):
  print('Model', i)
  cm = confusion_matrix(Y_test, model[i].predict(X_test))

  #scikit-learn puts the actual class on the rows and the predicted class on
  #the columns, in sorted label order. With the 0/1 encoding (1 = 'M' as the
  #positive class) the layout is therefore [[TN, FP], [FN, TP]].
  TN = cm[0][0]  #true negative:  actual 0, predicted 0
  FP = cm[0][1]  #false positive: actual 0, predicted 1
  FN = cm[1][0]  #false negative: actual 1, predicted 0
  TP = cm[1][1]  #true positive:  actual 1, predicted 1

  print(cm) #diagonal = correct predictions, off-diagonal = errors
  print('Testing Accuracy = ', (TP+TN)/(TP+TN+FN+FP))
  print()

In [None]:
#Show another way to get metrics of the models:
#scikit-learn's classification report plus the plain accuracy score

from sklearn.metrics import accuracy_score, classification_report

for position, estimator in enumerate(model):
  #Predict once and reuse the result for both metrics
  test_predictions = estimator.predict(X_test)
  print('Model', position)
  print(classification_report(Y_test, test_predictions))
  print(accuracy_score(Y_test, test_predictions))
  print()

In [None]:
#Print the Random Forest Classifier (model[2]) predictions, a blank line,
#and then the true test labels for a side-by-side comparison
pred = model[2].predict(X_test)
print(pred, '', Y_test, sep='\n')