
<h1 id="Final-Project:-Breast-Cancer-Data-Set">Final Project: Breast Cancer Data Set<a class="anchor-link" href="#Final-Project:-Breast-Cancer-Data-Set">¶</a></h1>



<h3 id="Imports">Imports<a class="anchor-link" href="#Imports">¶</a></h3>


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
%matplotlib inline




<h3 id="Check-Filepath">Check Filepath<a class="anchor-link" href="#Check-Filepath">¶</a></h3>


In [None]:

import os

# Report the kernel's working directory; the relative CSV path used when
# loading the data is resolved against it.
working_dir = os.getcwd()
print(working_dir)




<h3 id="Read-CSV-File">Read CSV File<a class="anchor-link" href="#Read-CSV-File">¶</a></h3>


In [None]:

# Load the dataset from the working directory into a DataFrame.
cancerData = pd.read_csv(filepath_or_buffer='data.csv')

# Preview the first five rows to sanity-check the load.
cancerData.head(n=5)




<h3 id="Format-and-Improve-Data">Format and Improve Data<a class="anchor-link" href="#Format-and-Improve-Data">¶</a></h3>


In [None]:

# Drop the 'id' and 'Unnamed: 32' columns, which are not used by any later cell
# (presumably an identifier and an empty artifact column -- confirm against the CSV).
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second execution no longer raises KeyError for missing columns.
cancerData = cancerData.drop(['id', 'Unnamed: 32'], axis=1, errors='ignore')

# Encode the label numerically: malignant ('M') -> 1, benign ('B') -> 0.
# Only map while the column is still non-numeric; mapping the already encoded
# 0/1 values a second time would silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(cancerData['diagnosis']):
    cancerData['diagnosis'] = cancerData['diagnosis'].map({'M': 1, 'B': 0})

# Show the number of missing values left in each column.
cancerData.isnull().sum()




<h3 id="Get-More-Info-on-Data">Get More Info on Data<a class="anchor-link" href="#Get-More-Info-on-Data">¶</a></h3>


In [None]:

# Summary statistics for every column of the cleaned frame.
cancerData.describe()




<h3 id="Find-Possible-Trends">Find Possible Trends<a class="anchor-link" href="#Find-Possible-Trends">¶</a></h3>


In [None]:

# Large canvas so every annotated cell of the correlation matrix stays legible.
plt.subplots(figsize=(20, 15))

# Pairwise correlations between all columns, labelled on every tick and
# annotated with the coefficient values.
correlation_matrix = cancerData.corr()
graph = sns.heatmap(
    correlation_matrix,
    annot=True,
    xticklabels=1,
    yticklabels=1,
)




<h2 id="4-Trends">4 Trends<a class="anchor-link" href="#4-Trends">¶</a></h2>



<h3 id="Concave-Points-vs.-Concavity">Concave Points vs. Concavity<a class="anchor-link" href="#Concave-Points-vs.-Concavity">¶</a></h3>


In [None]:

sns.set_style("darkgrid", {"axes.facecolor": ".9"})

# Regression joint plot: concavity on the x-axis, concave points on the y-axis.
graph = sns.jointplot(
    data=cancerData,
    x='concavity_mean',
    y='concave points_mean',
    kind='reg',
    dropna=True,
)
graph.set_axis_labels(xlabel='Concavity', ylabel='Concave Points', fontsize=14)




<h3 id="Compactness-vs-Concave-Points">Compactness vs. Concave Points<a class="anchor-link" href="#Compactness-vs-Concave-Points">¶</a></h3>


In [None]:

sns.set_style("darkgrid", {"axes.facecolor": ".9"})

# Regression joint plot: compactness on the x-axis, concave points on the y-axis.
graph = sns.jointplot(y='concave points_mean', x='compactness_mean', data=cancerData, dropna=True, kind='reg')

# set_axis_labels takes (xlabel, ylabel); the labels must follow the x/y
# columns chosen above, so 'Compactness' goes first.
graph.set_axis_labels('Compactness', 'Concave Points', fontsize=14)




<h3 id="Correlation-between-Concave-Points-and-the-Final-Diagnosis">Correlation between Concave Points and the Final Diagnosis<a class="anchor-link" href="#Correlation-between-Concave-Points-and-the-Final-Diagnosis">¶</a></h3>


In [None]:

# Overlay the concave-points distribution of benign (0) and malignant (1) cases.
benign_rows = cancerData['diagnosis'] == 0
malignant_rows = cancerData['diagnosis'] == 1

cancerData.loc[benign_rows, 'concave points_mean'].hist(
    bins=30, color='green', alpha=.5, label='Benign')
cancerData.loc[malignant_rows, 'concave points_mean'].hist(
    bins=30, color='red', alpha=.5, label='Malignant')

plt.legend()
plt.title('Concave Points Histogram')
plt.xlabel('Concave Points')
plt.ylabel('Frequency')




<h3 id="Correlation-between-Radius-and-the-Final-Diagnosis">Correlation between Radius and the Final Diagnosis<a class="anchor-link" href="#Correlation-between-Radius-and-the-Final-Diagnosis">¶</a></h3>


In [None]:

# Overlay the radius distribution per class, benign first and malignant second
# so the drawing order (and legend order) stays the same.
for class_value, class_colour, class_label in ((0, 'green', 'Benign'), (1, 'red', 'Malignant')):
    class_radius = cancerData.loc[cancerData['diagnosis'] == class_value, 'radius_mean']
    class_radius.hist(bins=30, color=class_colour, alpha=.5, label=class_label)

plt.legend()
plt.title('Radius Histogram')
plt.xlabel('Radius')
plt.ylabel('Frequency')




<h1 id="Linear-Regression-of-Data">Linear Regression of Data<a class="anchor-link" href="#Linear-Regression-of-Data">¶</a></h1>



<p><strong>Set up Arrays</strong></p>


In [None]:

# Predictors: the columns judged most correlated with the diagnosis.
# (Alternative kept for reference: use every column except the label,
#  i.e. cancerData.drop(['diagnosis'], axis=1).)
predictor_columns = [
    'concave points_mean',
    'concave points_worst',
    'radius_mean',
    'radius_worst',
    'perimeter_mean',
    'perimeter_worst',
]
X = cancerData[predictor_columns]

# Target: the diagnosis column.
y = cancerData['diagnosis']

X.head()




<p><strong>Train</strong></p>


In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out a test set (library-default size) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
)

# Fit an ordinary least-squares model on the training portion.
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)




<p><strong>Coefficients</strong></p>


In [None]:

# One row per predictor, showing the weight the linear model assigned to it.
coeff_df = pd.DataFrame(
    data=lm.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
coeff_df




<p><strong>Make predictions</strong></p>


In [None]:

sns.set_style("darkgrid", {"axes.facecolor": ".9"})

# Actual labels on the x-axis against the linear model's predictions on the y-axis.
linear_test_predictions = lm.predict(X_test)
plt.scatter(x=y_test, y=linear_test_predictions)
plt.xlabel('Actual Value', fontsize=14)
plt.ylabel('Predicted Value', fontsize=14)



In [None]:

# Show the first 100 raw (continuous) predictions on the test set.
first_linear_predictions = lm.predict(X_test)[:100]
print(first_linear_predictions)




<p><strong>Residual Histogram</strong></p>


In [None]:

sns.set_style("darkgrid", {"axes.facecolor": ".9"})

# Residual = actual label minus the linear model's prediction on the test set.
residuals = y_test - lm.predict(X_test)

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(residuals, bins=50, kde=True) once the seaborn version is confirmed.
graph = sns.distplot(residuals, bins=50)
plt.suptitle('Residual Histogram', fontsize=14)

# The x-axis shows the residuals, not the diagnosis itself, so label it as such.
plt.xlabel('Residual (Actual - Predicted)', fontsize=14)




<p><strong>Check Error</strong></p>


In [None]:

from sklearn import metrics

# Predict once and reuse the result for all three error measures.
linear_pred = lm.predict(X_test)
mean_abs_err = metrics.mean_absolute_error(y_test, linear_pred)
mean_sq_err = metrics.mean_squared_error(y_test, linear_pred)

print('Mean Absolute Error:', mean_abs_err)
print('Mean Squared Error:', mean_sq_err)
print('Root Mean Squared Error:', np.sqrt(mean_sq_err))




<h2 id="First-Categorical-Technique:-Logistic-Regression">First Categorical Technique: Logistic Regression<a class="anchor-link" href="#First-Categorical-Technique:-Logistic-Regression">¶</a></h2>


In [None]:

from sklearn.model_selection import train_test_split



In [None]:

# Re-split for the classifier: 30% of the rows held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)



In [None]:

from sklearn.linear_model import LogisticRegression



In [None]:

# Logistic-regression classifier with the library's default settings.
logmodel = LogisticRegression()



In [None]:

# Train the classifier on the training split.
logmodel.fit(X=X_train, y=y_train)



In [None]:

# Predicted class labels for the held-out rows; display at most the first 200.
predictions = logmodel.predict(X=X_test)
predictions[0:200]



In [None]:

# Distribution of (actual - predicted) labels for the logistic model.
logistic_errors = y_test - logmodel.predict(X_test)
sns.distplot(logistic_errors, bins=50)



In [None]:

sns.set_style("darkgrid", {"axes.facecolor": ".9"})

# Label the axes, then plot actual labels against the logistic model's predictions.
logistic_test_predictions = logmodel.predict(X_test)
plt.xlabel('Actual Value', fontsize=14)
plt.ylabel('Predicted Value', fontsize=14)
plt.scatter(x=y_test, y=logistic_test_predictions)



In [None]:

from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 for the logistic model on the test split.
logistic_report = classification_report(y_true=y_test, y_pred=predictions)
print(logistic_report)



In [None]:

# Confusion matrix of actual vs. predicted labels for the logistic model.
logistic_confusion = confusion_matrix(y_true=y_test, y_pred=predictions)
print(logistic_confusion)




<h2 id="Second-Categorical-Technique:-SVMs">Second Categorical Technique: SVMs<a class="anchor-link" href="#Second-Categorical-Technique:-SVMs">¶</a></h2>


In [None]:

from sklearn.model_selection import train_test_split



In [None]:

# Re-split for the SVM (and the decision tree below, which reuses this split).
# A fixed seed -- the same one used by the earlier splits -- makes the reported
# metrics reproducible across Restart & Run All; the split size is left at the
# library default, as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101)



In [None]:

from sklearn.svm import SVC



In [None]:

# Support-vector classifier with the library's default settings.
model = SVC()



In [None]:

# Train the support-vector classifier on the training split.
model.fit(X=X_train, y=y_train)



In [None]:

from sklearn.metrics import classification_report,confusion_matrix



In [None]:

# Predicted class labels from the SVM; display at most the first 200.
predictions = model.predict(X=X_test)
predictions[0:200]



In [None]:

# Distribution of (actual - predicted) labels for the SVM.
svm_errors = y_test - model.predict(X_test)
sns.distplot(svm_errors, bins=50)



In [None]:

# Set the style before drawing: seaborn styles only apply to axes created
# afterwards, and the earlier scatter cells in this notebook use this order.
sns.set_style("darkgrid", {"axes.facecolor": ".9"})

# Actual labels on the x-axis against the SVM's predictions on the y-axis.
plt.scatter(y_test, model.predict(X_test))
plt.xlabel('Actual Value', fontsize=14)
plt.ylabel('Predicted Value', fontsize=14)



In [None]:

# Per-class precision / recall / F1 for the SVM on the test split.
svm_report = classification_report(y_true=y_test, y_pred=predictions)
print(svm_report)



In [None]:

# Confusion matrix of actual vs. predicted labels for the SVM.
svm_confusion = confusion_matrix(y_true=y_test, y_pred=predictions)
print(svm_confusion)




<h2 id="Third-Categorical-Technique:-Decision-Tree">Third Categorical Technique: Decision Tree<a class="anchor-link" href="#Third-Categorical-Technique:-Decision-Tree">¶</a></h2>


In [None]:

from sklearn.tree import DecisionTreeClassifier



In [None]:

# Decision-tree classifier with default hyper-parameters. The seed (the same
# value used for the data splits) pins the tree's internal randomness so the
# fitted tree and its metrics are identical on every run.
dtree = DecisionTreeClassifier(random_state=101)



In [None]:

# Train the decision tree on the current training split.
dtree.fit(X=X_train, y=y_train)



In [None]:

from sklearn.metrics import classification_report, confusion_matrix



In [None]:

# Predicted class labels from the decision tree; display at most the first 200.
predict = dtree.predict(X=X_test)
predict[0:200]



In [None]:

# Distribution of (actual - predicted) labels for the decision tree.
tree_errors = y_test - dtree.predict(X_test)
sns.distplot(tree_errors, bins=50)



In [None]:

# Set the style before drawing: seaborn styles only apply to axes created
# afterwards, and the earlier scatter cells in this notebook use this order.
sns.set_style("darkgrid", {"axes.facecolor": ".9"})

# Actual labels on the x-axis against the decision tree's predictions on the y-axis.
plt.scatter(y_test, dtree.predict(X_test))
plt.xlabel('Actual Value', fontsize=14)
plt.ylabel('Predicted Value', fontsize=14)



In [None]:

# Per-class precision / recall / F1 for the decision tree on the test split.
tree_report = classification_report(y_true=y_test, y_pred=predict)
print(tree_report)



In [None]:

# Confusion matrix of actual vs. predicted labels for the decision tree.
tree_confusion = confusion_matrix(y_true=y_test, y_pred=predict)
print(tree_confusion)

