In [39]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset
cancer = load_breast_cancer()

# Print the dataset description
print(cancer.DESCR)



.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [40]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the breast cancer dataset
cancer = load_breast_cancer()

# Create a pandas DataFrame from the dataset
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)

# Add the target column to the DataFrame
df['target'] = cancer.target

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.2, random_state=42)

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = rf_classifier.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.9649122807017544


In [41]:
!pip install --upgrade bottleneck






In [42]:
#Question 2
#What is the class distribution? (i.e. how many instances of malignant (encoded 0) and how many benign (encoded 1)?) This function should return a Series named target of length 2 with integer values and index = ['malignant', 'benign']

import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset
cancer = load_breast_cancer()

# Create a pandas DataFrame from the dataset
df = pd.DataFrame(cancer.target, columns=['target'])

# Map target values to 'malignant' and 'benign'
df['target'] = df['target'].map({0: 'malignant', 1: 'benign'})

# Calculate the class distribution
class_distribution = df['target'].value_counts()

print(class_distribution)


target
benign       357
malignant    212
Name: count, dtype: int64


In [43]:
#Question 3
#Split the DataFrame into X (the data) and y (the labels). This function should return a tuple of length 2: (X, y), where X, a pandas DataFrame, has shape (569, 30) y, a pandas Series, has shape (569,).

import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset
cancer = load_breast_cancer()

# Create a pandas DataFrame from the dataset
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)

# Create a pandas Series for the labels
y = pd.Series(cancer.target)

# Return a tuple of (X, y)
X = df
print(X.shape, y.shape)



(569, 30) (569,)


In [44]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (455, 30)
X_test shape: (114, 30)
y_train shape: (455,)
y_test shape: (114,)


In [45]:
#Question 5 
#Using KNeighborsClassifier, fit a k-nearest neighbors (knn) classifier with X_train, y_train and using one nearest neighbor (n_neighbors = 1). This function should return a sklearn.neighbors.classification.KNeighborsClassifier.

from sklearn.neighbors import KNeighborsClassifier

# Create a k-nearest neighbors classifier with one nearest neighbor
knn_classifier = KNeighborsClassifier(n_neighbors=1)

# Fit the classifier with X_train and y_train
knn_classifier.fit(X_train, y_train)

# Return the trained classifier
print(knn_classifier)



KNeighborsClassifier(n_neighbors=1)


In [53]:
# 6
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier

# Load the breast cancer dataset
cancer = load_breast_cancer()

# Separate features (X) and target variable (y)
X = cancer.data
y = cancer.target

# Assuming you have a trained KNN classifier named 'knn_classifier'

# Calculate mean values, excluding the target column
mean_values = X.mean(axis=0).reshape(1, -1)  # More explicit approach

# Make prediction using the mean values
prediction = knn_classifier.predict(mean_values)

# Print the predicted class label (either 0 or 1)
print("Predicted class:", prediction[0])




Predicted class: 1




In [51]:
# question 7
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the breast cancer dataset
cancer = load_breast_cancer()

# Separate features (X) and target variable (y)
X = cancer.data
y = cancer.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create and train a KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # Adjust n_neighbors as needed
knn.fit(X_train, y_train)

# Make predictions on the test set
predictions = knn.predict(X_test)

# Ensure the output format
predictions = np.asarray(predictions).reshape(-1)  # Reshape to (143,)
np.clip(predictions, 0, 1, out=predictions)  # Clamp values to 0 or 1

# Print the predictions
print(predictions)


[1 1 1 1 0 1 1 0 0 1 1 0 1 1 1 0 0 1 0 1 0 0 1 1 1 0 1 0 1 0 1 0 1 1 0 0 1
 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 0 0 1 1
 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 0 0 1 1 0
 1 1 1]


In [49]:
# question 8:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the breast cancer dataset
cancer = load_breast_cancer()

# Separate features (X) and target variable (y)
X = cancer.data
y = cancer.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create and train a KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # Adjust n_neighbors as needed
knn.fit(X_train, y_train)

# Make predictions on the test set
predictions = knn.predict(X_test)

# Calculate accuracy score (mean accuracy)
accuracy = accuracy_score(y_test, predictions)

# Print the accuracy score (between 0 and 1)
print("Accuracy:", accuracy)


Accuracy: 0.956140350877193
