# Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from scipy.stats import expon, uniform

# Import Data

In [None]:
# Location of the raw CSV; the loaded frame is kept in `df` for every later cell.
# NOTE(review): this is an absolute, machine-specific path — adjust before running elsewhere.
DATA_PATH = "C:/Users/lenovo/Desktop/JPH/Cancer_Data.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first five rows to sanity-check the load.
df.head(5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

Number of rows and columns in the dataset.

In [None]:
# List the column labels of the loaded frame.
df.columns

# 
We have 33 columns of data.
The column "diagnosis" is our target variable.

Cancer Types:
1. Benign cancer (B)
2. Malignant cancer (M)

In [None]:
# Distribution of texture_mean with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='texture_mean', color='#0000ff', kde=True, ax=ax);

# Data Cleaning, Correcting, Completing and Converting

Null Columns

In [None]:
# Count the missing values in each column, then print the per-column totals.
null_counts = df.isnull().sum()
print('Train columns with null values:\n', null_counts)

# 
Only the "Unnamed: 32" column contains null values (it is entirely NaN, as shown below); every other column is complete.

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

This function shows

1.column name

2.non-null values

3.count

4.Dtype

In [None]:
# Display the 'Unnamed: 32' column on its own to inspect its contents.
df['Unnamed: 32']

We can see that column 32 represents a column full of NaNs, and it's called "Unnamed:32". We will need to delete that later.

Transforming the target column from categorical to numerical

In [None]:
# Encode the target column: benign ('B') -> 0, malignant ('M') -> 1.
#
# The result is assigned back to df['diagnosis'] instead of calling
# replace(..., inplace=True) on the column selection: an in-place call on a
# selection is not guaranteed to write through to `df` (it does not under
# pandas Copy-on-Write), which would silently leave the labels as strings.
#
# The dtype guard makes the cell safe to re-run once the column is numeric.
# astype('int64') raises if any label other than 'B'/'M' is present, because
# map() turns unmapped labels into NaN.
diagnosis_codes = {'B': 0, 'M': 1}
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = df['diagnosis'].map(diagnosis_codes).astype('int64')

In [None]:
# Preview the first rows to check the encoded 'diagnosis' column.
df.head()

In [None]:
# Number of rows for each distinct value of 'diagnosis'.
df['diagnosis'].value_counts()

After converting the target from categorical to numerical, these are the row counts for each class. Since 357 + 212 = 569, every row has been encoded.

## Exploratory Data Analysis

In [None]:
# Share of each diagnosis class as a pie chart.
diagnosis_counts = df['diagnosis'].value_counts()

# Label every slice so the reader can tell the classes apart. Both the encoded
# (0/1) and raw ('B'/'M') values are mapped, so the labels are right whichever
# form the column is in; any other value falls back to itself.
diagnosis_names = {0: 'Benign', 1: 'Malignant', 'B': 'Benign', 'M': 'Malignant'}
slice_labels = [diagnosis_names.get(code, code) for code in diagnosis_counts.index]

plt.figure(figsize=(10,6))
sns.set_style("whitegrid")
plt.pie(diagnosis_counts, labels=slice_labels, autopct='%1.2f%%', startangle=90)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title("B x M")
plt.show()

In [None]:
# Bar chart of the number of rows per diagnosis value, with the count on each bar.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='diagnosis', ax=ax)
ax.set_title('Total B x M cells')
ax.bar_label(ax.containers[0], label_type='edge')
plt.show()

Visualizing the categorical data

In [None]:
# Split the rows by encoded diagnosis: 1 is plotted as malignant, 0 as benign.
M = df[df['diagnosis'] == 1]
B = df[df['diagnosis'] == 0]

# Overlay both groups on one radius_mean vs texture_mean scatter plot.
fig, ax = plt.subplots()
for subset, group_name in ((M, "Malignant"), (B, "Benign")):
    ax.scatter(subset['radius_mean'], subset['texture_mean'], label=group_name, alpha=0.3)

ax.set_xlabel("radius_mean")
ax.set_ylabel("texture_mean")
ax.legend()
plt.show()

scatter plot for texture_mean against radius_mean for cancer types

In [None]:
# Mean radius_mean and texture_mean for each diagnosis group.
df.groupby('diagnosis')[['radius_mean','texture_mean']].mean()

On average, malignant cells have a larger mean radius than benign cells.

In [None]:
# Correlation heatmap of the features.
#
# 'Unnamed: 32' is an all-NaN column and 'id' is presumably just a row
# identifier, so both are left out of the matrix: they would only add a blank
# or meaningless row/column. errors='ignore' keeps this cell working whether
# or not those columns have already been dropped from `df`.
corr_matrix = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore').corr()

plt.figure(figsize=(20,20))
# fmt='.2f' keeps each annotation short enough to fit inside its cell.
sns.heatmap(corr_matrix, cbar=True, annot=True, fmt='.2f', cmap='Blues')
plt.show()

Heat map for the features in the data set

In [None]:
# Distribution of radius_mean with a KDE curve overlaid.
# kde takes a boolean; the trailing ';' hides the Axes repr in the cell output.
sns.histplot(df['radius_mean'], color='#0000ff', kde=True);

In [None]:
# One histogram per column of the frame.
# The trailing ';' suppresses the array-of-Axes repr so only the figure is shown.
df.hist(bins=30, figsize=(20, 15), color='#0000ff');

In [None]:
# Pairwise correlation between the columns of the frame, shown as a table.
df.corr()

describing the data set

In [None]:
# Dimensions of the frame at this point, as (rows, columns).
df.shape

Dropping both the "Unnamed: 32" and "id" columns before building our model

In [None]:
# Remove the two columns that are not used as model features.
# Both are dropped in a single call, and errors="ignore" makes the cell safe to
# re-run: once the columns are gone, a second run is a no-op instead of a KeyError.
df = df.drop(columns=["Unnamed: 32", "id"], errors="ignore")

In [None]:
# Dimensions of the frame after the column drop, as (rows, columns).
df.shape

After dropping the "Unnamed: 32" and "id" columns

In [None]:
# Preview the first rows of the frame that will be used for modelling.
df.head()

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='diagnosis')
# Target as a 1-D Series (single brackets). A one-column DataFrame would be a
# column vector, which scikit-learn estimators warn about when fitting.
y = df['diagnosis']

Separating the input features (X) and the target variable (y) from the original DataFrame.

In [None]:
# Hold out 33% of the rows for validation. The split is stratified on y and
# uses a fixed random_state so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=43,
    stratify=y,
)

Train test split

In [None]:
# Registry of results: maps a model name to its validation accuracy.
model_dict = {}

In [None]:
# Logistic Regression
#
# Standardise the features first: the default solver is sensitive to feature
# scale and can stop at max_iter without converging on raw, differently-scaled
# columns. The scaler is fitted on the training split only and then applied to
# the validation split, so no validation statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

classifier = LogisticRegression(random_state=42)
# np.ravel gives fit() a 1-D target whether y_train is a Series or a
# one-column DataFrame.
predictor = classifier.fit(X_train_scaled, np.ravel(y_train))
y_pred = predictor.predict(X_val_scaled)

# Record the validation accuracy under the model's name and show it.
accuracy_log_reg = accuracy_score(y_val, y_pred)
model_dict['logistic_regression'] = accuracy_log_reg
print(accuracy_log_reg)

Applying the Logistic Regression for the data

In [None]:
# Display the name -> accuracy mapping collected so far.
model_dict

classifier name and the accuracy

In [None]:
# Tabulate the collected results: one row per model, with its name and accuracy.
model_accuracies_df = pd.DataFrame(
    list(model_dict.items()),
    columns=['Model', 'Accuracy'],
)

Getting the Model and Accuracy values to model_accuracies_df

In [None]:
# Display the model/accuracy table.
model_accuracies_df

In [None]:
# Show the table ordered from highest to lowest accuracy.
# The sorted result is only displayed; model_accuracies_df itself is not reassigned.
model_accuracies_df.sort_values(by = "Accuracy", ascending=False)

In [None]:
# Create a bar plot of the model accuracies
# Materialise the dict views once so the bars and their text labels are
# guaranteed to use the same ordering.
model_names = list(model_dict.keys())
model_scores = list(model_dict.values())

plt.figure(figsize=(6, 4))
# A visible bar width: the previous 0.0000002 drew bars too thin to see.
plt.bar(model_names, model_scores, width=0.4)
plt.title('Model Accuracies')
plt.xlabel('Model')
plt.ylabel('Accuracy')
# Accuracy is at most 1.0; the extra headroom leaves space for the value labels.
plt.ylim((0.0, 1.1))
plt.xticks(rotation=45, ha='right')
# Write each accuracy just above its bar (categorical bars sit at x = 0, 1, ...).
for i, v in enumerate(model_scores):
    plt.text(i, v + 0.01, str(round(v, 3)), ha='center', va='bottom')
plt.show()
