In [None]:
# !python --version
# !pip install

In [None]:
import sklearn
import sklearn.datasets
# breast_data = sklearn.datasets.load_breast_cancer()

In [None]:
import pandas as pd
# data = pd.DataFrame(data=breast_data.data, columns=breast_data.feature_names)
# data['target'] = breast_data.target

# data.to_csv('breast_data.csv', sep=';', encoding='utf8',index=None)


# Load the dataset from a local CSV into a DataFrame.
# NOTE(review): the commented-out export above writes 'breast_data.csv' with
# sep=';', whereas it is read here with sep=','. If the file was produced by
# that export, the columns will not be split as intended — confirm the
# delimiter actually used in the file.
data = pd.read_csv('breast_data.csv', sep=',', encoding='utf8')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
data.info()

In [None]:
# Boolean mask of missing cells, same shape as `data`.
nans = data.isna()
# Summing the mask along axis 0 gives the number of missing values per column.
n_nans = nans.sum(axis = 0)
print(n_nans)

In [None]:
# Name of the label column; every other column is treated as a feature.
target = 'target'

feature_names = data.columns.tolist()
feature_names.remove(target)  # raises ValueError if the label column is absent

# Label name, feature list and feature count, one per line.
print(target, feature_names, len(feature_names), sep='\n')

In [None]:
# Distinct label values present in the target column.
classes = data[target].unique()
print(classes)

In [None]:
# Class balance as a pie chart.
class_counts = data[target].value_counts()

# `explode` needs exactly one offset per wedge; derive it from the number of
# classes instead of hard-coding two so the cell works for any label set.
class_counts.plot.pie(explode=[0.1] * len(class_counts),
                      autopct='%1.1f%%',
                      shadow=True,
                      figsize=(4, 4))

In [None]:
import seaborn as sns
# sns.pairplot(data,hue = target)

In [None]:
# Pairwise correlation matrix of the feature columns (label column excluded).
# `x` is reused by the heatmap cell that follows.
x = data[feature_names].corr()
print(x)

In [None]:
# Visualise the feature correlation matrix computed in the previous cell.
sns.heatmap(x)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("darkgrid")

# One histogram per feature for the first ten feature columns, laid out on a
# 5x2 grid and coloured by class label.
fig, ax = plt.subplots(nrows=5, ncols=2, figsize=(12, 18))
features = feature_names[:10]

# `ax.flat` walks the grid row by row, so the i-th feature lands in
# row i // 2, column i % 2.
for panel, feature in zip(ax.flat, features):
    sns.histplot(data=data, x=feature, hue=target, ax=panel)
    panel.set_title(f"Distribution of {feature}")

plt.tight_layout(pad=5.0, w_pad=1.5, h_pad=2.5)
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import seaborn as sns

sns.set_theme()

# PCA picks the directions of largest variance, so columns measured on larger
# numeric scales would dominate the components. Standardise every feature to
# zero mean / unit variance before projecting.
scaled_features = StandardScaler().fit_transform(data[feature_names])

# Project the standardised features onto the first two principal components.
pca = PCA(n_components=2)
components = pca.fit_transform(scaled_features)

# Re-attach the labels so the scatter can be coloured by class.
pca_data = pd.DataFrame(data=components, columns=['PC1', 'PC2'])
pca_data[target] = data[target].values

# lmplot with fit_reg=False is used purely as a hue-aware scatter plot.
sns.lmplot(
    x='PC1',
    y='PC2',
    data=pca_data,
    hue=target,
    fit_reg=False,
    legend=True
)

plt.title('2D PCA Graph')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# Seed both splits so that "Restart & Run All" reproduces the same partitions
# and therefore the same metrics in every cell below.
SPLIT_SEED = 0

# Stage 1: 80% of the rows for training, 20% held out.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    data[feature_names],
    data[target],
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    random_state=SPLIT_SEED,
)

# Stage 2: split the hold-out 60/40 into validation and test sets
# (about 12% and 8% of all rows respectively).
x_valid, x_test, y_valid, y_test = train_test_split(
    x_holdout,
    y_holdout,
    train_size=0.6,
    test_size=0.4,
    shuffle=True,
    random_state=SPLIT_SEED,
)

# Every row must end up in exactly one of the three splits.
assert len(x_train) + len(x_valid) + len(x_test) == len(data)

# Report the size of each split (features and labels should match).
for split_name, split_x, split_y in (
    ('train', x_train, y_train),
    ('valid', x_valid, y_valid),
    ('test', x_test, y_test),
):
    print(f'{split_name}:')
    print(len(split_x))
    print(len(split_y))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Baseline classifier: logistic regression with the liblinear solver,
# fitted on the training split.
clf_reg = LogisticRegression(solver='liblinear', random_state=0)
clf_reg = clf_reg.fit(x_train, y_train)

In [None]:
# Metrics on the training split itself: per-class report, then confusion matrix.
predicted = clf_reg.predict(x_train)
for summarize in (metrics.classification_report, metrics.confusion_matrix):
    print(summarize(y_train, predicted))

In [None]:
def get_pred_stat(clf, train=False):
    """Print a classification report and confusion matrix for a fitted model.

    The validation and test splits are always reported; the training split is
    reported first when ``train`` is truthy. The splits are read from the
    notebook-level variables ``x_train``/``y_train``, ``x_valid``/``y_valid``
    and ``x_test``/``y_test``.

    Args:
        clf: Fitted estimator exposing ``predict``.
        train: When truthy, also report metrics on the training split.
    """
    def _report(banner, features, labels):
        # Banner, per-class report, then confusion matrix for one split.
        print(banner)
        predicted = clf.predict(features)
        print(metrics.classification_report(labels, predicted))
        print(metrics.confusion_matrix(labels, predicted))

    if train:
        _report('---------TRAIN--------', x_train, y_train)
    _report('---------VALID--------', x_valid, y_valid)
    _report('---------TEST--------', x_test, y_test)

In [None]:
# Report validation and test metrics for the liblinear model
# (train=False by default, so the training split is skipped).
get_pred_stat(clf_reg)

In [None]:
# Same model family as clf_reg but with the newton-cg solver, for comparison.
clf_reg_2 = LogisticRegression(solver='newton-cg', random_state=0)
clf_reg_2 = clf_reg_2.fit(x_train, y_train)

get_pred_stat(clf_reg_2)