# Modelling

In [33]:
# Importing Libraries
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from causalnex.structure import DAGClassifier
import pandas as pd
import numpy as np
import sys
from typing import Tuple


In [6]:
# DataFrame Splitter
def split_dataframe(df: pd.DataFrame, percentage: float = 0.2) -> Tuple[pd.DataFrame, pd.DataFrame]:
    try:
        df_size = len(df)
        cut_size = df_size - int(df_size * (1 - percentage))
        train_df = df.iloc[:cut_size, :]
        test_df = df.iloc[cut_size:, :]

        return (train_df, test_df)

    except Exception as e:
        print('Failed to Split Dataframe to Train and Test Segments')
        sys.exit(1)


In [14]:
def separate_x_y(df: pd.DataFrame, target_index:int, start_index: int = 0) -> Tuple[pd.DataFrame, pd.DataFrame]:
    x = df.iloc[:, start_index:]
    y = df.iloc[:, target_index]

    return (x, y)


## Loading and Preparing The Dataset for training

### Dataset One

In [7]:
# MinMax Outliear Removed Norm
data_1 = pd.read_csv('../data/out_removed_minmax_scale.csv')
data_1_train, data_1_test = split_dataframe(data_1, percentage=0.2)

In [10]:
print("Total Data Size:\n\t\t",len(data_1))
print("Train Data Size:\n\t\t", len(data_1_train))
print("Test Data Size:\n\t\t", len(data_1_test))


Total Data Size:
		 483
Train Data Size:
		 97
Test Data Size:
		 386


In [18]:
data_1_train_x, data_1_train_y = separate_x_y(data_1_train, 1, 2)
data_1_test_x, data_1_test_y = separate_x_y(data_1_test, 1, 2)

### Dataset Two

In [19]:
# MinMax Outliear Revaluated Norm
data_2 = pd.read_csv('../data/out_revalued_minmax_scale.csv')
data_2_train, data_2_test = split_dataframe(data_2, percentage=0.2)

In [20]:
print("Total Data Size:\n\t\t", len(data_2))
print("Train Data Size:\n\t\t", len(data_2_train))
print("Test Data Size:\n\t\t", len(data_2_test))


Total Data Size:
		 569
Train Data Size:
		 114
Test Data Size:
		 455


In [21]:
data_2_train_x, data_2_train_y = separate_x_y(data_2_train, 1, 2)
data_2_test_x, data_2_test_y = separate_x_y(data_2_test, 1, 2)


## Modelling Using all Feature

### Training on All Features Using Dataset One Using Simple Logistic Regression

In [29]:
names = data_1_train_x.columns.to_list()

lr_model_d1 = LogisticRegression()
scores = cross_val_score(lr_model_d1, data_1_train_x, data_1_train_y, cv=KFold(shuffle=True, random_state=42))
print(f'MEAN Score: {np.mean(scores).mean():.3f}')


MEAN Score: 0.959


In [30]:
lr_model_d1.fit(data_1_train_x, data_1_train_y)


LogisticRegression()

In [32]:
for i in range(lr_model_d1.coef_.shape[0]):
    print("MEAN EFFECT DIRECTIONAL CLASS {}:".format(i))
    print(pd.Series(lr_model_d1.coef_[i, :], index=names).sort_values(ascending=False))


MEAN EFFECT DIRECTIONAL CLASS 0:
concave points_worst          1.206861
texture_mean                  1.018258
texture_worst                 1.001229
perimeter_worst               0.999552
radius_worst                  0.995210
area_worst                    0.938930
concavity_worst               0.906180
concave points_mean           0.874125
perimeter_mean                0.829652
concavity_mean                0.814758
radius_mean                   0.791623
area_mean                     0.754372
area_se                       0.751480
compactness_worst             0.735918
radius_se                     0.715506
perimeter_se                  0.701816
concavity_dispersion_mean     0.633102
fractal_dimension_worst       0.519871
compactness_mean              0.495338
concave points_se             0.411350
symmetry_worst                0.398810
smoothness_worst              0.380628
concavity_se                  0.254372
concavity_dispersion_worst    0.105912
symmetry_mean                 0

### Training on All Features Using Dataset One Using DAG Classifier

In [36]:
dag_model_d1 = DAGClassifier(
    alpha=0.1,
    beta=0.9,
    hidden_layer_units=[5],
    fit_intercept=True,
    standardize=True
)
scores = cross_val_score(dag_model_d1, data_1_train_x,
                         data_1_train_y, cv=KFold(shuffle=True, random_state=42))
print(f'MEAN Score: {np.mean(scores).mean():.3f}')


MEAN Score: 0.918


In [37]:
dag_model_d1.fit(data_1_train_x, data_1_train_y)


DAGClassifier(alpha=0.1, beta=0.9, hidden_layer_units=[5], standardize=True,
              target_dist_type='bin')

In [38]:
for i in range(lr_model_d1.coef_.shape[0]):
    print("MEAN EFFECT DIRECTIONAL CLASS {}:".format(i))
    print(pd.Series(lr_model_d1.coef_[i, :],
                    index=names).sort_values(ascending=False))


MEAN EFFECT DIRECTIONAL CLASS 0:
concave points_worst          1.206861
texture_mean                  1.018258
texture_worst                 1.001229
perimeter_worst               0.999552
radius_worst                  0.995210
area_worst                    0.938930
concavity_worst               0.906180
concave points_mean           0.874125
perimeter_mean                0.829652
concavity_mean                0.814758
radius_mean                   0.791623
area_mean                     0.754372
area_se                       0.751480
compactness_worst             0.735918
radius_se                     0.715506
perimeter_se                  0.701816
concavity_dispersion_mean     0.633102
fractal_dimension_worst       0.519871
compactness_mean              0.495338
concave points_se             0.411350
symmetry_worst                0.398810
smoothness_worst              0.380628
concavity_se                  0.254372
concavity_dispersion_worst    0.105912
symmetry_mean                 0

## Modelling Using Only Directly connected Features

In [39]:
# Declaring Training Used Features
features = ['radius_mean','radius_se'] # Change here

In [40]:
# Selecting Features from Dataset One
data_1_train_x_feat = data_1_train_x[features]

### Training on Selected Features Using Dataset One Using Simple Logistic Regression

### Training on Selected Features Using Dataset One Using DAG Classifier