In [1]:
""" IMPORTS """
import pandas as pd
import numpy as np
import math
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot
from sklearn.metrics import accuracy_score
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_excel("./datasetTest_2features.xlsx")

------

DATA CLEANING   -   FUNCTIONS

Removing outliers

In [14]:
"""
    For each column, it first computes the Z-score of each value in the column, relative to the column mean and standard deviation.
    It then takes the absolute Z-score because the direction does not matter, only if it is below the threshold.
    all(axis=1) ensures that for each row, all column satisfy the constraint.
    Finally, the result of this condition is used to index the dataframe
"""
def remove_outliers_v1(df):
    return df[(np.abs(stats.zscore(df)) < 3).all(axis=1)]

def remove_outliers_v2(df):
    for column in df:
        if(column == 0):
            pass
        else:
            q_low = df[column].quantile(0.05)
            q_high  = df[column].quantile(0.95)
            df_ = df[(df[column] > q_low) & (df[column] < q_high)]
    return df_

Removing duplicates

Fill missing values with imputation

In [33]:
def imputing_miss_values(df):
    # print total missing
    print('Missing: %d' % sum(np.isnan(df.values).flatten()))
    # define imputer
    imputer = SimpleImputer(strategy='mean')
    # fit on the dataset
    imputer.fit(df)
    # transform the dataset
    df_ = imputer.transform(df)
    # print total missing
    print('Missing after imputation: %d' % sum(np.isnan(df_).flatten()))
    df_ = pd.DataFrame(df_)
    return df_

--------

NORMALIZATION

In [74]:
# Using The maximum absolute scaling
# The maximum absolute scaling rescales each feature between -1 and 1
def max_scaling(X):
    X_max_scaled = X.copy()
    for column in X:
        X_max_scaled[column] = X_max_scaled[column] / X_max_scaled[column].abs().max()
    return X_max_scaled

# Using The min-max feature scaling
# The min-max approach (often called normalization) rescales the feature to a hard and fast range of [0,1]
def min_max_scaling(X):
    X_min_max_scaled = X.copy()
    for column in X:
        X_min_max_scaled[column] = (X_min_max_scaled[column] - X_min_max_scaled[column].min()) / (X_min_max_scaled[column].max() - X_min_max_scaled[column].min())
    return X_min_max_scaled


# Using The z-score method
# The z-score method (often called standardization) transforms the info into distribution with a mean of 0 and a typical deviation of 1
def z_scaling(X):
    X_z_scaled = X.copy()
    for column in X:
        X_z_scaled[column] = (X_z_scaled[column] - X_z_scaled[column].mean()) / X_z_scaled[column].std()
    return X_z_scaled

def get_train_mean(X_train):
    train_mean_list = []
    for column in X_train:
        train_mean_list.append(X_train[column].mean())
    return train_mean_list

def get_train_std(X_train):
    train_std_list = []
    for column in X_train:
        train_std_list.append(X_train[column].std())
    return train_std_list

def scaling(X_train, X_test, scaler): # Standard or MinMax
    # Get scaling parameters with the train sample exclusively, using the Scaler.fit() function
    scaler.fit(X_train)
    # Scale data using Scaler.transform()
    X_train_scaled = pd.DataFrame(scaler.transform(X_train))
    X_test_scaled = pd.DataFrame(scaler.transform(X_test))
    return X_train_scaled, X_test_scaled

----

DATA PREPARATION - APPLICATION

In [47]:
df = dataset.copy()
column = df['action_taken'].replace(3, 0)
df['action_taken'] = column.values

# features to consider removing
considered_features = []
for column in df:
    considered_features.append(column)

DATA CLEANING - APPLICATION

In [48]:
""" Removing outliers """
df_no_outliers = remove_outliers_v2(df)

""" Removing duplicates """
df_no_outliers_and_duplicates = df_no_outliers.drop_duplicates()

""" Counting nan """
df_no_outliers_and_duplicates['action_taken'].isnull().sum(axis = 0)

0

In [75]:
""" Imputing missing values """
df_ = imputing_miss_values(df_no_outliers_and_duplicates)

Missing: 19092
Missing after imputation: 0


In [76]:
""" Spliting dataset """
# define dataset
X, y = df_.drop(columns=0), df_[[0]]
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [77]:
# https://www.datatrigger.org/post/scaling/

""" Normalizing data """
# Instantiate a Scaler --> Standard or MinMaxScaler()
scaler = StandardScaler()
X_train_scaled, X_test_scaled = scaling(X_train, X_test, scaler)

----------

1

https://machinelearningmastery.com/implement-logistic-regression-stochastic-gradient-descent-scratch-python/

Making Predictions

In [94]:
np.random.seed(1)

to_add = y_train.values
dataset = X_train_scaled.copy()
dataset['action_taken'] = to_add
dataset = dataset.values.tolist()

mu, sigma = 0, 1 # mean and standard deviation
s = np.random.normal(mu, sigma, 13)

In [95]:
""" Logistic Regression from scratch """

# Make a prediction with coefficients
def predict(row, coefficients):
	yhat = coefficients[0]
	for i in range(len(row)-1):
		yhat += coefficients[i + 1] * row[i]
	return 1.0 / (1.0 + math.exp(-yhat))

In [96]:
coef = s.tolist()
good_pred, bad_pred = 0, 0
for row in dataset:
	yhat = predict(row, coef)
	if(row[-1] == round(yhat)):
		good_pred += 1
	else:
		bad_pred += 1
print("Good pred: {}".format(good_pred))
print("Bad pred: {}".format(bad_pred))
print("Good pred = {} %".format(round(good_pred/(good_pred+bad_pred), 2)))

Good pred: 129796
Bad pred: 62943
Good pred = 0.67 %


Estimating Coefficients

In [97]:
# Estimate logistic regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
	coef = [0.0 for i in range(len(train[0]))]
	for epoch in range(n_epoch):
		sum_error = 0
		for row in train:
			yhat = predict(row, coef)
			error = row[-1] - yhat
			sum_error += error**2
			coef[0] = coef[0] + l_rate * error * yhat * (1.0 - yhat)
			for i in range(len(row)-1):
				coef[i + 1] = coef[i + 1] + l_rate * error * yhat * (1.0 - yhat) * row[i]
		print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
	return coef

In [98]:
# Calculate coefficients
l_rate = 0.3
n_epoch = 100
coef = coefficients_sgd(dataset, l_rate, n_epoch)
print(coef)

>epoch=0, lrate=0.300, error=24193.779
>epoch=1, lrate=0.300, error=24080.785
>epoch=2, lrate=0.300, error=24090.939
>epoch=3, lrate=0.300, error=24093.168
>epoch=4, lrate=0.300, error=24090.192
>epoch=5, lrate=0.300, error=24129.493
>epoch=6, lrate=0.300, error=24091.614
>epoch=7, lrate=0.300, error=24138.705
>epoch=8, lrate=0.300, error=24086.164
>epoch=9, lrate=0.300, error=24139.839
>epoch=10, lrate=0.300, error=24088.051
>epoch=11, lrate=0.300, error=24090.763
>epoch=12, lrate=0.300, error=24089.714
>epoch=13, lrate=0.300, error=24090.680
>epoch=14, lrate=0.300, error=24090.663
>epoch=15, lrate=0.300, error=24102.695
>epoch=16, lrate=0.300, error=24088.665
>epoch=17, lrate=0.300, error=24090.320
>epoch=18, lrate=0.300, error=24095.957
>epoch=19, lrate=0.300, error=24089.305
>epoch=20, lrate=0.300, error=24087.304
>epoch=21, lrate=0.300, error=24087.945
>epoch=22, lrate=0.300, error=24106.731
>epoch=23, lrate=0.300, error=24102.185
>epoch=24, lrate=0.300, error=24089.820
>epoch=25,

In [99]:
from random import randrange
# Find the min and max values for each column
def dataset_minmax(dataset):
	minmax = list()
	for i in range(len(dataset[0])):
		col_values = [row[i] for row in dataset]
		value_min = min(col_values)
		value_max = max(col_values)
		minmax.append([value_min, value_max])
	return minmax

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
	for row in dataset:
		for i in range(len(row)):
			row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = list()
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for i in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in fold]
		accuracy = accuracy_metric(actual, predicted)
		scores.append(accuracy)
	return scores

# Make a prediction with coefficients
def predict(row, coefficients):
	yhat = coefficients[0]
	for i in range(len(row)-1):
		yhat += coefficients[i + 1] * row[i]
	return 1.0 / (1.0 + math.exp(-yhat))
 
# Estimate logistic regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
	coef = [0.0 for i in range(len(train[0]))]
	for epoch in range(n_epoch):
		for row in train:
			yhat = predict(row, coef)
			error = row[-1] - yhat
			coef[0] = coef[0] + l_rate * error * yhat * (1.0 - yhat)
			for i in range(len(row)-1):
				coef[i + 1] = coef[i + 1] + l_rate * error * yhat * (1.0 - yhat) * row[i]
	return coef
 
# Linear Regression Algorithm With Stochastic Gradient Descent
def logistic_regression(train, test, l_rate, n_epoch):
	predictions = list()
	coef = coefficients_sgd(train, l_rate, n_epoch)
	for row in test:
		yhat = predict(row, coef)
		yhat = round(yhat)
		predictions.append(yhat)
	return(predictions)

In [100]:
# normalize
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
# evaluate algorithm
n_folds = 5
l_rate = 0.1
n_epoch = 100
scores = evaluate_algorithm(dataset, logistic_regression, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [80.46540586816094, 81.08542817858718, 81.18400913170935, 80.96090486938023, 80.59252341297636]
Mean Accuracy: 80.858%


------

2

In [104]:
# fit the model
model = LogisticRegression(solver='liblinear')
model.fit(X_train_scaled, y_train.values.ravel())   #.values will give the values in a numpy array (shape: (n,1)) 
                                                    #.ravel will convert that array shape to (n, ) (i.e. flatten it)
# evaluate the model
yhat = model.predict(X_test_scaled)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 81.26


3

In [105]:
# get a list of models to evaluate
def get_models():
	models = dict()
	for p in [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0]:
		# create name for model
		key = '%.4f' % p
		# turn off penalty in some cases
		if p == 0.0:
			# no penalty in this case
			models[key] = LogisticRegression(solver='lbfgs', penalty='none')
		else:
			models[key] = LogisticRegression(solver='lbfgs', penalty='l2', C=p)
	return models

# evaluate a give model using cross-validation
def evaluate_model(model, X, y):
	# define the evaluation procedure
	cv = RepeatedStratifiedKFold(n_splits=12, n_repeats=3, random_state=1)
	# evaluate the model
	scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
	return scores

# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
	# evaluate the model and collect the scores
	scores = evaluate_model(model, X_train_scaled, y_train)
	# store the results
	results.append(scores)
	names.append(name)
	# summarize progress along the way
	print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
# pyplot.boxplot(results, labels=names, showmeans=True)
# pyplot.show()

>0.0000 0.813 (0.003)
>0.0001 0.763 (0.001)
>0.0010 0.806 (0.003)
>0.0100 0.815 (0.003)
>0.1000 0.814 (0.003)
>1.0000 0.813 (0.003)
