In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Standard procedures

Some boilerplate:

In [None]:
# Path of the CSV file to load (template placeholder: fill in before running).
dataset_file = ''
# Names assigned to the CSV columns (template placeholder).
column_names = ['cols']
# index_col=0 makes the first column the row index of the frame.
df = pd.read_csv(dataset_file, index_col=0, names=column_names)

size_message = "The dataset size is {}".format(df.shape)
print(size_message)
df.head(5)

# Preprocessing

Dropping columns:

In [None]:
# Columns to remove from the dataset (template placeholder).
columns_to_drop = ['cols']
df = df.drop(columns=columns_to_drop)

N.B.: You should drop columns whose distribution is very similar to that of other columns.

Deleting null (NA) values:

In [None]:
# Report how much data is missing before cleaning.
# isna().sum().sum() counts missing *cells*; isna().any().sum() counts the
# *columns* that contain at least one missing cell.
missing_mask = df.isna()
print("There are {} missing values in {} columns".format(
    missing_mask.sum().sum(), missing_mask.any().sum()))

# dropna() removes every row that contains at least one missing value.
df = df.dropna()
print("There are {} missing values after dropping incomplete rows".format(
    df.isna().sum().sum()))

Ordinal encoding, for when the order (and relative distance) between values needs to be preserved.

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Name of the column to encode (template placeholder).
column_to_transform = 'col_name'
# Encoder that maps each category to an integer code.
oe = OrdinalEncoder(dtype=int)

# The encoder expects a 2-D input, hence the reshape to a single column.
column_as_2d = df[column_to_transform].values.reshape(-1, 1)
df[column_to_transform] = oe.fit_transform(column_as_2d)
df.head()

OneHotEncoding (it is preferred when dealing with non-ordinal attributes):

MinMax scaling, especially useful in clustering

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every column of df with MinMaxScaler (default output range [0, 1]).
scaler = MinMaxScaler()
# fit_transform returns a bare array, so both the column names and the row
# index are restored explicitly. Keeping df's index (it comes from the CSV and
# has gaps after dropna) keeps X_processed aligned row-by-row with df.
X_processed = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

# Plots

Pairplots (collection of scatterplots)

In [None]:
import seaborn as sns
# Grid of pairwise scatterplots for the columns of df.
sns.pairplot(data=df)

Scatterplot

In [None]:
# 'col1' / 'col2' are template placeholders for two columns of df.
# NOTE(review): y_pred must already exist in the kernel; in this notebook it is
# only assigned by the KMeans cell further down, so run that cell first.
sns.scatterplot(data=df, x='col1', y='col2', hue=y_pred)

# General stats info

# Classification

General classifiers list

In [None]:
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# Labels of the models to evaluate; each label is a key of `models` below.
model_lbls = ['dt', 'nb', 'lp', 'svc', 'knn', 'adb', 'rf']

# Hyper-parameter grids explored by the cross-validated grid search.
tuned_param_dt = [{'max_depth': list(range(1, 20))}]
tuned_param_nb = [{
    'var_smoothing': [10, 1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-07, 1e-8, 1e-9, 1e-10],
}]
tuned_param_lp = [{'early_stopping': [True]}]
tuned_param_svc = [
    # The RBF kernel is tuned on gamma and C ...
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    # ... while the linear kernel is tuned on C only.
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]
tuned_param_knn = [{'n_neighbors': list(range(1, 11))}]
tuned_param_adb = [{
    'n_estimators': [20, 30, 40, 50],
    'learning_rate': [0.5, 0.75, 1, 1.25, 1.5],
}]
tuned_param_rf = [{
    'max_depth': list(range(5, 15)),
    'n_estimators': list(range(10, 100, 10)),
}]

# One row per model: (label, display name, estimator instance, parameter grid).
_model_specs = [
    ('dt', 'Decision Tree       ', DecisionTreeClassifier(), tuned_param_dt),
    ('nb', 'Gaussian Naive Bayes', GaussianNB(), tuned_param_nb),
    ('lp', 'Linear Perceptron   ', Perceptron(), tuned_param_lp),
    ('svc', 'Support Vector      ', SVC(), tuned_param_svc),
    ('knn', 'K Nearest Neighbor ', KNeighborsClassifier(), tuned_param_knn),
    ('adb', 'AdaBoost           ', AdaBoostClassifier(), tuned_param_adb),
    ('rf', 'Random forest       ', RandomForestClassifier(), tuned_param_rf),
]

# Lookup table used by the tuning loop: label -> name / estimator / grid.
models = {
    label: {'name': display_name, 'estimator': estimator, 'param': grid}
    for label, display_name, estimator, grid in _model_specs
}

# Metrics to optimise; the tuning loop appends '_macro' to each of them.
scores = ['precision', 'recall']

Hyperparameter tuning using cross-validation (on the previously defined estimators)

In [None]:
from sklearn.model_selection import GridSearchCV

# Runs a 5-fold grid search for every model in `model_lbls`, once per metric
# in `scores`, and prints a per-metric summary of the best CV scores.
# NOTE(review): `print_results` is defined in the cell below this one, and
# X_train / y_train are not created in this notebook; all of them must exist
# in the kernel before this cell is run.
results_short = {}

for score in scores:
    # Start from an empty summary so it only holds results for this metric.
    results_short = {}

    print('='*40)
    print("# Tuning hyper-parameters for %s" % score)
    print()

    for m in model_lbls:
        print('-'*40)
        print("Trying model {}".format(models[m]['name']))

        # '%s_macro' % score is a string formatting expression: the value
        # after % replaces the %s placeholder, e.g. 'precision_macro'.
        clf = GridSearchCV(models[m]['estimator'], models[m]['param'], cv=5,
                           scoring='%s_macro' % score,
                           return_train_score=False,
                           n_jobs=2,  # this allows using multi-cores
                           )

        clf.fit(X_train, y_train)
        print_results(clf)
        results_short[m] = clf.best_score_

    print("Summary of results for {}".format(score))
    print("Estimator")
    for m, best_score in results_short.items():
        # best_score is a fraction (macro precision/recall); the '%' format
        # type multiplies by 100, so 0.9534 is shown as 95.34%.
        print("{}\t - score: {:.2%}".format(models[m]['name'], best_score))

Print results

In [None]:
def print_results(model):
    """Print a textual summary of a fitted parameter search.

    The summary contains the best parameter combination, the mean score and
    twice its standard deviation for every combination in ``cv_results_``,
    and the classification report of ``model.predict`` on the test data.

    Parameters
    ----------
    model : object
        Fitted search exposing ``best_params_``, ``cv_results_`` and
        ``predict`` (the tuning loop passes its ``GridSearchCV`` instance).

    Notes
    -----
    ``X_test``, ``y_test`` and ``classification_report`` are read from the
    notebook's global namespace.
    """
    def emit(content):
        # Print one item followed by an empty separator line.
        print(content)
        print()

    emit("Best parameters set found on train set:")
    # if best is linear there is no gamma parameter
    emit(model.best_params_)
    emit("Grid scores on train set:")

    cv = model.cv_results_
    score_rows = zip(cv['mean_test_score'], cv['std_test_score'], cv['params'])
    for avg, spread, combo in score_rows:
        print("%0.3f (+/-%0.03f) for %r" % (avg, spread * 2, combo))
    print()

    emit("Detailed classification report for the best parameter set:")
    print("The model is trained on the full train set.")
    print("The scores are computed on the full test set.")
    print()

    y_true = y_test
    y_pred = model.predict(X_test)
    emit(classification_report(y_true, y_pred))

# Clustering

### Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score

# Numbers of clusters to try.
k_range = list(range(2, 11))

# Every combination of cluster count and linkage criterion is evaluated.
parameters = [{
    'n_clusters': k_range,
    'linkage': ['ward', 'complete', 'average', 'single'],
}]
pg = list(ParameterGrid(parameters))

# One [linkage, n_clusters, silhouette] row per combination.
result_ac = []
for combo in pg:
    ac = AgglomerativeClustering(**combo)
    y_ac = ac.fit_predict(df)
    sil_ac = silhouette_score(df, y_ac)
    result_ac.append([combo['linkage'], combo['n_clusters'], sil_ac])

# Dataframe with the results, best silhouette first
df_result_ac = pd.DataFrame(
    data=result_ac,
    columns=['linkage', 'n_clusters', 'silhouette_score'],
)
df_result_ac.sort_values(by='silhouette_score', ascending=False).head(5)

### DBSCAN method

In [None]:
from sklearn.cluster import DBSCAN

# Grid search over DBSCAN's eps / min_samples, scored with the silhouette of
# the clustered (non-noise) points.
# NOTE(review): assumes X is a 2-D NumPy array of features (the X[mask, :]
# indexing below does not work on a DataFrame); X is not created in this
# notebook. ParameterGrid and silhouette_score come from the imports of the
# agglomerative-clustering cell.
min_points = 2*X.shape[1]

param_grid = {
    'eps': list(np.arange(0.01, 1, 0.01)),
    # DBSCAN needs min_samples >= 1, so the lower bound is clamped for
    # datasets with very few features.
    'min_samples': list(range(max(1, min_points-3), min_points+3, 1)),
}
params = list(ParameterGrid(param_grid))

# The column list must match the five values stored per run below.
dbscan_columns = ['eps', 'min_samples', 'n_clusters', 'silhouette', 'unclust%']
dbscan_rows = []

for i in range(len(params)):
    db = DBSCAN(**(params[i]))

    y_db = db.fit_predict(X)
    cluster_labels_all = np.unique(y_db)
    # Label -1 marks noise points, so it is not counted as a cluster.
    n_clusters = len(cluster_labels_all[cluster_labels_all != -1])

    # The silhouette is only defined when there are at least two clusters.
    if n_clusters > 1:
        clustered = y_db != -1
        X_cl = X[clustered, :]
        y_db_cl = y_db[clustered]
        silhouette = silhouette_score(X_cl, y_db_cl)
        # Percentage of points left unclustered (labelled as noise).
        uncl_p = (1 - y_db_cl.shape[0]/y_db.shape[0]) * 100
        dbscan_rows.append([db.eps, db.min_samples, n_clusters, silhouette, uncl_p])

# Build the frame once instead of growing it row by row.
dbscan_out = pd.DataFrame(dbscan_rows, columns=dbscan_columns)

dbscan_out.sort_values(by=['silhouette'], ascending=False).head(10)

Cluster centers (centroid)

In [None]:
# Centroid of every cluster, computed as the mean of its member points.
# NOTE: y_db, cluster_labels_all and n_clusters are whatever the last DBSCAN
# fit of the previous cell left in the kernel.
cluster_labels = cluster_labels_all[cluster_labels_all != -1]  # drop the noise label

n_features = X.shape[1]
cluster_centers = np.empty((n_clusters, n_features))
for label in cluster_labels:
    members = X[y_db == label, :]
    # The cluster label doubles as the row index of its centroid.
    cluster_centers[label, :] = members.mean(axis=0)

### KMeans

In [None]:
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_score

# Fit KMeans for every k in k_range and record silhouette score and inertia.
k_range = list(range(2, 11))  # set the range of k values to test
kmeans_rows = []

for k_ in k_range:
    # A fixed random_state makes the centroid initialisation, and therefore
    # the recorded scores, reproducible across kernel restarts.
    estimator = KMeans(n_clusters=k_, random_state=42)
    y_pred = estimator.fit_predict(df)
    kmeans_rows.append([k_, silhouette_score(df, y_pred), estimator.inertia_])

# `results` is only ever a DataFrame; the raw rows live in kmeans_rows.
results = pd.DataFrame(data=kmeans_rows, columns=['n_clusters', 'sil_score', 'inertia'])
results

Plot inertias in KMeans, useful for elbow method

In [None]:
# Inertia (left axis, red) and silhouette score (right axis, blue) against
# the number of clusters, drawn on two y-axes that share the same x-axis.
fig, ax = plt.subplots()
ax2 = ax.twinx()
ax.set_xlabel('Number of clusters')

curve_specs = (
    (ax, 'inertia', 'red', 'Inertias'),
    (ax2, 'sil_score', 'blue', 'Silhouette scores'),
)
for axis, column, colour, label in curve_specs:
    axis.plot(k_range, results[column], color=colour)
    axis.set_ylabel(label, color=colour)

# Fix the right-hand axis to the range 0..1.
ax2.set_ylim(0, 1)

plt.show()

# Association rules

First way to format the transaction list into an items table

In [None]:
# One-hot encode every item column. With an empty prefix the dummy columns are
# named after the item itself, so the same item appearing in several source
# columns produces several columns with the same name.
df0 = pd.get_dummies(df, prefix='', prefix_sep='', dummy_na=False)
df1 = df0.drop(columns=['Item(s)'])  # We drop the items column, as we don't need that information.

# IMPORTANT: merge the duplicated item columns by summing the columns that
# share a name. Grouping the transposed frame replaces the deprecated
# groupby(level=0, axis=1) form, as recommended by the pandas deprecation note.
df1 = df1.T.groupby(level=0).sum().T

Second way:

In [None]:
# Total quantity of every product within every invoice.
quantity_per_item = df.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()

# One row per invoice and one column per product; missing pairs become 0.
invoice_table = quantity_per_item.unstack().reset_index().fillna(0)
basket = invoice_table.set_index('InvoiceNo')
basket

Apriori computation, find the value of min_support

In [None]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Lower min_support step by step until enough multi-item frequent itemsets
# are found.
min_itemsets = 8  # required number of frequent itemsets with at least 2 items
min_supports = np.arange(0.20, 0.01, step=-0.01)
selected_sup = 0

for sup in min_supports:
    frequent_itemsets = apriori(df, min_support=sup, use_colnames=True)

    # Count the frequent itemsets that contain two or more items.
    itemsets_above_threshold = sum(len(itemset) >= 2 for itemset in frequent_itemsets.itemsets)

    # Printed before the break so the selected value is reported too, like in
    # the confidence search of the next cell.
    print("min_support: {:0.4f} - number of itemsets with at least 2 items: {}".format(
        sup, itemsets_above_threshold
    ))

    if itemsets_above_threshold >= min_itemsets:
        selected_sup = sup
        break
else:
    # No candidate qualified: fall back to the smallest tested support instead
    # of leaving 0, which is not a valid min_support for the next apriori call.
    selected_sup = float(min_supports[-1])
    print("No tested min_support produced {} itemsets with at least 2 items; "
          "falling back to the smallest tested value".format(min_itemsets))

print("Selected min_support value is: {:0.4f}".format(selected_sup))

In [None]:
# Lower the confidence threshold step by step until at least `min_rules`
# association rules are generated from the frequent itemsets.
min_rules = 10

frequent_itemsets = apriori(df, min_support=selected_sup, use_colnames=True)
min_confidence = np.arange(1, 0.1, step=-0.01)

# Stays 0 when no tested threshold yields enough rules.
selected_min_confidence = 0
for mt in min_confidence:
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=mt)
    n_rules = len(rules)

    print('Metric: "confidence" - min_metric: {:0.4f} - Number of rules: {}'.format(mt, n_rules))

    if n_rules < min_rules:
        continue
    # First (i.e. highest) threshold that produces enough rules.
    selected_min_confidence = mt
    break

print("Selected confidence value is: {:0.4f}".format(selected_min_confidence))
