# **The goal here is to test the written loader, preprocessor and visualizer functions**
# Loaders
* text data loader
* pre-trained word embedding loader

In [None]:
# mount Google Drive inside the Colab runtime at /content/drive so that the
# project files stored on Drive can be reached through the file system
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install lazypredict

In [None]:
# change the working directory to the project's modelling folder on Drive so that
# the `utilities` package imports and the relative `./data/...` paths used in this
# notebook resolve
%cd /content/drive/MyDrive/Colab\ Notebooks/project-seraphim/server-side/modelling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from utilities.loaders import \
    load_corpus, \
    get_chars, \
    load_lookup_array, \
    save_lookup_array, \
    load_meta_data, \
    save_meta_data, \
    construct_embedding_dict, \
    construct_embedding_matrix \

from utilities.preprocessors import \
    map_value_to_index, \
    remove_contractions, \
    rem_non_alpha_num, \
    capitalize, \
    filter_valid, \
    partition_corpus, \
    rem_stop_words, \
    stem_corpus_words, \
    lemmatize_corpus_words, \
    string_list_to_list, \
    flatten_series_of_lists, \
    sentences_to_avgs, \
    normalize_ratings, \
    normalize_rating_matrix, \
    normalize_train_cross

from utilities.visualizers import \
    plot_train_cross_features, \
    analyze, \
    view_words, \
    data_split_metric_values, \
    view_value_frequency, \
    multi_class_heatmap, \
    view_metric_values, \
    view_classified_labels, \
    view_label_freq, \
    describe_col, \
    visualize_graph, \
    plot_evolution, \
    view_clusters_3d, \
    ModelResults

# enable IPython's autoreload extension in mode 2 so that edits made to the imported
# modules (e.g. the utilities package) are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# Regression

## Ensemble Modelling

In [None]:
import lazypredict
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Pipeline is akin to a Sequential class in tf where architecture of model is defined
from sklearn.pipeline import Pipeline

In [None]:
# the housing file has no header row, so pandas labels its columns 0, 1, 2, ...
cal_housing = pd.read_csv('./data/cal_housing.data', sep=',', header=None)

# the first eight columns are the inputs, the ninth column is the target
X = cal_housing.iloc[:, :8].to_numpy()
Y = cal_housing.iloc[:, 8].to_numpy()

# hold out 30% of the rows for cross validation, then normalize both splits
X_trains, X_cross, Y_trains, Y_cross = train_test_split(X, Y, test_size=0.3, random_state=0)
X_trains, X_cross = normalize_train_cross(X_trains, X_cross)

In [None]:
# LazyRegressor fits a collection of regressors in one call and reports their metrics.
# NOTE(review): fit() is called positionally here, presumably as
# (X_train, X_test, y_train, y_test) -- confirm against the lazypredict documentation.
clf = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
# first pass: the training split is passed as both the train and the evaluation data
models_train, predictions_train = clf.fit(X_trains, X_trains, Y_trains, Y_trains)
# second pass: the same training split is used for fitting, the cross validation split for evaluation
models_test, predictions_test = clf.fit(X_trains, X_cross, Y_trains, Y_cross)

#### The `clf.fit()` method fits many different models and returns a dataframe that lists every trained model together with its metric values (Adjusted R-squared, R-squared, RMSE, etc.). Individual models can then be looked up easily by indexing the dataframe with the model's name.

In [None]:
# display the model comparison table obtained from the training-split evaluation
models_train

In [None]:
# select the row of metric values for a single model by its name in the index
models_train.loc['XGBRegressor', :]

## Individual Modelling

### Linear Regression

In [None]:
# building blocks of the regression pipeline:
# degree-2 polynomial feature expansion (no bias column), standardization, linear regression
poly = PolynomialFeatures(degree=2, include_bias=False)
scaler = StandardScaler()
model = LinearRegression()

# chain the steps so that each step's output feeds the next one, in the listed order
poly_model = Pipeline([
    ("engineered features", poly),
    ("input normalizer|standardizer", scaler),
    ("linear regression model", model),
])

# fit the whole pipeline on the training split
poly_model.fit(X_trains, Y_trains)

# Classification

## Ensemble Modelling

In [None]:
from lazypredict.Supervised import LazyClassifier

## Individual Modelling

### Logistic Regression

### Decision Trees, Random Forests, Extreme Gradient Boosting

### Naive Bayes

### Support Vector Machine

# NLP

# Clustering

In [None]:
from utilities.visualizers import plot_evolution
from sklearn.cluster import DBSCAN, KMeans

In [None]:
# read unlabeled data
df = pd.read_csv('./data/E-commerce.csv')

# drop ID and profile information column column
df.drop(columns=['ID', 'profile_information'], inplace=True)
X = df.to_numpy()
K = 3
epochs = 300

In [None]:
# preview only the first rows instead of dumping the entire dataframe into the output
df.head()

In [None]:
# dimensions of the feature matrix built from df
X.shape

In [None]:
# centroids after capping K-means at a single iteration (fixed seed for repeatability)
k_means = KMeans(n_clusters=K, max_iter=1, random_state=0).fit(X)
print(k_means.cluster_centers_)

In [None]:
# centroids after capping K-means at two iterations, using the same seed
k_means = KMeans(n_clusters=K, max_iter=2, random_state=0).fit(X)
print(k_means.cluster_centers_)

In [None]:
# same configuration as the earlier max_iter=1 cell (same K, same seed), fitted again
k_means = KMeans(n_clusters=K, max_iter=1, random_state=0).fit(X)
print(k_means.cluster_centers_)

In [None]:
# Refit K-means from scratch with an iteration cap of 1, 2, ..., epochs and record the
# centroids obtained under each cap. The seed is fixed, so every refit is configured
# identically apart from the cap. The cap comes from the `epochs` setting instead of a
# hard-coded 300 so the two cannot drift apart.
# After the loop, `k_means` holds the model fitted with the largest cap.
prev_centroids = []
for epoch in range(1, epochs + 1):
    k_means = KMeans(n_clusters=K, max_iter=epoch, random_state=0)
    k_means.fit(X)
    prev_centroids.append(k_means.cluster_centers_)

# stack the per-cap centroid snapshots into a single array
prev_centroids = np.array(prev_centroids)

In [None]:
# number of centroid snapshots recorded (one per iteration cap)
len(prev_centroids)

In [None]:
# cluster index predicted for every row of X, using the last model fitted in the loop above
xs_centroids = k_means.predict(X)

In [None]:
# distinct cluster indices and how many rows were assigned to each of them
np.unique(xs_centroids, return_counts=True)

In [None]:
# visualize the recorded centroid history together with the final cluster assignments
plot_evolution(
    X,
    K,
    prev_centroids,
    xs_centroids,
    features=['n_clicks', 'n_visits', 'amount_spent'],
    dimension='3d',
)