# Using tsfresh to extract features from our dataset and classify them with multinomial logistic regression.

## Imports

In [2]:
import warnings
import pandas as pd
import joblib

from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as LR
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score as AUC, accuracy_score as accuracy, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report


from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, cross_validate
import sys
sys.path.insert(0, '../.')
from Utils import *
import pyarrow as pa
import pyarrow.parquet as pq
import json
from IPython.display import clear_output
import os

import re


Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [4]:
# Root data folder (relative to this notebook); every other location hangs off it.
data_dir = "../../../../MCM1_Practicum_Data/"
meta_data_dir = f"{data_dir}Metadata/"
binned_data_dir = f"{data_dir}Pre-Processed_Data/data-ms-aggregates/"
lr_dir = f"{data_dir}Pre-Processed_Data/LR/"



## Data Transformation

In [3]:
# Load the mapping stored in all_ids_dict.json and build its inverse
# (value -> key); the inverse is what transform_data uses to encode video ids.
video_ids_map_fn = meta_data_dir + "all_ids_dict.json"
with open(video_ids_map_fn, "r") as json_file:
    nums_to_vids_map = json.load(json_file)
vids_to_nums_map = dict(zip(nums_to_vids_map.values(), nums_to_vids_map.keys()))

In [5]:
# Question: what is the best bin size to predict a video using the DF feature?
source_data_files = os.listdir(binned_data_dir)  # [0:5]

# Output locations for every artefact derived from the feature of interest.
feature_of_interest = "DF"
feature_of_interest_folder = lr_dir + feature_of_interest + "/"
features_folder = feature_of_interest_folder + "Features/"
train_folder = feature_of_interest_folder + "Train/"
test_folder = feature_of_interest_folder + "Test/"
models_folder = feature_of_interest_folder + "Models/"
evaluation_folder = feature_of_interest_folder + "Evaluation/"

validation_proportion = 0.3

# Create every folder independently. A chain of os.mkdir calls inside one
# try/except stops at the first folder that already exists, leaving the
# remaining ones missing; makedirs(exist_ok=True) has no such problem and
# still surfaces real errors (e.g. permissions).
for folder in (
    feature_of_interest_folder,
    features_folder,
    train_folder,
    test_folder,
    evaluation_folder,
    models_folder,
):
    os.makedirs(folder, exist_ok=True)

# Characters 5..8 of a file name are used as the bin identifier; a bin counts
# as already modelled once an evaluation file carrying that identifier exists.
already_modelled_bins = {fn[5:9] for fn in os.listdir(evaluation_folder)}
bins_to_model = {fn[5:9] for fn in source_data_files} - already_modelled_bins

# Select each source file at most once, by its own bin identifier. A substring
# search over all pending bins could append the same file several times, or
# pick up a file whose bin has already been modelled.
to_model_data_files = [fn for fn in source_data_files if fn[5:9] in bins_to_model]

fdr_level = 0.05
print("To model: ", len(to_model_data_files))


 # config
# for test_fn in to_model_data_files:
#     test_fp = binned_data_dir + test_fn
#     features_file = features_folder+test_fn[:-8]+"-"+feature_of_interest+"-features.csv"
#     train_file =  train_folder+test_fn[:-8]+"-"+feature_of_interest+"-train.csv"
#     test_file =  test_folder+test_fn[:-8]+"-"+feature_of_interest+"-test.csv"
#     df, target = transform_data(test_fp, feature_of_interest)
#     extract_features_from_df(df, target, features_file)
#     select_features_from_file(features_file, train_file, test_file, validation_proportion, fdr_level )
#     train(train_fn, evaluation_folder):
#     evaluate(model, test_file, evaluation_folder, vids_to_nums_map)

To model:  0


In [6]:
def read_parquet(fp):
    """Read the parquet file at ``fp`` and return its contents as a pandas DataFrame."""
    return pq.read_table(fp).to_pandas()

def transform_data(data: "pd.DataFrame | str", feature_of_interest):
    """Reshape the raw records into the long format consumed by tsfresh.

    Parameters
    ----------
    data
        Either a DataFrame holding the columns ``feature_of_interest`` and
        ``"video_id"``, or the path of a parquet file containing such a frame
        (the callers in this notebook pass the file path).
    feature_of_interest
        Name of the column whose per-row sequences are turned into time series.

    Returns
    -------
    (long_df, y)
        ``long_df`` has the columns ``id`` (row number after shuffling),
        ``time`` (position inside the sequence) and ``0`` (the value as float).
        ``y`` is the ``video_id`` column mapped through ``vids_to_nums_map``,
        indexed by the same ``id`` values.
    """
    # Accept a file path as well as an in-memory frame.
    if isinstance(data, (str, os.PathLike)):
        data = read_parquet(data)

    # Explicit copy: the columns are modified below, and assigning into a
    # slice of the caller's frame triggers SettingWithCopyWarning.
    data = data[[feature_of_interest, "video_id"]].copy()
    data["video_id"] = data["video_id"].map(vids_to_nums_map)

    # Bring every sequence to the length of the longest one.
    # NOTE(review): extend_list is defined outside this file; presumably it pads
    # the list with `filling_value` up to `max_length` - confirm.
    output_length = max(data[feature_of_interest].map(len))
    data[feature_of_interest] = data[feature_of_interest].apply(
        lambda array: extend_list(list(array), max_length=output_length, filling_value=1)
    )

    # Shuffle the rows, then use the new row number as the series id.
    data = data.sample(len(data))
    data.reset_index(inplace=True, drop=True)
    data["id"] = data.index
    data.rename({"video_id": "target", feature_of_interest: 0}, axis=1, inplace=True)

    y = data["target"]
    data.drop("target", axis=1, inplace=True)

    # One row per measurement; `time` is the measurement's position in its series.
    exploded_df = data.explode(0)
    exploded_df['time'] = exploded_df.groupby('id').cumcount()
    exploded_df.reset_index(drop=True, inplace=True)

    # Reorder the columns (copy again so the float conversion is warning-free).
    data = exploded_df[['id', 'time', 0]].copy()
    data[0] = data[0].map(float)

    return (data, y)

## Extract Features

In [24]:
def extract_features_from_df(df, target, features_fn):
    """Extract tsfresh features from ``df`` and save them, with labels, as CSV.

    Parameters
    ----------
    df
        Long-format frame with the columns ``id`` and ``time`` plus the value
        column(s), as passed to ``extract_features``.
    target
        Labels stored in the ``y`` column of the output. The assignment aligns
        on index, so ``target`` is expected to be indexed by the ``id`` values.
    features_fn
        Path of the CSV file to write (written without the index).

    The rows are shuffled before saving, and the elapsed time is printed.
    """
    start_time = datetime.now()

    # Feature extraction is noisy; silence its warnings for this call only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        f = extract_features(df, column_id="id", column_sort="time")

    # impute() works in place; verify that nothing is left missing.
    impute(f)
    assert f.isnull().sum().sum() == 0

    f['y'] = target

    # Shuffle so that the later positional train/test split is random.
    f = f.sample(len(f))

    f.to_csv(features_fn, index=False)

    # total_seconds() keeps durations longer than a day intact (timedelta.seconds
    # wraps at 24h); the seconds are zero-padded so "1:05" is not shown as "1:5".
    elapsed = int((datetime.now() - start_time).total_seconds())
    m, s = divmod(elapsed, 60)
    print("Features extracted and saved to: ", features_fn)
    print(f"Duration: {m}:{s:02d}")

## Feature Selection

In [25]:
def select_features_from_file(features_fn, train_fn, test_fn, validation_proportion, fdr_level ):
    """Split the extracted features into train/test and keep the relevant columns.

    Parameters
    ----------
    features_fn
        CSV file holding the extracted features and a ``y`` label column.
    train_fn, test_fn
        Paths of the CSV files to write for the training and test parts.
    validation_proportion
        Fraction of the rows (taken from the end of the file) used as test set.
    fdr_level
        Passed to tsfresh ``select_features`` as its ``fdr_level``.

    Feature selection is run on the training rows only; the test file is
    restricted to the columns selected there. The elapsed time is printed.
    """
    start_time = datetime.now()

    print("loading {}".format(features_fn))
    features = pd.read_csv(features_fn)

    # Positional split: the first (1 - validation_proportion) of the rows train.
    validation_split_i = int(len(features) * (1 - validation_proportion))
    train_part = features.iloc[:validation_split_i]
    test_part = features.iloc[validation_split_i:]

    train_x, train_y = train_part.drop('y', axis=1), train_part.y
    test_x, test_y = test_part.drop('y', axis=1), test_part.y

    print("selecting features...")
    train_features_selected = select_features(train_x, train_y, fdr_level=fdr_level)

    print("selected {} features.".format(len(train_features_selected.columns)))

    train = train_features_selected.copy()
    train['y'] = train_y

    # The test set only keeps the columns chosen on the training data.
    test = test_x[train_features_selected.columns].copy()
    test['y'] = test_y

    print("saving {}".format(train_fn))
    train.to_csv(train_fn, index=False)

    print("saving {}".format(test_fn))
    test.to_csv(test_fn, index=False)

    # total_seconds() does not wrap at 24h; seconds are zero-padded for readability.
    elapsed = int((datetime.now() - start_time).total_seconds())
    m, s = divmod(elapsed, 60)

    print(f"Duration: {m}:{s:02d}")

## Training And Evaluating

In [26]:
def key_encoder(obj):
    """
    Convert a string made only of decimal digits to an ``int``; return anything
    else unchanged.

    Used in this file as the ``default=`` hook of ``json.dump``.

    ``str.isdecimal`` is used rather than ``str.isnumeric`` because the latter
    also accepts characters such as superscripts or vulgar fractions, which
    ``int()`` rejects with a ``ValueError``.
    """
    if isinstance(obj, str) and obj.isdecimal():
        return int(obj)
    return obj

In [27]:
def train(train_fp, evaluation_folder):
    """Cross-validate and fit a multinomial logistic regression on a training CSV.

    Parameters
    ----------
    train_fp
        Path of the training CSV; it must contain the label column ``y``. The
        file name is expected to end in ``-train.csv`` (that suffix is stripped
        to build the output names).
    evaluation_folder
        Folder prefix where ``<name>-eval-train.json`` is written.

    Returns
    -------
    (model, scores)
        The fitted ``LogisticRegression`` and a DataFrame with the mean and
        standard deviation of every cross-validation entry.

    Side effects: the model is saved under the module-level ``models_folder``
    and the cross-validation summary is written to ``evaluation_folder``.

    NOTE: the model is fitted on standardised features, but the scaler itself is
    not persisted alongside it.
    """
    from sklearn.pipeline import make_pipeline

    start_time = datetime.now()
    train_df = pd.read_csv(train_fp)
    x_train = train_df.drop('y', axis=1).values
    y_train = train_df.y.values

    # define the multinomial logistic regression model
    model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000)
    print("Model created.")

    # define the model evaluation procedure
    scoring = ['precision_macro', 'recall_macro', 'accuracy', 'f1_macro']
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    print("Begin validation")
    # The scaler lives inside the cross-validated pipeline so that each fold is
    # standardised with statistics from its own training part only. Scaling the
    # whole matrix up front leaks validation-row statistics into every fold.
    # cross_validate clones the estimator, so `model` itself stays unfitted here.
    train_scores = cross_validate(
        make_pipeline(StandardScaler(), model),
        x_train,
        y_train,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
    )
    print("Validation complete")

    train_scores_dict = {
        key.replace("test_", ""): {"mean": np.mean(values), "std": np.std(values)}
        for key, values in train_scores.items()
    }

    print("Fitting model")
    scaler = StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    model.fit(x_train_scaled, y_train)
    print("Fitting complete")

    # Derive the output names from the path that was actually passed in (not
    # from a global), without requiring a "/Train/" segment in the path.
    train_fn = os.path.basename(train_fp)
    stem = train_fn[:-10]  # strip "-train.csv"
    model_fn = models_folder + stem + "-lr-model.joblib"
    joblib.dump(model, model_fn)
    print("Model saved as: ", model_fn)

    evaluation_train_fn = evaluation_folder + stem + "-eval-train.json"

    with open(evaluation_train_fn, "w") as f:
        json.dump(train_scores_dict, f, indent=4, sort_keys=True, default=key_encoder)

    # total_seconds() does not wrap at 24h; seconds are zero-padded for readability.
    elapsed = int((datetime.now() - start_time).total_seconds())
    m, s = divmod(elapsed, 60)
    print(f"Training complete in: {m}:{s:02d}")
    return model, pd.DataFrame(train_scores_dict)

In [28]:
def evaluate(model, test_fp, evaluation_folder, vids_to_nums_map):
    """Score ``model`` on a test CSV and save the report as JSON.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict``.
    test_fp
        Path of the test CSV; it must contain the label column ``y``. The file
        name is expected to end in ``-test.csv`` (that suffix is stripped to
        build the output name).
    evaluation_folder
        Folder prefix where ``<name>-eval-test.json`` is written.
    vids_to_nums_map
        Kept for interface compatibility; the confusion-matrix labels are taken
        from the labels present in the test set.

    Returns
    -------
    DataFrame
        The classification report, one row per class/average.

    The JSON file additionally contains ``cm``, the row-normalised confusion
    matrix whose rows/columns follow the sorted test labels.
    """
    test = pd.read_csv(test_fp)
    x_test = test.drop('y', axis=1).values
    y_test = test.y.values

    # NOTE(review): the scaler is fitted on the test set itself because the one
    # used during training is not available here; ideally the training scaler
    # would be persisted and reused.
    scaler = StandardScaler().fit(x_test)
    x_test_scaled = scaler.transform(x_test)
    predictions = model.predict(x_test_scaled)
    report = classification_report(y_test, predictions, output_dict=True)

    # `set(a) and set(b)` is a boolean `and`, not an intersection: it evaluated
    # to set(y_test) in arbitrary order (or to an empty set when the map was
    # empty). Use the test labels directly, sorted so that the matrix
    # rows/columns have a reproducible order.
    v_ids = sorted(set(y_test))
    cm = confusion_matrix(y_test, predictions, labels=v_ids)
    # Every label comes from y_test, so each row sum is at least 1.
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    report["cm"] = cm_normalized.tolist()

    # basename works for any path, with or without a "/Test/" segment.
    test_fn = os.path.basename(test_fp)
    evaluation_test_fn = evaluation_folder + test_fn[:-9] + "-eval-test.json"  # strip "-test.csv"
    with open(evaluation_test_fn, "w") as f:
        json.dump(report, f, indent=4, sort_keys=True, default=key_encoder)

    # The matrix does not fit the tabular report returned to the caller.
    del report["cm"]
    return pd.DataFrame(report).T

In [1]:
# Run the full pipeline (transform -> extract -> select -> train -> evaluate)
# once for every source file that has not been modelled yet.
num_models = len(to_model_data_files)
total_num_models = len(source_data_files)
last_model_duration = ""
# Iterate over the pending files themselves: looping over a one-element dummy
# list and indexing by a manual counter processes a single file only and
# raises IndexError when nothing is left to model.
for i, test_fn in enumerate(to_model_data_files):
    print("Last modeling duration: " + last_model_duration)
    start_time = datetime.now()

    print("############################################################################")
    print(f"                              Model {i+1+(total_num_models-num_models)}/{total_num_models} ")
    print("############################################################################")

    test_fp = binned_data_dir + test_fn
    # test_fn[:-8] drops the last eight characters of the source file name.
    features_file = features_folder + test_fn[:-8] + "-" + feature_of_interest + "-features.csv"
    train_file = train_folder + test_fn[:-8] + "-" + feature_of_interest + "-train.csv"
    test_file = test_folder + test_fn[:-8] + "-" + feature_of_interest + "-test.csv"

    # transform_data is annotated to take a DataFrame, so load the parquet file first.
    df, target = transform_data(read_parquet(test_fp), feature_of_interest)
    extract_features_from_df(df, target, features_file)
    select_features_from_file(features_file, train_file, test_file, validation_proportion, fdr_level)

    model, evaluation_train = train(train_file, evaluation_folder)
    display(evaluation_train)

    # Drop the in-memory model and evaluate the copy reloaded from disk.
    del model
    train_fn = os.path.basename(train_file)
    model = loaded_model = joblib.load(models_folder + train_fn[:-10] + "-lr-model.joblib")
    evaluation_test = evaluate(model, test_file, evaluation_folder, vids_to_nums_map)
    display(evaluation_test)

    # total_seconds() does not wrap at 24h; seconds are zero-padded for readability.
    elapsed = int((datetime.now() - start_time).total_seconds())
    m, s = divmod(elapsed, 60)
    last_model_duration = f"{m}:{s:02d}"
    clear_output()

NameError: name 'to_model_data_files' is not defined

***

In [30]:
#v_ids = list(set(vids_to_nums_map.values()) and set(y_test))
#cm = confusion_matrix(y_test, predictions, labels = v_ids )
#cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#plt.figure(figsize=(10, 10)) 
#ax = sns.heatmap(cm_normalized, annot=False, cmap='Blues', xticklabels=v_ids, yticklabels=v_ids)

In [31]:
# # predict probabilities with a multinomial logistic regression model
# from sklearn.datasets import make_classification
# from sklearn.linear_model import LogisticRegression
# # define dataset
# X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, n_classes=3, random_state=1)
# # define the multinomial logistic regression model
# model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
# # fit the model on the whole dataset
# model.fit(X, y)
# # define a single row of input data
# row = [1.89149379, -0.39847585, 1.63856893, 0.01647165, 1.51892395, -3.52651223, 1.80998823, 0.58810926, -0.02542177, -0.52835426]
# # predict a multinomial probability distribution
# yhat = model.predict_proba([row])
# # summarize the predicted probabilities
# print('Predicted Probabilities: %s' % yhat[0])

In [32]:
# define the multinomial logistic regression model with a default penalty
# LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2', C=1.0)

In [33]:
# # get the dataset
# def get_dataset():
# 	X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=1, n_classes=3)
# 	return X, y

# # get a list of models to evaluate
# def get_models():
# 	models = dict()
# 	for p in [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0]:
# 		# create name for model
# 		key = '%.4f' % p
# 		# turn off penalty in some cases
# 		if p == 0.0:
# 			# no penalty in this case
# 			models[key] = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='none')
# 		else:
# 			models[key] = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2', C=p)
# 	return models

# # evaluate a give model using cross-validation
# def evaluate_model(model, X, y):
# 	# define the evaluation procedure
# 	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# 	# evaluate the model
# 	scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# 	return scores

# # define dataset
# X, y = get_dataset()
# # get the models to evaluate
# models = get_models()
# # evaluate the models and store results
# results, names = list(), list()
# for name, model in models.items():
# 	# evaluate the model and collect the scores
# 	scores = evaluate_model(model, X, y)
# 	# store the results
# 	results.append(scores)
# 	names.append(name)
# 	# summarize progress along the way
# 	print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# # plot model performance for comparison
# pyplot.boxplot(results, labels=names, showmeans=True)
# pyplot.show()