# create preprocessed data for training MTL model


## import libraries

In [1]:
import requests as req
import pandas as pd
import numpy as np

from similarity import Similarity
from normalization import Normalization

## define helper functions

In [2]:
def calculate_density(df, top_k):
    """Return the positions of the ``top_k`` columns with the most non-zero entries.

    Args:
        df: DataFrame whose columns are ranked.
        top_k: Maximum number of column positions to return.

    Returns:
        Array of integer column positions, ordered from the highest to the
        lowest fraction of entries that compare unequal to zero.
    """
    is_nonzero = df.values != 0  # boolean mask, rows x columns
    n_rows = is_nonzero.shape[0]
    nonzero_fraction = is_nonzero.sum(axis=0) / n_rows  # per-column density
    ranked_columns = nonzero_fraction.argsort()[::-1]  # densest column first
    return ranked_columns[:top_k]

## load data
1. Read files
2. Keep only the columns that appear in both time series

In [3]:
# Input CSV locations; each file is expected to carry a "timestamp" column
onehr_file = "../data/congestionlength_for_hours.csv"
fivemin_file = "../data/5MinFukushimaTrafficDataFrame.csv"

# Load the 1-hour data, parse "timestamp" to datetime and use it as the index
onehr_df = (
    pd.read_csv(onehr_file)
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
    .set_index("timestamp")
)

# Same treatment for the 5-minute data
fivemin_df = (
    pd.read_csv(fivemin_file)
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
    .set_index("timestamp")
)

# Restrict both frames to the columns they share, in the same column order
common_columns = fivemin_df.columns.intersection(onehr_df.columns)
onehr_df, fivemin_df = onehr_df[common_columns], fivemin_df[common_columns]

## Create preprocessed data (1hr)
1. Calculate the density of the time series of all columns
2. Calculate the similarity of the time series of all columns
3. Create preprocessed data

In [4]:
top_k = 10 # Number of columns to keep: the densest column plus its nearest neighbors
density_idx_1hr = calculate_density(onehr_df, top_k) # Positions of the top k densest columns
# NOTE(review): assumed to return a square column-by-column array — confirm in similarity.py
cos_sim = Similarity.cosine_similarity(onehr_df)
# For each of the densest columns, all column positions ranked from most to least similar.
# The trailing [:top_k] is a no-op (density_idx_1hr has at most top_k entries); it is kept
# so that nearest_1hr holds exactly the same value as before.
nearest_1hr = cos_sim.argsort(axis=1)[density_idx_1hr, ::-1][:top_k]

anchor_col_1hr = density_idx_1hr[0] # The densest column anchors the selection
ranked_1hr = nearest_1hr[0] # Similarity ranking of every column against the anchor
# A column's cosine similarity with itself is maximal, so the anchor can appear in its own
# ranking. Drop it before taking the neighbors; otherwise the anchor would be selected
# twice and only top_k - 1 distinct columns would be written.
neighbor_cols_1hr = ranked_1hr[ranked_1hr != anchor_col_1hr][:top_k - 1]
selected_cols_1hr = np.insert(neighbor_cols_1hr, 0, anchor_col_1hr) # Anchor first, then neighbors

new_df = pd.DataFrame(onehr_df.iloc[:, selected_cols_1hr]) # Fresh frame with the selected columns
new_df = Normalization.min_max_scaling(new_df) # Normalize data
new_df.to_csv("../data/preprocessed1hrData.csv") # Save to csv

## Create preprocessed data (5min)
1. Calculate the density of the time series of all columns
2. Calculate the similarity of the time series of all columns
3. Create preprocessed data

In [5]:
top_k = 10 # Number of columns to keep: the densest column plus its nearest neighbors
density_idx_5min = calculate_density(fivemin_df, top_k) # Positions of the top k densest columns
# NOTE(review): assumed to return a square column-by-column array — confirm in similarity.py
cos_sim = Similarity.cosine_similarity(fivemin_df)
# For each of the densest columns, all column positions ranked from most to least similar.
# The trailing [:top_k] is a no-op (density_idx_5min has at most top_k entries); it is kept
# so that nearest_5min holds exactly the same value as before.
nearest_5min = cos_sim.argsort(axis=1)[density_idx_5min, ::-1][:top_k]

anchor_col_5min = density_idx_5min[0] # The densest column anchors the selection
ranked_5min = nearest_5min[0] # Similarity ranking of every column against the anchor
# A column's cosine similarity with itself is maximal, so the anchor can appear in its own
# ranking. Drop it before taking the neighbors; otherwise the anchor would be selected
# twice and only top_k - 1 distinct columns would be written.
neighbor_cols_5min = ranked_5min[ranked_5min != anchor_col_5min][:top_k - 1]
selected_cols_5min = np.insert(neighbor_cols_5min, 0, anchor_col_5min) # Anchor first, then neighbors

new_df = pd.DataFrame(fivemin_df.iloc[:, selected_cols_5min]) # Fresh frame with the selected columns
new_df = Normalization.min_max_scaling(new_df) # Normalize data
new_df.to_csv("../data/preprocessed5minData.csv") # Save to csv