In [None]:
# Code from guided project on Coursera: Simulating Time Series Data by Parallel Computing in Python

In [1]:
import sys
from os import path

import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde

from multiprocessing import Pool

import time

In [2]:
# Task 1 - define a rate calculation function
def find_rate(data=None):
    """Add epoch-second and rate-of-change columns to ``data`` in place.

    Column 0 of ``data`` is read as timestamps and column 1 as the measured
    parameter. Two columns are written back onto ``data``:

    * ``index_col`` - the timestamp as whole seconds since the Unix epoch
      (floor division of the nanosecond value by 10**9).
    * ``rate`` - change of the parameter divided by the change of
      ``index_col`` between consecutive rows, rounded to 5 decimals.

    After the rate is computed every NaN in ``data`` is replaced by 0; this
    includes the first ``rate`` entry, which has no previous row to diff
    against.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame whose first column holds timestamps (datetime64 values or
        anything ``pd.to_datetime`` can parse) and whose second column holds
        numeric values. If omitted, an error message is printed.

    Returns
    -------
    None
        The result is delivered by mutating ``data``.

    Raises
    ------
    Exception
        Any error other than ``AttributeError`` is reported and re-raised.
        ``AttributeError`` (e.g. ``data`` has no ``.iloc``) is only reported.
    """
    if data is None:
        print("Error. Please pass time-dependent data.")
        return None
    try:
        # Normalise to datetime and force nanosecond resolution so the integer
        # view is always "nanoseconds since epoch", whatever the input dtype.
        timestamps = pd.to_datetime(data.iloc[:, 0])
        epoch_ns = timestamps.values.astype("datetime64[ns]").astype(np.int64)
        epoch_seconds = epoch_ns // 10**9  # floor division
        data['index_col'] = epoch_seconds

        values = pd.Series(data.iloc[:, 1].values, index=epoch_seconds)
        # Divide by a plain ndarray so the division is positional and is not
        # re-aligned on the (different) indexes of the two operands.
        rate = values.diff() / data['index_col'].diff().values
        data['rate'] = rate.values

        data.fillna(0, inplace=True)
        data['rate'] = data['rate'].round(5)

    except AttributeError as error:
        # str() is required: concatenating str with an exception is a TypeError.
        print("Attribute error:" + str(error))
    except Exception as error:
        print("Unexpected error:" + repr(error))
        raise

In [None]:
# Task 2 - Calculate rate of time-dependent parameters. If restarting the script, ensure Runtime is set to "TPU". Go to Runtime -> Change runtime type -> Select TPU.
FILENAMES = ["paramX1wrtTime.csv",
             "paramX2wrtTime.csv", # additional data file (add your own)
             ]

for file in FILENAMES:
    # the parameter name is the part of the file name before 'wrt'
    name = file.split('wrt')

    # print parameter name
    print(name[0])

    # get full file path
    filepath = path.join("./", file)

    # open the file
    data_file = pd.read_csv(filepath, header=0, index_col=False)

    print(data_file.dtypes)
    # Replace the whole column instead of assigning through .iloc[:, 0]:
    # since pandas 2.0 an .iloc assignment writes into the existing column
    # in place, which leaves it as 'object' dtype instead of datetime64.
    time_col = data_file.columns[0]
    data_file[time_col] = pd.to_datetime(data_file[time_col])
    print("-----")
    print(data_file.dtypes)

    # adds the 'index_col' and 'rate' columns to data_file in place
    find_rate(data_file)
    print(data_file.head(5))

    # columns[[0, 2]] picks both labels from the current column order in one
    # go, so positions 0 and 2 are dropped together. For a two-column input
    # file that is the timestamp column and 'index_col', leaving the original
    # value column and 'rate'.
    data_file.drop(data_file.columns[[0,2]], axis=1, inplace=True)

    out_path = path.join("./", name[0] + "_rate.csv")
    data_file.to_csv(out_path)


In [3]:
!ls

multiprocessing_time_series_example.ipynb
original.csv
parallel_code_empty.ipynb
paramX2wrtTime.csv
xampp-linux-x64-7.4.8-0-installer.run


In [None]:
# Task 3 - Generate rate samples for each column. If restarting the script, ensure Runtime is set to "TPU". Go to Runtime -> Change runtime type -> Select TPU.

# utilize the original calculated rates to simulate or generate new rate values. Use these rate values
# to simulate real world samples. The new rate values will follow the same distribution as the original ones.

filenames = ["paramX1_rate.csv",
             "paramX2_rate.csv", # add more files here
            ] 

# initialise variables
samples = 10000 # number of rate samples required
rate_df = pd.DataFrame()

# Using the pool as a context manager terminates the worker processes when
# the block exits, including when one of the files raises part-way through.
with Pool(processes=1) as pool: # define number of parallel processes required

    # Generate time series
    for file in filenames:

        # split filename to get parameter name
        name = file.split('_')

        # print parameter name
        print(name[0])

        # get full file path
        filepath = path.join("./", file)

        # read file (only the 'rate' column is used below, so no other
        # column is converted or touched)
        data_file = pd.read_csv(filepath, header=0, index_col=False)

        # store 'rate' column as 'data'
        data = data_file['rate']

        # uniform random numbers in [0, 1) that will be mapped through the cdf
        values = np.random.rand(samples)

        # evenly spaced grid spanning the observed rate range
        x_grid = pool.starmap(np.linspace, ((data.min(), data.max(), samples), ))

        # fit a gaussian kernel density estimate using Scott's bandwidth rule
        kde = pool.starmap(gaussian_kde, ((data, "scott"), ))

        # evaluate the estimated density at each point in x_grid
        kdepdf = kde[0].evaluate(x_grid[0])

        # get the cumulative sum of the probability densities and divide by
        # the total density, so the cdf ends at exactly 1
        cdf = np.cumsum(kdepdf)
        cdf = cdf/cdf[-1]

        # find the indices in cdf where elements of values should be inserted
        # to maintain the order of the cdf, hence maintaining the distribution
        value_bins = pool.starmap(np.searchsorted, ((cdf, values), ))

        # starmap returns one result per task; a single task was submitted,
        # so element 0 holds the index array used to pick from x_grid
        random_rate = x_grid[0][value_bins[0]]

        # store the sampled rates as a one-column frame named after the
        # parameter and append it to the collected result
        temp_df = pd.DataFrame({name[0]: random_rate})
        rate_df = rate_df.join(temp_df, how='outer')

# write 'rate_df' to file
rate_df.to_csv("simulated_rate.csv", index=False)


In [None]:
# Task 4+5 - If restarting the script, ensure Runtime is set to "TPU". Go to Runtime -> Change runtime type -> Select TPU.
def simulate_corr(temp):
    """
    Simulate original dataset, using rates and correlation. Pick a random
    column (X), simulate other column values based on rate of change of X and
    correlation between X and other columns. Repeat for samples count.

    NOTE: this is an unfinished scaffold. Only the setup below is implemented;
    the simulation itself still has to be written under "fill in here", so the
    function currently returns None.

    Parameters
    ----------
    temp
        Not referenced by the setup code below; presumably a placeholder so
        the function can be used with a parallel map - TODO confirm once the
        body is written.
    """
    # Module-level names this function relies on. Only orig_df is read by the
    # setup code below; the others are declared for the part still to be
    # written.
    global sdev, rate, orig_df, freq, samples
    
    data = orig_df
    colRange = len(data.columns) # number of columns
    start_vals = data.mean() # per-column mean
    data_corr = data.corr() # get correlation between all columns
    colms = data.columns.values.tolist() # list of columns
    temp_df = pd.DataFrame(columns=colms) # empty df initialised with columns

    # fill in here



In [None]:
# Task 6 - If restarting the script, ensure Runtime is set to "TPU". Go to Runtime -> Change runtime type -> Select TPU.
# Load the simulated rate table from the working directory.
rate = pd.read_csv(
    "./simulated_rate.csv",
    header=0,
    index_col=False,
)
print("Read rate file\n")

# Start with two empty frames; they are populated by code still to be written.
concat_df = pd.DataFrame()
backup_df = pd.DataFrame()

# Load the original dataset from the working directory.
orig_df = pd.read_csv(
    "./original.csv",
    index_col=False,
)
print("Read original df\n")

# fill in here