In [100]:
from collections import OrderedDict

import pandas as pd
import numpy as np

class OHLC:
    """Intraday OHLCV bars read from a CSV file, cleaned and indexed by timestamp.

    Construction reads the file, drops the helper columns '6' and 'Unnamed: 0'
    when they are present, parses the 'Date' column, discards every day whose
    number of rows is not the most common one, and checks that the remaining
    rows share a single bar spacing.

    Attributes:
        df: bars indexed by timezone-naive timestamps (index name is None);
            the extra column 'date1' mirrors the index.
        interval: spacing between consecutive bars of one day, in minutes.
        days: number of days kept after cleanup.
        eachDayRows: number of bars in each kept day.
    """

    # How each column is aggregated when bars are merged into wider ones.
    _OHLCV_AGG = (
        ('Open', 'first'),
        ('High', 'max'),
        ('Low', 'min'),
        ('Close', 'last'),
        ('Volume', 'sum'),
    )

    # Supported resampling frequencies (pandas offset alias, width in minutes),
    # ordered from narrowest to widest.
    # NOTE(review): '1M' is the month-end alias on older pandas; newer releases
    # spell it 'ME'. It is only parsed when a request exceeds one week.
    _FREQ_MINUTES = (
        ("1min", 1),
        ("5min", 5),
        ("10min", 10),
        ("15min", 15),
        ("30min", 30),
        ("1h", 60),
        ("2h", 60*2),
        ("4h", 60*4),
        ("8h", 60*8),
        ("1D", 60*24),
        ("1W", 60*24*7),
        ("1M", np.inf),
    )

    def __init__(self, filename):
        """Load `filename`, clean it and derive interval, days and eachDayRows.

        Args:
            filename: path (or anything `pd.read_csv` accepts) of a CSV with a
                'Date' column plus Open/High/Low/Close/Volume columns.

        Raises:
            AssertionError: if the rows cannot be reduced to days of equal
                length with one single bar spacing.
        """
        # Never input df data which is daily, interval should be in minutes, hours
        self.df = pd.read_csv(filename)

        # Remove unnecessary columns (tolerate files that do not carry them)
        self.df = self.df.drop(columns=['6', 'Unnamed: 0'], errors='ignore')

        # Convert string dates to datetimes
        self.df['Date'] = pd.DatetimeIndex(self.df['Date'])

        # interval is in minutes
        self.interval = None
        self.days = None
        # This is the number of data points in each day
        self.eachDayRows = None

        # Remove wrong data rows from df
        self.cleanDf()
        self.findInterval()

        # Assert no extra wrong rows remain now after cleanup
        assert self.eachDayRows*self.days == self.df.shape[0], \
            "row count does not equal days * rows-per-day after cleanup"

        self.df = self.df.set_index('Date')

        # Remove timezone from timestamp (wall-clock time is preserved).
        # The index name is reset to None, as it always has been.
        self.df.index = self.df.index.tz_localize(None).rename(None)

        self.df['date1'] = self.df.index

    def toInterval(self, minutes):
        """Resample the bars to the first supported frequency >= `minutes`.

        The request is rounded UP to the next supported width (1, 5, 10, 15,
        30 minutes, 1/2/4/8 hours, 1 day, 1 week, 1 month); e.g. 3 yields
        5-minute bars. Bins are left-labelled, left-closed and anchored at the
        first timestamp; bins with missing values are dropped.

        Args:
            minutes: requested bar width in minutes.

        Returns:
            DataFrame with columns Open, High, Low, Close, Volume.

        Raises:
            ValueError: if `minutes` cannot be compared to any supported width
                (for example NaN).
        """
        freq = next(
            (alias for alias, width in self._FREQ_MINUTES if width >= minutes),
            None,
        )
        if freq is None:
            raise ValueError("cannot map %r minutes to a resampling frequency" % (minutes,))

        bins = self.df.resample(freq, label='left', closed="left", origin='start')
        return bins.agg(dict(self._OHLCV_AGG)).dropna()

    def between_time(self, start, end):
        """Return the rows whose timestamp lies in the closed range [start, end].

        Args:
            start, end: timestamps, or strings such as "2015-03-02T09:17:00";
                both are passed through `pd.to_datetime`.
        """
        start = pd.to_datetime(start)
        end = pd.to_datetime(end)
        return self.df[(self.df.index >= start) & (self.df.index <= end)]

    def _dayRunIds(self):
        """Label each row with the ordinal of its run of consecutive same-date rows."""
        # Compare full calendar dates: the day-of-month alone would glue
        # together neighbouring sessions that are a whole month apart.
        dates = self.df['Date'].dt.normalize()
        return dates.ne(dates.shift()).cumsum()

    def findInterval(self):
        """Set `interval` (minutes between bars) and `days` (number of days).

        Must run while 'Date' is still a column of `df`.

        Raises:
            AssertionError: unless exactly one distinct spacing exists between
                consecutive rows of the same day.
        """
        runIds = self._dayRunIds()

        # Only gaps between two rows of the same day count; the overnight gap
        # at the start of each day is skipped.
        sameDay = runIds.eq(runIds.shift())
        gaps = self.df['Date'].diff()[sameDay].dt.total_seconds().unique()

        # Assert only one type of interval should exist
        assert len(gaps) == 1, \
            "expected exactly one intraday interval, found %d" % len(gaps)

        # Dividing the interval by 60 to convert seconds to minutes
        self.interval = float(gaps[0])/60
        self.days = int(runIds.nunique())

    def cleanDf(self):
        """Keep only the days having the most common number of rows; set `eachDayRows`.

        Days are runs of consecutive rows with the same calendar date. Rows of
        every day whose length is shared by fewer days than the most frequent
        length are removed and the index is renumbered from 0. Must run while
        'Date' is still a column of `df`.

        Raises:
            AssertionError: if more than one day length remains (a tie) or no
                rows are left.
        """
        runIds = self._dayRunIds()
        rowsPerDay = runIds.value_counts()        # run id -> rows in that day
        daysPerLength = rowsPerDay.value_counts() # day length -> number of such days

        if len(daysPerLength) > 0:
            # Drop data which is not a most occurring number of intervals in a day
            mostDays = daysPerLength.max()
            keptLengths = daysPerLength[daysPerLength == mostDays].index
            keepRow = runIds.map(rowsPerDay).isin(keptLengths)
            self.df = self.df[keepRow].reset_index(drop=True)

        # After cleanup only one type of day length should remain
        remaining = self._dayRunIds().value_counts().unique()
        assert len(remaining) == 1, \
            "expected one day length after cleanup, found %d" % len(remaining)
        self.eachDayRows = int(remaining[0])

In [101]:
from pandas.tseries.frequencies import to_offset

# Build the OHLC wrapper from the Reliance CSV. The path is relative, so it
# resolves against the notebook's working directory.
o = OHLC('../../data/reliance.csv')

In [102]:
# Requested bar width in minutes; toInterval() picks the first supported
# frequency that is at least this wide.
mins = 1
df = o.toInterval(mins)

In [104]:
# Mirror the timestamp index into a regular 'date' column (mutates df in
# place), then display the frame.
df['date'] = df.index
df

Unnamed: 0,Open,High,Low,Close,Volume,date
2015-03-02 09:15:00,430.65,430.90,429.85,430.10,30634,2015-03-02 09:15:00
2015-03-02 09:16:00,429.15,429.45,428.60,428.95,21938,2015-03-02 09:16:00
2015-03-02 09:17:00,428.95,429.90,428.95,429.45,22316,2015-03-02 09:17:00
2015-03-02 09:18:00,429.75,430.40,429.75,430.00,18280,2015-03-02 09:18:00
2015-03-02 09:19:00,430.00,430.35,429.75,429.95,12869,2015-03-02 09:19:00
...,...,...,...,...,...,...
2015-04-09 15:25:00,443.25,443.70,443.25,443.60,71893,2015-04-09 15:25:00
2015-04-09 15:26:00,443.65,444.10,443.45,443.55,100039,2015-04-09 15:26:00
2015-04-09 15:27:00,443.50,444.60,443.50,444.40,85927,2015-04-09 15:27:00
2015-04-09 15:28:00,444.70,445.20,444.40,444.60,59032,2015-04-09 15:28:00


In [92]:
# Show the bars whose timestamp falls inside the closed window
# 09:17 - 09:22 on 2015-03-02.
window_start = pd.to_datetime("2015-03-02T09:17:00")
window_end = pd.to_datetime("2015-03-02T09:22:00")
in_window = (df.index >= window_start) & (df.index <= window_end)
df[in_window]

Unnamed: 0,Open,High,Low,Close,Volume
2015-03-02 09:17:00,428.95,429.9,428.95,429.45,22316
2015-03-02 09:18:00,429.75,430.4,429.75,430.0,18280
2015-03-02 09:19:00,430.0,430.35,429.75,429.95,12869
2015-03-02 09:20:00,429.8,430.7,429.6,429.75,60236
2015-03-02 09:21:00,429.75,430.3,429.55,430.05,27708
2015-03-02 09:22:00,430.25,430.8,429.9,430.45,42128


In [99]:
import datetime
# Element-wise check of which bars are at or after the given epoch instant.
# The literal is presumably in milliseconds, hence the division by 1000.
# NOTE(review): fromtimestamp() without a tz returns naive local time, so the
# cutoff depends on the timezone of the machine running the notebook.
df.index >= datetime.datetime.fromtimestamp(1662408786115/1000.0)

array([False, False, False, ..., False, False, False])