In [2]:
import numpy as np
import pandas as pd
from sklearn import ensemble

In [7]:

# Per the original note, this CSV was produced by the "Tensor" document.
# The "Unnamed: 0" column (presumably a saved index -- confirm) is discarded.
final_frame = (
    pd.read_csv("D:/myPython/IGL/tensor_frame.csv")
    .drop(columns=["Unnamed: 0"])
)
final_frame.head()


Unnamed: 0,Street_Bolck,Date,Time,Occupancy,Price
0,01ST ST 200,2011-04-01,6,0.0,
1,01ST ST 200,2011-04-01,7,1.0,3.497409
2,01ST ST 200,2011-04-01,8,3.0,3.498704
3,01ST ST 200,2011-04-01,9,9.0,3.341926
4,01ST ST 200,2011-04-01,10,12.0,3.093837


In [8]:

# Cusum Filter

def istar_CUSUM(ser, h):
    """Symmetric CUSUM filter over the first differences of ``ser``.

    Two running sums of the successive differences are maintained: an upward
    sum floored at zero and a downward sum capped at zero. Whenever either one
    reaches ``h`` in magnitude, the current position is recorded and both sums
    restart from zero.

    Parameters
    ----------
    ser : array-like
        One-dimensional numeric sequence.
    h : float
        Threshold on the magnitude of either running sum.

    Returns
    -------
    list of int
        Positions (difference index + 1) at which the threshold was reached,
        in ascending order. Empty when ``ser`` has fewer than two elements.
    """
    events = []
    up_sum = 0
    down_sum = 0
    # Position ``pos`` refers to the later of the two samples forming the step.
    for pos, step in enumerate(np.diff(ser), start=1):
        up_sum = max(0, up_sum + step)
        down_sum = min(0, down_sum + step)
        if up_sum >= h or -down_sum >= h:
            events.append(pos)
            up_sum = down_sum = 0
    return events



In [10]:

# Meta Labeling
def _first_true(mask):
    """Return the position of the first True in ``mask``, or ``np.inf`` if none."""
    hits = np.flatnonzero(mask)
    return hits[0] if hits.size else np.inf


def label_meta(x, events, ptSl):
    """Find the first barrier touch and the realised return for each event.

    For every row of ``events`` the path ``x[t0 : t1 + 1]`` is converted to
    returns relative to its first element. The upper barrier is
    ``trgt * ptSl[0]`` and the lower barrier is ``trgt * ptSl[1]`` (applied to
    the negated return).

    Parameters
    ----------
    x : array-like
        One-dimensional numeric series, indexed positionally.
    events : pandas.DataFrame
        Must contain the columns ``"t0"`` (start position), ``"t1"`` (last
        position, clipped to ``len(x) - 1``), ``"trgt"`` (barrier width) and
        ``"side"``. ``side == 0`` checks both barriers, ``side == 1`` checks
        only the upper one, and any other value checks only the lower one.
    ptSl : sequence of two numbers
        Multipliers for the upper (``ptSl[0]``) and lower (``ptSl[1]``) barrier.

    Returns
    -------
    pandas.DataFrame
        One row per event with columns ``"T_up"`` and ``"T_lo"`` (offset within
        the path of the first touch, ``np.inf`` if never touched or not
        checked), ``"t1"`` (length of the path) and ``"ret"`` (return at the
        earliest touch, or at the end of the path when nothing was touched).
        An empty ``events`` frame yields an empty frame with these columns.

    Raises
    ------
    IndexError
        If an event's path is empty (``t0`` beyond ``t1`` or beyond ``x``).
    """
    x = np.asarray(x)
    t0 = events["t0"]
    # Clip once here; every path then ends inside ``x``.
    t1 = np.minimum(events["t1"], len(x) - 1)
    trgt = events["trgt"]
    side = events["side"]
    u, l = ptSl[0], ptSl[1]

    rows = []
    for i in range(len(t0)):
        i_x = x[t0.iloc[i]:(t1.iloc[i] + 1)]
        i_trgt = trgt.iloc[i]
        # Positional access, consistent with the other columns, so a
        # non-default index on ``events`` cannot select the wrong row.
        i_side = side.iloc[i]

        # Returns are always measured from the entry price i_x[0]; the
        # one-sided branches previously divided by i_x[1].
        path_ret = i_x / i_x[0] - 1

        check_up = i_side == 0 or i_side == 1
        check_lo = i_side != 1
        T_up = _first_true(path_ret >= i_trgt * u) if check_up else np.inf
        T_lo = _first_true(-path_ret >= i_trgt * l) if check_lo else np.inf

        exit_pos = int(min(T_up, T_lo, len(i_x) - 1))
        rows.append([T_up, T_lo, len(i_x), path_ret[exit_pos]])

    # Build the frame once instead of appending row by row
    # (DataFrame.append no longer exists in pandas >= 2.0).
    return pd.DataFrame(rows, columns=["T_up", "T_lo", "t1", "ret"])



In [11]:
# Filter + Feature + Label + Model

def ML_Block(df, h=2, horizon=200, trgt=0.8, ptSl=(1, 1), n_estimators=50,
             random_state=None):
    """Run the filter -> features -> labels -> model pipeline on one group.

    Steps:
      1. Take ``df["Price"]``, drop missing values, keep prices in [0.1, 10]
         and clip the remainder to [0.25, 6].
      2. Detect events with ``istar_CUSUM`` (threshold ``h``).
      3. Build one feature row per event from the prices between the previous
         event and this one (last, mean, std, median, max, min).
      4. Label each event with ``label_meta``: 1 when the realised return
         reaches the upper barrier ``trgt * ptSl[0]``, otherwise 0.
      5. Fit a random forest on the features and score it on the SAME rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"Price"`` column.
    h : float, default 2
        CUSUM threshold.
    horizon : int, default 200
        Each event's path ends ``horizon`` positions after the event
        (``label_meta`` clips it to the end of the series).
    trgt : float, default 0.8
        Barrier width used for every event.
    ptSl : sequence of two numbers, default (1, 1)
        Upper and lower barrier multipliers.
    n_estimators : int, default 50
        Number of trees in the random forest.
    random_state : int or None, default None
        Seed forwarded to the random forest; ``None`` keeps runs unseeded.

    Returns
    -------
    float
        In-sample accuracy (the model is evaluated on the rows it was fitted
        on, so this is not an out-of-sample estimate), or ``np.nan`` when no
        usable event is found.
    """
    price = df["Price"].dropna()
    # Filter, then clip in one step; avoids masked assignment on a filtered
    # Series while producing the same values.
    price = price[(price >= 0.1) & (price <= 10)].clip(lower=0.25, upper=6)
    price_values = price.values

    i_star = istar_CUSUM(price_values, h=h)

    # An event on the very last observation has no forward path to label.
    if i_star and i_star[-1] == len(price_values) - 1:
        i_star = i_star[:-1]

    # Checked AFTER trimming: a lone event on the last observation would
    # otherwise reach the model with zero rows and raise.
    n_Event = len(i_star)
    if n_Event == 0:
        return np.nan

    # Feature matrix: one row per event, built from the prices after the
    # previous boundary up to and including the event position.
    # NOTE(review): the first window starts at position 1, so the very first
    # price is never used -- confirm this is intended.
    bounds = [0] + list(i_star)
    records = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        window = price_values[(start + 1):(stop + 1)]
        records.append([
            window[-1],
            np.mean(window),
            np.std(window),
            np.median(window),
            np.max(window),
            np.min(window),
        ])
    Feature_Matrix = pd.DataFrame(
        records, columns=["last", "avg", "sd", "median", "max", "min"]
    )

    # Labeling
    event_pos = np.asarray(i_star)
    events = pd.DataFrame({"t0": event_pos + 1,
                           "t1": event_pos + horizon,
                           "trgt": [trgt] * n_Event,
                           "side": [0] * n_Event})

    label_meta_out = label_meta(price_values, events, ptSl)
    # Upper-barrier test, so the upper multiplier ptSl[0] is the right one
    # (identical to the previous ptSl[1] under the default (1, 1)).
    upper = (events["trgt"] * ptSl[0]).values
    Y_train = (label_meta_out["ret"].values >= upper).astype(int)

    # Model
    rf = ensemble.RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    ).fit(Feature_Matrix.values, Y_train)
    pred_class = rf.predict(Feature_Matrix.values)
    accuracy = np.mean(pred_class == Y_train)
    return accuracy


In [13]:

# Run ML_Block once per street block; each group's sub-frame is passed in and
# the returned value becomes that block's entry in ``result``.
result = (
    final_frame
    .groupby("Street_Bolck")
    .apply(ML_Block)
)

result.head(20)


Street_Bolck
01ST ST 200     0.993506
01ST ST 300     1.000000
02ND ST 200     0.996144
02ND ST 300     0.999170
02ND ST 400     0.996830
02ND ST 500     0.997338
02ND ST 600     0.995246
02ND ST 700     0.997214
03RD ST 300     1.000000
03RD ST 400     0.972826
03RD ST 500     0.997253
03RD ST 600     0.986885
03RD ST 700     0.994463
03RD ST 800     1.000000
04TH ST 1200    0.969231
04TH ST 1300    1.000000
04TH ST 1400    0.986111
04TH ST 1500    0.979592
04TH ST 1600    1.000000
04TH ST 1700    1.000000
dtype: float64