In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle

In [2]:
def to_string(lists):
    s = ""
    for l in lists:
        s = s +"," + ",".join(l)

    return s


def prepare(input_file, output_file):
    result = []
    with open(input_file) as file:
        lines = file.readlines()

        lst = []
        lastDate = None
        for line in lines:
            tokens = line.strip().split(",")
            date, time, op, high, low, close, vol = tokens

            if lastDate == None:
                lst.append(tokens[1:])
                lastDate = date
                continue

            if lastDate != date:
                result.append([lastDate, lst])
                lst = []
                lst.append(tokens[1:])

            if lastDate == date:
                lst.append(tokens[1:])

            lastDate = date

        result.append([lastDate, lst])

    with open(output_file, "w") as output:
        title = "DATE,TIME1,OPEN1,HIGH1,LOW1,CLOSE1,VOL1,TIME2,OPEN2,HIGH2,LOW2,CLOSE2,VOL2,TIME3,OPEN3,HIGH3,LOW3,CLOSE3,VOL3,TIME4,OPEN4,HIGH4,LOW4,CLOSE4,VOL4,TIME5,OPEN5,HIGH5,LOW5,CLOSE5,VOL5,TIME6,OPEN6,HIGH6,LOW6,CLOSE6,VOL6,TIME7,OPEN7,HIGH7,LOW7,CLOSE7,VOL7,TIME8,OPEN8,HIGH8,LOW8,CLOSE8,VOL8,TIME9,OPEN9,HIGH9,LOW9,CLOSE9,VOL9";
        output.write(title)
        output.write("\n")

        for r in result:
            date = str(r[0])
            st = to_string(r[1])

            output.write(date + str(st) + "\n")





input = "SBERP.csv"
output = "SBERP_prepared.csv"
prepare(input, output)

In [3]:
data = pd.read_csv('SBERP_prepared.csv').dropna()

In [4]:
data.head()

Unnamed: 0,DATE,TIME1,OPEN1,HIGH1,LOW1,CLOSE1,VOL1,TIME2,OPEN2,HIGH2,...,HIGH8,LOW8,CLOSE8,VOL8,TIME9,OPEN9,HIGH9,LOW9,CLOSE9,VOL9
0,20120301,110000,75.38,75.45,74.91,74.95,2334800,120000,74.95,75.21,...,75.3,75.12,75.16,754600,190000.0,75.14,75.61,75.12,75.58,2269200.0
1,20120302,110000,76.01,76.35,75.79,76.13,3527600,120000,76.14,76.3,...,76.72,76.03,76.72,4305900,190000.0,76.72,77.48,76.68,77.09,6021500.0
2,20120305,110000,77.09,78.59,77.09,78.39,5464300,120000,78.38,78.4,...,78.11,77.72,78.1,1385100,190000.0,78.1,78.11,77.53,77.59,2659900.0
3,20120306,110000,77.15,77.2,76.71,77.03,2307800,120000,77.03,77.11,...,75.83,75.03,75.07,1457600,190000.0,75.07,75.11,73.8,73.81,5285200.0
4,20120307,110000,74.0,74.2,73.23,73.86,3703100,120000,73.86,74.37,...,74.56,73.9,74.4,2827700,190000.0,74.4,74.78,74.21,74.53,5004100.0


In [5]:
data = shuffle(data)

## Задача:
### Будет ли цена закрытия больше цены открытия ?

In [6]:
data['TARGET'] = data['CLOSE9'] > data['OPEN9']
data.head()

Unnamed: 0,DATE,TIME1,OPEN1,HIGH1,LOW1,CLOSE1,VOL1,TIME2,OPEN2,HIGH2,...,LOW8,CLOSE8,VOL8,TIME9,OPEN9,HIGH9,LOW9,CLOSE9,VOL9,TARGET
1145,20160922,110000,109.44,109.79,109.22,109.41,652000,120000,109.48,109.67,...,110.89,111.02,365800,190000.0,111.09,111.55,111.06,111.47,464000.0,True
714,20150105,110000,37.0,37.86,36.9,37.5,1667300,120000,37.46,38.67,...,38.31,38.49,462400,190000.0,38.49,38.6,38.41,38.59,607900.0,True
89,20120706,110000,65.0,65.5,64.5,64.51,3152700,120000,64.51,64.62,...,63.25,63.38,2786800,190000.0,63.43,63.8,63.31,63.61,1139300.0,True
960,20151225,110000,76.0,76.36,75.33,75.7,1122500,120000,75.62,76.0,...,74.85,75.11,136600,190000.0,75.12,75.3,74.89,75.26,438300.0,True
1506,20180228,110000,229.8,230.05,228.14,228.5,720400,120000,228.52,229.5,...,223.63,225.96,693100,190000.0,225.9,227.11,225.0,227.0,656200.0,True


In [7]:
train_data = data[:1000]
test_data = data[1000:]

In [8]:
train_features = train_data.drop(["DATE", "CLOSE9", "LOW9", "HIGH9", "VOL9",
                                 "TIME1", "TIME2", "TIME3", "TIME4", "TIME5", "TIME6", "TIME7","TIME8","TIME9",
                                 "VOL1", "VOL2", "VOL3", "VOL4", "VOL5", "VOL6", "VOL7","VOL8","VOL9"
                                 ], axis=1)
train_target = train_data["TARGET"]

test_features = test_data.drop(["DATE", "CLOSE9", "LOW9", "HIGH9", "VOL9",
                                 "TIME1", "TIME2", "TIME3", "TIME4", "TIME5", "TIME6", "TIME7","TIME8","TIME9",
                                 "VOL1", "VOL2", "VOL3", "VOL4", "VOL5", "VOL6", "VOL7","VOL8","VOL9"
                               ], axis=1)
test_target = test_data["TARGET"]

In [9]:
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features)

In [11]:
acs = []

for i in range(1, 1000, 1):    
    train_features_scaled = shuffle(train_features_scaled)
    lr = LogisticRegression()
    lr.fit(train_features_scaled, train_target)

    test_features_scaled = shuffle(test_features_scaled)
    predictions = lr.predict(test_features_scaled)

    acs.append(accuracy_score(test_target, predictions))

print("Доля правильных ответов:\t{}"
      .format(np.mean(acs)))

Доля правильных ответов:	0.5096851536374534
