In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [54]:
# Load the four input tables from /content. Further down in this file,
# candles/news are passed to fit() and candles_2/news_2 to predict_on_date().
# NOTE(review): candles_2/news_2 presumably cover the evaluation period - confirm.
candles = pd.read_csv('/content/candles.csv')
candles_2 = pd.read_csv('/content/candles_2.csv')
news = pd.read_csv('/content/news.csv')
news_2 = pd.read_csv('/content/news_2.csv')

In [65]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import pickle
import csv
import warnings
# Suppress every warning category for the rest of the session (this also hides
# pandas/scikit-learn deprecation warnings).
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG. None of the code visible in this file draws random
# numbers, so this only matters for code outside this view.
np.random.seed(42)

# ====== Простая новостная фича: дневной news_count ======
def add_news_count(candles: pd.DataFrame, news: pd.DataFrame | None) -> pd.DataFrame:
    c = candles.copy()
    c['begin'] = pd.to_datetime(c['begin'])
    c['date'] = c['begin'].dt.normalize()
    if news is None or len(news) == 0:
        c['news_count'] = 0.0
        return c.drop(columns=['date'])
    n = news.copy()
    n['publish_date'] = pd.to_datetime(n['publish_date'])
    n['date'] = n['publish_date'].dt.normalize()
    daily = n.groupby('date').size().reset_index(name='news_count')
    out = c.merge(daily, on='date', how='left')
    out['news_count'] = out['news_count'].fillna(0.0)
    return out.drop(columns=['date'])

# ====== Фичи и таргеты ======
# Model input columns, in the order they are fed to the scaler and regressors:
# the three price features built by create_features plus the news_count column
# added by add_news_count.
FEATS = ['momentum_5','volatility_5','price_range','news_count']

def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the per-ticker price features used by the model.

    Args:
        df: Candle frame with ``ticker``, ``begin``, ``close``, ``high`` and
            ``low`` columns.

    Returns:
        A copy of ``df`` sorted by ``(ticker, begin)`` with a fresh integer
        index, ``begin`` converted to datetime, and three added columns:

        * ``momentum_5``   - 5-row percentage change of ``close`` per ticker.
        * ``volatility_5`` - rolling standard deviation (window of up to 5
          rows) of the 1-row percentage change of ``close`` per ticker.
        * ``price_range``  - ``(high - low) / close``.

        Missing values in all three columns are replaced with 0.0.
    """
    frame = df.copy()
    frame['begin'] = pd.to_datetime(frame['begin'])
    frame = frame.sort_values(['ticker', 'begin']).reset_index(drop=True)

    close_by_ticker = frame.groupby('ticker')['close']
    frame['momentum_5'] = close_by_ticker.pct_change(5).fillna(0.0)

    one_step_return = close_by_ticker.pct_change()
    rolling_std = (
        one_step_return.groupby(frame['ticker'])
        .rolling(5, min_periods=1)
        .std()
    )
    # The grouped rolling result is indexed by (ticker, row); dropping the
    # ticker level restores the row index so the assignment aligns with frame.
    frame['volatility_5'] = rolling_std.droplevel(0).fillna(0.0)

    high_low_spread = frame['high'] - frame['low']
    frame['price_range'] = (high_low_spread / frame['close']).fillna(0.0)
    return frame

def create_targets(df: pd.DataFrame, horizons: tuple[int, ...] = (1, 20)) -> pd.DataFrame:
    """Add forward-return targets ``target_return_{h}d`` for each horizon.

    For every row the target is ``close[t + h] / close[t] - 1``, where
    ``t + h`` is the row ``h`` positions later *within the same ticker*.

    The look-ahead is done with a per-ticker shift, so rows of different
    tickers may be interleaved in ``df`` without one ticker's target being
    read from another ticker's row.

    Args:
        df: Frame with ``ticker`` and ``close`` columns. Rows of each ticker
            must already be in chronological order; the frame is not
            re-sorted here.
        horizons: Look-ahead distances, in rows. Defaults to ``(1, 20)``,
            the two horizons used by ``fit``.

    Returns:
        A copy of ``df`` with one added column per horizon. The last ``h``
        rows of each ticker have no future close and get NaN. A missing
        ``close`` is not forward-filled: it yields a NaN target.
    """
    d = df.copy()
    close_by_ticker = d.groupby('ticker')['close']
    for h in horizons:
        # Shift inside each ticker group so the future close is always taken
        # from the same ticker, whatever the physical row order is.
        future_close = close_by_ticker.shift(-h)
        d[f'target_return_{h}d'] = future_close / d['close'] - 1
    return d

# ====== Обучение ======
def fit(candles: pd.DataFrame, news: pd.DataFrame | None, model_path: str = 'model.pkl') -> None:
    """Train the 1-step and 20-step linear return models and pickle them.

    The models are fitted on the entire history passed in (no hold-out
    split). Only rows that have both the 1-step and the 20-step target are
    used for training.

    Args:
        candles: Candle history; passed through ``create_features``.
        news: News table for ``add_news_count``, or ``None``.
        model_path: Destination of the pickle file. The pickled dict has the
            keys ``features``, ``scaler``, ``models`` (``reg_1`` / ``reg_20``)
            and ``train_tickers``.

    Raises:
        ValueError: If no row has valid targets for both horizons.
    """
    frame = create_targets(add_news_count(create_features(candles), news))

    target_cols = ['target_return_1d', 'target_return_20d']
    train = frame.dropna(subset=target_cols).reset_index(drop=True)
    if train.empty:
        raise ValueError("No training rows with valid targets for both horizons 1 and 20.")

    # One scaler is shared by both regressors.
    scaler = StandardScaler()
    scaled = scaler.fit_transform(train[FEATS].to_numpy())

    models = {
        f'reg_{h}': LinearRegression().fit(scaled, train[f'target_return_{h}d'].to_numpy())
        for h in (1, 20)
    }

    # The sorted ticker universe of the training data is stored alongside the
    # models so prediction can emit a row for every ticker seen here.
    train_tickers = sorted(set(candles['ticker'].astype(str)))

    payload = {
        'features': FEATS,
        'scaler': scaler,
        'models': models,
        'train_tickers': train_tickers,
    }
    with open(model_path, 'wb') as fh:
        pickle.dump(payload, fh)

# ====== Запись CSV ровно в формате примера (заголовок + 19 строк, значения без пробелов) ======
def write_submission_csv(submission: pd.DataFrame, path: str) -> None:
    """Write ``submission`` as a strictly formatted CSV file.

    The file has the header ``ticker,p1,...,p20`` followed by one line per
    row of ``submission``. Values ``p1``..``p20`` are written with exactly
    six decimals. Fields are comma-separated, lines end with CRLF, and the
    file is UTF-8 without a BOM. Columns other than ``ticker`` and
    ``p1``..``p20`` are ignored.

    Args:
        submission: Frame containing ``ticker`` and ``p1``..``p20`` columns.
        path: Output file path; an existing file is overwritten.
    """
    columns = ['ticker', *(f'p{i}' for i in range(1, 21))]
    frame = submission[columns]

    lines = [
        [str(ticker), *(f'{float(value):.6f}' for value in values)]
        for ticker, *values in frame.itertuples(index=False, name=None)
    ]

    # newline='' stops Python from translating the explicit CRLF terminator.
    with open(path, 'w', newline='', encoding='utf-8') as fh:
        writer = csv.writer(fh, delimiter=',', lineterminator='\r\n', quoting=csv.QUOTE_MINIMAL)
        writer.writerows([columns, *lines])

# ====== Предсказание: одна строка на тикер на выбранную дату (или на последнюю доступную) ======
def predict_on_date(candles: pd.DataFrame,
                    news: pd.DataFrame | None,
                    model_path: str = 'model.pkl',
                    output_path: str = 'submission.csv',
                    cutoff_date: str | None = None) -> pd.DataFrame:
    """Predict a 20-step return path per ticker and write it as a submission CSV.

    For every ticker the feature row with the latest ``begin`` (optionally
    restricted to ``begin <= cutoff_date``) is scored by the two pickled
    regressors. ``p1``..``p20`` linearly interpolate from the 1-step
    prediction (``p1``) to the 20-step prediction (``p20``) and are clipped
    to ``[-0.5, 0.5]``.

    Tickers stored in the model's ``train_tickers`` list that have no usable
    row are still emitted, with all-zero predictions. Rows whose features
    contain NaN or infinity also get all-zero predictions instead of
    failing inside the scaler. Tickers present in ``candles`` but absent
    from the training list are included as well.

    Args:
        candles: Candle data; passed through ``create_features``.
        news: News table for ``add_news_count``, or ``None``.
        model_path: Pickle file with the keys ``features``, ``scaler``,
            ``models`` and ``train_tickers``.
        output_path: Destination CSV, written via ``write_submission_csv``.
        cutoff_date: Optional upper bound on ``begin``. It is parsed with
            ``pd.to_datetime``, so a bare date means midnight of that day
            and later intraday timestamps of the same day are excluded.

    Returns:
        The submission frame with columns ``ticker``, ``p1``..``p20``:
        tickers found in ``candles`` first (in group order), followed by
        the training tickers that had no row.
    """
    # NOTE(security): pickle.load can execute arbitrary code. Only load model
    # files that were produced by a trusted training run.
    with open(model_path, 'rb') as f:
        payload = pickle.load(f)
    feats = list(payload['features'])
    scaler = payload['scaler']
    models = payload['models']
    train_tickers = payload['train_tickers']

    frame = add_news_count(create_features(candles), news)
    if cutoff_date is not None:
        frame = frame[frame['begin'] <= pd.to_datetime(cutoff_date)]

    # Latest row per ticker. With no rows at all (e.g. everything lies after
    # the cutoff) there is nothing to group, and every training ticker falls
    # through to the zero-filled padding below.
    if frame.empty:
        latest = frame
    else:
        latest = frame.loc[frame.groupby('ticker')['begin'].idxmax()]

    tickers = latest['ticker'].astype(str).tolist()
    feature_matrix = latest[feats].to_numpy(dtype=float)

    pr1 = np.zeros(len(tickers))
    pr20 = np.zeros(len(tickers))
    # Only rows with fully finite features are scored; the rest keep 0.0.
    usable = np.isfinite(feature_matrix).all(axis=1)
    if usable.any():
        scaled = scaler.transform(feature_matrix[usable])
        pr1[usable] = np.ravel(models['reg_1'].predict(scaled))
        pr20[usable] = np.ravel(models['reg_20'].predict(scaled))

    # Guarantee a row for every ticker known from training.
    seen = set(tickers)
    absent = [t for t in train_tickers if t not in seen]
    tickers.extend(absent)
    padding = np.zeros(len(absent))
    pr1 = np.concatenate([pr1, padding])
    pr20 = np.concatenate([pr20, padding])

    # Blend weights: 0 -> pure 1-step prediction, 1 -> pure 20-step prediction.
    alphas = np.linspace(0, 1, 20)
    bands = np.clip(np.outer(pr1, 1 - alphas) + np.outer(pr20, alphas), -0.5, 0.5)

    submission = pd.DataFrame(bands, columns=[f'p{i}' for i in range(1, 21)])
    submission.insert(0, 'ticker', tickers)

    write_submission_csv(submission, output_path)
    return submission

# ====== Пример использования ======
# 1) Train on the history (candles.csv + news.csv); fit() writes model.pkl.
candles = pd.read_csv('/content/candles.csv')
news = pd.read_csv('/content/news.csv')
fit(candles, news)

# 2A) Predict as of a specific date t (e.g. 2024-09-08): one row per ticker,
#     using each ticker's last candle with begin <= cutoff_date.
candles_2 = pd.read_csv('/content/candles_2.csv')
news_2    = pd.read_csv('/content/news_2.csv')
submission = predict_on_date(candles_2, news_2, model_path='model.pkl', output_path='submission.csv', cutoff_date='2024-09-08')

# 2B) Or predict from each ticker's last available date in candles_2 (cutoff_date=None).
#     NOTE: this call uses the same output_path as 2A, so it overwrites the
#     submission.csv written above and rebinds `submission`.
submission = predict_on_date(candles_2, news_2, model_path='model.pkl', output_path='submission.csv', cutoff_date=None)
