In [19]:
import pandas as pd
import numpy as np

In [20]:
df = pd.read_csv("/kaggle/input/stockdata/data.csv").head(50000)
df


Unnamed: 0,symbol,date,open,high,low,close,volume
0,AAL,2014-01-02,25.0700,25.8200,25.0600,25.3600,8998943
1,AAPL,2014-01-02,79.3828,79.5756,78.8601,79.0185,58791957
2,AAP,2014-01-02,110.3600,111.8800,109.2900,109.7400,542711
3,ABBV,2014-01-02,52.1200,52.3300,51.5200,51.9800,4569061
4,ABC,2014-01-02,70.1100,70.2300,69.4800,69.8900,1148391
...,...,...,...,...,...,...,...
49995,FBHS,2014-06-02,39.9700,40.3300,39.6500,40.0000,1096396
49996,FB,2014-06-02,63.2300,63.5900,62.0500,63.0800,35995537
49997,FCX,2014-06-02,34.3200,34.3400,33.9800,34.1200,6484761
49998,FDX,2014-06-02,144.3100,144.9897,143.7200,144.2500,1285542


In [21]:
print(df.dtypes)

symbol     object
date       object
open      float64
high      float64
low       float64
close     float64
volume      int64
dtype: object


In [22]:
df.isnull().sum()

symbol    0
date      0
open      0
high      0
low       0
close     0
volume    0
dtype: int64

In [23]:
df.isnull().sum()

symbol    0
date      0
open      0
high      0
low       0
close     0
volume    0
dtype: int64

In [24]:
def df_to_windowed_df(dataframe, first_date_str, last_date_str, n=3):
    

    first_date = pd.to_datetime(first_date_str)
    last_date = pd.to_datetime(last_date_str)
    
    windowed_data = []

    for company, df_company in dataframe.groupby("symbol"):
        df_company = df_company.copy()
        
        if 'date' not in df_company.columns:
            df_company = df_company.reset_index()
        df_company['date'] = pd.to_datetime(df_company['date'])
        df_company.set_index("date", inplace=True)

        # Filter dates
        df_company = df_company.loc[first_date:last_date]

        # Skip if there's not enough data
        if len(df_company) < n + 1:
            continue  

        df_company["log_return"] = np.log(df_company["close"] + 1e-8).diff()  # Add epsilon        
        feature_columns = [
            "log_return",    
            "volume"         
        ]
        
        df_company = df_company[feature_columns].dropna()

        for i in range(len(df_company) - n):
            past_values = df_company.iloc[i:i+n].values.flatten()
            future_close = df_company.iloc[i + n].name  # Date reference
            windowed_data.append([company] + list(past_values) + [future_close])

    column_names = ["company"] + [f"{col}_t-{i}" 
                   for i in range(n, 0, -1) 
                   for col in feature_columns] + ["target_date"]

    return pd.DataFrame(windowed_data, columns=column_names)

In [25]:
windowed_df = df_to_windowed_df(df, "2014-01-02", "2017-12-29", n=3)



In [26]:
print(windowed_df)

      company  log_return_t-3  volume_t-3  log_return_t-2  volume_t-2  \
0           A        0.012552   1866651.0       -0.004931   1777472.0   
1           A       -0.004931   1777472.0        0.014200   1463208.0   
2           A        0.014200   1463208.0        0.016230   2659468.0   
3           A        0.016230   2659468.0        0.000342   1757647.0   
4           A        0.000342   1757647.0        0.008863   1623330.0   
...       ...             ...         ...             ...         ...   
48055     ZTS       -0.005927   2538724.0        0.004613   2421857.0   
48056     ZTS        0.004613   2421857.0       -0.002963   4522725.0   
48057     ZTS       -0.002963   4522725.0        0.006572   4085453.0   
48058     ZTS        0.006572   4085453.0        0.005553   2395740.0   
48059     ZTS        0.005553   2395740.0       -0.007192   5791017.0   

       log_return_t-1  volume_t-1 target_date  
0            0.014200   1463208.0  2014-01-08  
1            0.016230   265

In [27]:
windowed_df["target_date"] = pd.to_datetime(windowed_df["target_date"])
df["date"] = pd.to_datetime(df["date"])  
windowed_df = windowed_df.rename(columns={"company": "symbol"})

In [28]:
final_df = windowed_df.merge(
    df[["symbol", "date", "close"]],
    left_on=["symbol", "target_date"],  
    right_on=["symbol", "date"],
    how="left"
).drop(columns=["date"]).rename(columns={"close": "target_close"})

In [29]:
print(final_df)

      symbol  log_return_t-3  volume_t-3  log_return_t-2  volume_t-2  \
0          A        0.012552   1866651.0       -0.004931   1777472.0   
1          A       -0.004931   1777472.0        0.014200   1463208.0   
2          A        0.014200   1463208.0        0.016230   2659468.0   
3          A        0.016230   2659468.0        0.000342   1757647.0   
4          A        0.000342   1757647.0        0.008863   1623330.0   
...      ...             ...         ...             ...         ...   
48055    ZTS       -0.005927   2538724.0        0.004613   2421857.0   
48056    ZTS        0.004613   2421857.0       -0.002963   4522725.0   
48057    ZTS       -0.002963   4522725.0        0.006572   4085453.0   
48058    ZTS        0.006572   4085453.0        0.005553   2395740.0   
48059    ZTS        0.005553   2395740.0       -0.007192   5791017.0   

       log_return_t-1  volume_t-1 target_date  target_close  
0            0.014200   1463208.0  2014-01-08         58.39  
1          

In [30]:
X = final_df.drop(columns=['symbol', 'target_date', 'target_close'])
y = final_df['target_close']

train_size = int(0.8 * len(X))
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

In [31]:
import torch
from sklearn.preprocessing import StandardScaler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_3d = X_train_scaled.reshape(-1, 3, 2)  # 3 time steps, 2 features
X_test_3d = X_test_scaled.reshape(-1, 3, 2)

X_train_tensor = torch.tensor(X_train_3d, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_3d, dtype=torch.float32).to(device)

scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))

y_train_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)  
y_test_tensor = torch.tensor(y_test_scaled, dtype=torch.float32).to(device)  

In [38]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler

class CNNLSTM(nn.Module):
    def __init__(self):
        super(CNNLSTM, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(2, 8, kernel_size=2, padding=1),
            nn.BatchNorm1d(8),
            nn.ReLU(),
            nn.Dropout(0.1)
        )
        self.lstm = nn.LSTM(8, 16, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(16, 8),
            nn.LayerNorm(8),
            nn.ReLU(),
            nn.Linear(8, 1)
        )
        
        for layer in self.modules():
            if isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
                nn.init.constant_(layer.bias, 0.01)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # [batch, features, time_steps]
        x = self.cnn(x)
        x = x.permute(0, 2, 1)  # [batch, time_steps, features]
        x, _ = self.lstm(x)
        return self.fc(x[:, -1, :]).squeeze(-1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNLSTM().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)
criterion = nn.L1Loss()

batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)  
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

for epoch in range(30):
    
    model.train()
    total_loss = 0
    
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    model.eval()
    with torch.no_grad():
        test_preds = model(X_test_tensor)
        test_loss = criterion(test_preds, y_test_tensor)
    
    print(f"Epoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f} | Test Loss: {test_loss.item():.4f}")

  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


Epoch 1 | Train Loss: 0.5915 | Test Loss: 0.3465
Epoch 2 | Train Loss: 0.4455 | Test Loss: 0.3325
Epoch 3 | Train Loss: 0.4356 | Test Loss: 0.3292
Epoch 4 | Train Loss: 0.4322 | Test Loss: 0.3276
Epoch 5 | Train Loss: 0.4305 | Test Loss: 0.3265
Epoch 6 | Train Loss: 0.4295 | Test Loss: 0.3259
Epoch 7 | Train Loss: 0.4284 | Test Loss: 0.3253
Epoch 8 | Train Loss: 0.4277 | Test Loss: 0.3249
Epoch 9 | Train Loss: 0.4275 | Test Loss: 0.3248
Epoch 10 | Train Loss: 0.4268 | Test Loss: 0.3240
Epoch 11 | Train Loss: 0.4265 | Test Loss: 0.3240
Epoch 12 | Train Loss: 0.4259 | Test Loss: 0.3237
Epoch 13 | Train Loss: 0.4259 | Test Loss: 0.3234
Epoch 14 | Train Loss: 0.4257 | Test Loss: 0.3236
Epoch 15 | Train Loss: 0.4258 | Test Loss: 0.3233
Epoch 16 | Train Loss: 0.4251 | Test Loss: 0.3231
Epoch 17 | Train Loss: 0.4252 | Test Loss: 0.3232
Epoch 18 | Train Loss: 0.4250 | Test Loss: 0.3230
Epoch 19 | Train Loss: 0.4248 | Test Loss: 0.3230
Epoch 20 | Train Loss: 0.4246 | Test Loss: 0.3230
Epoch 21 