In [31]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [102]:
def load_preprocess_data(file_path):
    data = pd.read_csv(file_path, index_col=0)
    data.columns = range(1, len(data.columns) + 1)
    expression_sums = data.iloc[:,1:].sum(axis=1)

    filtered_data = data[expression_sums<3000]
    filtered_data = data[expression_sums>0]
    return filtered_data


class GeneExpressionDataset(torch.utils.data.Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return len(self.features)
        
    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]


class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(3, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)  

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        
        return x

class CNN1D(nn.Module):
    def __init__(self):
        super(CNN1D, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=2, stride=1)
        self.relu1 = nn.ReLU()
        
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=2, stride=1)
        self.relu2 = nn.ReLU()

        self.fc = nn.Linear(32 * 2, 1) 
        
    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension: (batch_size, 1, sequence_length)
        x = self.relu1(self.conv1(x))
        x = self.relu2(self.conv2(x))
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.fc(x)
        return x



def sliding_window(data, window_size):
    features = []
    targets = []
    for i in range(data.shape[1]-window_size):
        features.append(data[:,i:i+window_size])
        targets.append(data[:,i+window_size])
    return np.array(features), np.array(targets)


# Data preparation
def prepare_data(data, window_size, train_len):
    all_data = data.values[:,1:train_len+1]
    X, y = sliding_window(all_data, window_size)
    X = X.astype(float).reshape(-1, window_size)
    y = y.astype(float).reshape(-1, 1)
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    return X_train, X_val, X_test, y_train, y_val, y_test


def create_dataloaders(X_train, X_val, X_test, y_train, y_val, y_test):
    X_train_tensor, y_train_tensor = torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32)
    X_val_tensor, y_val_tensor = torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.float32)
    
    train_dataset = GeneExpressionDataset(X_train_tensor, y_train_tensor)
    val_dataset = GeneExpressionDataset(X_val_tensor, y_val_tensor)
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    if X_test is not None and y_test is not None:
        X_test_tensor, y_test_tensor = torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32)
        test_dataset = GeneExpressionDataset(X_test_tensor, y_test_tensor)
        test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
        return train_loader, val_loader, test_loader
    else:
        return train_loader, val_loader



def train_model(model, train_loader, val_loader, num_epochs, criterion, optimizer, device):
    for epoch in range(num_epochs):
        model.train()
        for batch_features, batch_targets in train_loader:
            batch_features, batch_targets = batch_features.to(device), batch_targets.to(device)
            outputs = model(batch_features)
            loss = criterion(outputs, batch_targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_features, batch_targets in val_loader:
                batch_features, batch_targets = batch_features.to(device), batch_targets.to(device)
                outputs = model(batch_features)
                loss = criterion(outputs, batch_targets)
                val_loss += loss.item()
        val_loss /= len(val_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss.item():.4f}, Validation Loss: {val_loss:.4f}')


def test_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch_features, batch_targets in test_loader:
            batch_features, batch_targets = batch_features.to(device), batch_targets.to(device)
            outputs = model(batch_features)
            loss = criterion(outputs, batch_targets)
            total_loss += loss.item()
    average_test_loss = total_loss / len(test_loader)
    print(f"Average Test Loss: {average_test_loss:.4f}")

def get_prediction(input_data, model, device):
    input_tensor = torch.tensor(input_data, dtype=torch.float32).to(device)
    with torch.no_grad():
        prediction = model(input_tensor)
    return prediction.cpu().numpy().reshape(-1, 1)

def generate_prediction(predict_nums):
    col_numbers = list(range(1, train_len + 1))
    all_data = data.values[:,:train_len]
    input_data = all_data[:,train_len-window_size:train_len]
    input_data = input_data.astype(np.float32)
    predictions = []

    for _ in range(predict_nums):
        pred = get_prediction(input_data, model, device)
        predictions.append(pred)
        input_data = np.concatenate((input_data[:, 1:], pred), axis=1)

    all_predictions = np.concatenate(predictions, axis=1)

    extended_data = np.concatenate((all_data, all_predictions), axis=1)

    last_col_number = train_len
    predicted_cols = ["predicted_" + str(last_col_number + i) for i in range(1, predict_nums + 1)]
    columns = [str(num) for num in col_numbers] + predicted_cols

    extended_df = pd.DataFrame(extended_data, index=data.index, columns=columns)
    return extended_df

In [103]:
def pipeline(filename, pred_len)
    data = load_preprocess_data(filename)
    train_len = 5
    window_Size = 3
    X_train, X_val, X_test, y_train, y_val, y_test = prepare_data(data, window_size, train_len)
    train_loader, val_loader, test_loader = create_dataloaders(X_train, X_val, X_test, y_train, y_val, y_test)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = CNN1D().to(device)
    criterion = nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    train_model(model, train_loader, val_loader, num_epochs=40, criterion=criterion, optimizer=optimizer, device=device)
    test_model(model, test_loader, criterion=criterion, device=device)
    results = generate_prediction(pred_len)
    results.to_csv('CNN_FB_PRED.csv')
    
pipeline('fbfiltered.csv', 2)

Epoch [1/40], Training Loss: 7.1016, Validation Loss: 8.9654
Epoch [2/40], Training Loss: 6.7450, Validation Loss: 8.2399
Epoch [3/40], Training Loss: 7.1498, Validation Loss: 8.1713
Epoch [4/40], Training Loss: 8.8702, Validation Loss: 9.9088
Epoch [5/40], Training Loss: 7.1076, Validation Loss: 8.0427
Epoch [6/40], Training Loss: 6.7078, Validation Loss: 7.9278
Epoch [7/40], Training Loss: 6.4937, Validation Loss: 7.9550
Epoch [8/40], Training Loss: 6.4882, Validation Loss: 8.2801
Epoch [9/40], Training Loss: 6.6801, Validation Loss: 7.9369
Epoch [10/40], Training Loss: 6.3999, Validation Loss: 7.9668
Epoch [11/40], Training Loss: 6.4378, Validation Loss: 7.9420
Epoch [12/40], Training Loss: 8.6744, Validation Loss: 9.5872
Epoch [13/40], Training Loss: 6.9954, Validation Loss: 8.0626
Epoch [14/40], Training Loss: 6.5237, Validation Loss: 7.9037
Epoch [15/40], Training Loss: 6.4539, Validation Loss: 7.9047
Epoch [16/40], Training Loss: 6.3987, Validation Loss: 7.9449
Epoch [17/40], Tr

In [104]:
results = generate_prediction(2)
results.to_csv('CNN_FB_PRED.csv')

In [105]:
data = load_preprocess_data('fbfiltered.csv')
train_len = 5
window_size = 3
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data(data, window_size, train_len)
train_loader, val_loader, test_loader = create_dataloaders(X_train, X_val, X_test, y_train, y_val, y_test)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MLP().to(device)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
train_model(model, train_loader, val_loader, num_epochs=40, criterion=criterion, optimizer=optimizer, device=device)
test_model(model, test_loader, criterion=criterion, device=device)


Epoch [1/40], Training Loss: 6.7417, Validation Loss: 8.0485
Epoch [2/40], Training Loss: 6.8809, Validation Loss: 8.5778
Epoch [3/40], Training Loss: 7.2232, Validation Loss: 8.5589
Epoch [4/40], Training Loss: 6.6979, Validation Loss: 7.9980
Epoch [5/40], Training Loss: 6.1280, Validation Loss: 8.1796
Epoch [6/40], Training Loss: 6.4408, Validation Loss: 7.9840
Epoch [7/40], Training Loss: 6.2594, Validation Loss: 8.0282
Epoch [8/40], Training Loss: 6.7607, Validation Loss: 7.9993
Epoch [9/40], Training Loss: 6.4251, Validation Loss: 8.2287
Epoch [10/40], Training Loss: 6.1788, Validation Loss: 7.9834
Epoch [11/40], Training Loss: 6.9092, Validation Loss: 7.8758
Epoch [12/40], Training Loss: 6.2435, Validation Loss: 7.9399
Epoch [13/40], Training Loss: 6.4791, Validation Loss: 7.8440
Epoch [14/40], Training Loss: 6.4375, Validation Loss: 7.8922
Epoch [15/40], Training Loss: 6.8956, Validation Loss: 8.5093
Epoch [16/40], Training Loss: 6.2433, Validation Loss: 7.8325
Epoch [17/40], Tr

In [106]:
results = generate_prediction(2)
results.to_csv('MLP_FB_PRED.csv')

In [96]:
# Linear Regression Pipeline
from sklearn.linear_model import LinearRegression
data = load_preprocess_data('fbfiltered.csv')
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data(data, window_size, train_len)
model = LinearRegression()
model.fit(X_train, y_train)

def get_prediction_ML(input_data, model):
    prediction = model.predict(input_data)
    return prediction.reshape(-1, 1)

def generate_prediction_ML(predict_nums):
    col_numbers = list(range(1, train_len + 1))
    all_data = data.values[:,:train_len]
    input_data = all_data[:,train_len-window_size:train_len]
    input_data = input_data.astype(np.float32)
    predictions = []

    for _ in range(predict_nums):
        pred = get_prediction(input_data, model)
        predictions.append(pred)
        input_data = np.concatenate((input_data[:, 1:], pred), axis=1)

    all_predictions = np.concatenate(predictions, axis=1)

    extended_data = np.concatenate((all_data, all_predictions), axis=1)

    last_col_number = train_len
    predicted_cols = ["predicted_" + str(last_col_number + i) for i in range(1, predict_nums + 1)]
    columns = [str(num) for num in col_numbers] + predicted_cols

    extended_df = pd.DataFrame(extended_data, index=data.index, columns=columns)
    return extended_df

results = generate_prediction_ML(2)
results.to_csv('Linear_FB_PRED.csv')

In [97]:
from sklearn.svm import SVR
data = load_preprocess_data('fbfiltered.csv')
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data(data, window_size, train_len)

model = SVR(C=1.0, kernel='rbf')
model.fit(X_train, y_train.ravel())
results = generate_prediction_ML(2)
results.to_csv('SVR_FB_PRED.csv')

In [98]:
from sklearn.ensemble import RandomForestRegressor
data = load_preprocess_data('fbfiltered.csv')
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data(data, window_size, train_len)
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train.ravel())
results = generate_prediction_ML(2)
results.to_csv('RFR_FB_PRED.csv')

In [99]:
from sklearn.neighbors import KNeighborsRegressor
data = load_preprocess_data('fbfiltered.csv')
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data(data, window_size, train_len)
model = KNeighborsRegressor(n_neighbors=5)
model.fit(X_train, y_train.ravel())
results = generate_prediction_ML(2)
results.to_csv('KNN_FB_PRED.csv')