In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for root_dir, _, names_in_dir in os.walk('/kaggle/input'):
    for entry in names_in_dir:
        full_path = os.path.join(root_dir, entry)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# CSV file that the later cells load with pd.read_csv.
path = (
    "/kaggle/input/flight-delay-prediction/"
    "masterData.csv"
)

In [None]:
# Column that the model is meant to predict.
target_variable = 'DepDelay'

# Read only the first 50 rows: enough to inspect the column names
# without loading the whole file.
df = pd.read_csv(path, nrows=50, index_col=False)
df.columns

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



# ---------------------------------------------------------------------------
# Load and clean the data, encode the features, and build train/test tensors.
# ---------------------------------------------------------------------------

# Assuming 'DepDelay' is the target variable
target_variable = 'DepDelay'

df = pd.read_csv(path, index_col=False)

# Missing precipitation is replaced by 0. The result is assigned back to the
# column instead of calling fillna(inplace=True) on the df['prcp'] selection:
# that chained in-place form raises a FutureWarning on pandas 2.2+ and leaves
# df unchanged once Copy-on-Write is active.
df['prcp'] = df['prcp'].fillna(0)
# Every row that still contains a missing value is dropped.
df.dropna(inplace=True)
# Negative delays are floored at 0 (vectorised; same values as the
# row-by-row lambda it replaces).
df[target_variable] = df[target_variable].clip(lower=0)

# Extracting independent variables (features) and the target variable
X = df.drop(target_variable, axis=1)
y = df[target_variable]

# Separating numerical and categorical columns
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Creating transformers for numerical and categorical columns
numeric_transformer = StandardScaler()
# handle_unknown='ignore' is needed now that the encoder is fitted on the
# training rows only: a category that appears only in the test rows is encoded
# as all zeros (with a warning) instead of raising. Combining it with `drop`
# requires scikit-learn >= 1.0.
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Creating a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split row positions (not the encoded data) so that the scaler and the
# encoder can be fitted on the training rows alone. Fitting them on all rows,
# as before, leaks test-set statistics into the features.
train_idx, test_idx = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)
preprocessor.fit(X.iloc[train_idx])

# The transformer may return either a sparse matrix or a dense ndarray
# depending on the density of the encoded data; only densify when needed.
X_encoded = preprocessor.transform(X)
if hasattr(X_encoded, "toarray"):
    X_encoded = X_encoded.toarray()

# Convert data to PyTorch tensors and move to the selected device
X_tensor = torch.tensor(X_encoded, dtype=torch.float32).to(device)
y_tensor = torch.tensor(y.values, dtype=torch.float32).view(-1, 1).to(device)

# Materialise the training and testing sets from the row positions chosen above
train_rows = torch.as_tensor(train_idx, dtype=torch.long, device=device)
test_rows = torch.as_tensor(test_idx, dtype=torch.long, device=device)
X_train, X_test = X_tensor[train_rows], X_tensor[test_rows]
y_train, y_test = y_tensor[train_rows], y_tensor[test_rows]

# Define the neural network model with dropout
class NeuralNetwork(nn.Module):
    """Fully connected regressor with two ReLU hidden layers and dropout.

    Args:
        input_size: Number of features in each input row.
        hidden_sizes: Pair ``(first, second)`` giving the widths of the two
            hidden layers. Defaults to ``(128, 64)``.
        dropout_rate: Probability handed to ``nn.Dropout``; the same dropout
            module is applied after each hidden activation. Defaults to 0.5.

    Raises:
        ValueError: If ``hidden_sizes`` does not contain exactly two values,
            or if ``nn.Dropout`` rejects ``dropout_rate``.
    """

    def __init__(self, input_size, hidden_sizes=(128, 64), dropout_rate=0.5):
        super().__init__()
        first_width, second_width = hidden_sizes
        # Attribute names are kept as fc1/dropout/fc2/fc3 so that state dicts
        # saved from the earlier hard-coded version still load.
        self.fc1 = nn.Linear(input_size, first_width)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, 1)

    def forward(self, x):
        """Map ``x`` of shape ``(..., input_size)`` to predictions of shape ``(..., 1)``."""
        x = self.dropout(torch.relu(self.fc1(x)))
        x = self.dropout(torch.relu(self.fc2(x)))
        # No activation on the output layer: the network emits a raw real value.
        return self.fc3(x)

# Instantiate the model and move it to the GPU
# Seed PyTorch before the model is built so that weight initialisation,
# dropout masks and DataLoader shuffling repeat across "Restart & Run All"
# (to the extent the backend kernels themselves are deterministic).
torch.manual_seed(42)

# Instantiate the model and move it to the selected device
input_size = X_train.shape[1]
model = NeuralNetwork(input_size).to(device)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the neural network
epochs = 100
batch_size = 64

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Make sure dropout is active, even if this code is re-run after model.eval().
model.train()
for epoch in range(epochs):
    # Sum of per-sample squared errors over the epoch. Kept as a tensor on the
    # model's device so that no host sync happens inside the batch loop.
    running_loss = torch.zeros((), device=device)
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so the
        # (possibly smaller) last batch is not over-represented.
        running_loss += loss.detach() * inputs.size(0)

    if epoch % 10 == 0:
        # Report the mean loss over the whole epoch, not just the final
        # mini-batch, which is a noisy and misleading progress signal.
        epoch_loss = running_loss.item() / len(train_dataset)
        print(f'Epoch {epoch}/{epochs}, Loss: {epoch_loss}')

# Score the held-out split with dropout disabled and gradient tracking off
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_loss = criterion(test_outputs, y_test)
print(f'Test Loss: {test_loss.item()}')

# Host-side NumPy copies of predictions and targets for scikit-learn and plotting
test_outputs_cpu = test_outputs.cpu().numpy()
y_test_cpu = y_test.cpu().numpy()

# Regression metrics on the test set
mse = mean_squared_error(y_test_cpu, test_outputs_cpu)
mae = mean_absolute_error(y_test_cpu, test_outputs_cpu)
rmse = np.sqrt(mse)

print(f'Mean Squared Error on Test Set: {mse}')
print(f'Root Mean Squared Error on Test Set: {rmse}')
print(f'Mean Absolute Error on Test Set: {mae}')

y_test_flatten = y_test_cpu.flatten()

# Actual-vs-predicted scatter, saved to disk and then displayed
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test_flatten, y=test_outputs_cpu.flatten(), ax=ax)
ax.set_xlabel('Actual DepDelay')
ax.set_ylabel('Predicted DepDelay')
ax.set_title('Neural Network - Actual vs Predicted DepDelay')
fig.savefig('neural_network_scatter_plot.png', bbox_inches='tight')
plt.show()


In [None]:
# Display the column labels of `df`. When the cells are run in order, `df` here
# is the frame re-read and cleaned by the training cell, not the 50-row preview.
df.columns