<a href="https://colab.research.google.com/github/Anushka108/CNN-for-Tabular-Regression/blob/main/CNN_for_Tabular_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read in the download loop.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# The URL below is a signed link (note its X-Goog-Date / X-Goog-Expires query
# parameters), so it is expected to stop working once the signature expires.
DATA_SOURCE_MAPPING = 'restaurant-revenue-prediction-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5289783%2F8797164%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240707%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240707T050933Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D20b20d7c0051b9e0312553ea6486e25673baed975fb8f341b863bfb2a5104c98c6950603e7310034a13c3c7f9a7ce2fe9e0bd4217bae7957f9fd939b47adb49e3f4463cb6b0771b7eb2b0109022c16ca349d9c7fc4415f0b243ea2976a7751b37c5c4a0285c7e22df23c75c2cf64636ccf844fdd26bb1bfb482a315d7aab6d5b41f31514d923fc2d566e5e9c80488feb7ac34464551da670898b7113c2cf1d9d6ddb1f147aa58f9e2352079935e35998c424b4a4730cfb9634320a941cb13eb5bbf68ad0949109588a9bed93986d7dcefb64bf469e28a6174df22cf3e8abf3ffaea8a0b6efc9e2fa9a240f5f707708dfb3d4a0ac6c27f7d66450251692c26559'

# Directories that are created locally to mimic Kaggle's filesystem layout.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible notebook — confirm before removing.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of
# DATA_SOURCE_MAPPING and unpack the archive into KAGGLE_INPUT_PATH/<directory>.
# A source that cannot be fetched is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so an unexpected ':' later in the entry
    # cannot break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header is optional; without a usable value the
            # progress bar simply stays empty instead of int(None) raising an
            # uncaught TypeError.
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # Clamp to the bar width in case more bytes arrive than announced.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the temp file by name, so buffered bytes
            # must reach the disk before extraction starts.
            tfile.flush()
            # NOTE: extractall() trusts the member paths stored in the archive;
            # only use this with sources you trust.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name for the opened archive: binding it to
                # `tarfile` would replace the module and break the next tar
                # download in this loop.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


### **Importing Libraries**

In [None]:
import pandas as pd # pandas for data manipulation
import matplotlib.pyplot as plt # Matplot.lib for data visualisations
from sklearn.preprocessing import LabelEncoder, StandardScaler # Normalisation model
from sklearn.model_selection import train_test_split # Splitting data to train and test proportions
from sklearn.metrics import mean_squared_error, r2_score # Evaluation metrics
from tensorflow.keras.models import Sequential # importing the base model
from tensorflow.keras.layers import Dense, Dropout # importing hidden layers
from tensorflow.keras.optimizers import Adam # importing optimiser
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau # Callbacck functions to confirm  betterment in model

### **Data loading**

In [None]:
# Read the restaurant revenue CSV from the Kaggle input folder and preview
# its first five rows.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/restaurant-revenue-prediction-dataset/restaurant_data.csv"
)
df.head(n=5)

### **Preprocessing data**

In [None]:
# Count the missing values in each column.
df.isna().sum(axis=0)

In [None]:
# dropna() returns a new DataFrame and leaves the original untouched, so the
# result has to be assigned back for the rows with missing values to actually
# be removed from `df`.
df = df.dropna()
# Keep the cell output: show the cleaned frame.
df

In [None]:
# Re-check the number of missing values per column.
df.isna().sum(axis=0)

**Encoding categorical variable**

In [None]:
# One encoder instance, re-fitted for every text column in turn
label_encoder = LabelEncoder()

# Replace each object-typed column with its integer codes
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    encoded_values = label_encoder.fit_transform(df[column])
    df[column] = encoded_values

In [None]:
# Separate the target column from the predictor columns

y = df['Revenue']
X = df.drop(columns=['Revenue'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

**Normalising data**

In [None]:
# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted scaler is then applied to the test rows.

scaler = StandardScaler()
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)

### **Model setting up**

In [None]:
# Build the fully connected regression network one layer at a time

model = Sequential()
# First hidden layer; the input width is the number of feature columns
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],), kernel_initializer='he_normal'))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu', kernel_initializer='he_normal'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu', kernel_initializer='he_normal'))
# Output layer: a single unit with no activation specified
model.add(Dense(1))

In [None]:
# Configure training: Adam with a 1e-3 learning rate, mean squared error as loss
optimizer = Adam(learning_rate=1e-3)
model.compile(loss='mean_squared_error', optimizer=optimizer)

In [None]:
# Training callbacks, both watching the validation loss (the Keras default,
# spelled out here):
# - stop after 20 epochs without improvement and restore the best weights
# - multiply the learning rate by 0.2 after 10 stagnant epochs, down to 1e-6
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=10,
    min_lr=1e-6,
)

In [None]:
# Training hyperparameters and the spacing of the intermediate loss plots
epochs = 500 # maximum number of passes over the training data (early stopping may end sooner)
batch_size = 32 # number of samples the model trains on in one iteration
validation_split = 0.2 # fraction of the training data passed to fit() as validation split
interval = 100  # number of epochs between two intermediate loss plots

In [None]:
# Show the shapes of the scaled training features and the training target,
# one per line.
print(X_train_scaled.shape, y_train.shape, sep='\n')

### **Initialising training**

In [None]:
# Train on the scaled features; part of the training data is held out for
# validation and the callbacks defined earlier are attached.
training_callbacks = [early_stopping, reduce_lr]
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    callbacks=training_callbacks,
    validation_split=validation_split,
)

### **Plotting mid training curves**

In [None]:
# Intermediate visualization function
def plot_intermediate(history, interval, epochs):
    """Show the loss curves as they looked at regular points during training.

    One figure is displayed for every `interval`-th epoch, containing the
    training loss (and the validation loss, when it was recorded) up to and
    including that epoch.

    Args:
        history: Object whose `history` attribute is a dict with a per-epoch
            'loss' list and, optionally, a 'val_loss' list (as returned by
            `model.fit`).
        interval: Number of epochs between two consecutive snapshots.
        epochs: Upper bound on the epochs to visualise, normally the value
            that was requested from `model.fit`.
    """
    losses = history.history['loss']
    val_losses = history.history.get('val_loss')
    # Training may have stopped before `epochs` was reached (early stopping),
    # so never plot or label epochs that were not actually run.
    last_epoch = min(epochs, len(losses))
    for i in range(0, last_epoch, interval):
        # A fresh figure per snapshot: once plt.show() has displayed a figure,
        # later plotting calls would otherwise land on a new default-sized one.
        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        plt.plot(losses[:i+1], label='Training Loss')
        if val_losses is not None:
            plt.plot(val_losses[:i+1], label='Validation Loss')
        plt.title(f'Model Loss at Epoch {i+1}')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

plot_intermediate(history, interval, epochs)

### **Model Evaluation**

In [None]:
# Score the trained model on the held-out test rows
y_pred = model.predict(X_test_scaled)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

In [None]:
# Draw the complete training and validation loss curves in one panel
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()