In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Reload the dataset
file_path = '/content/drive/MyDrive/ruhan/data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()
#data=data[1:3000]

Unnamed: 0,Date,Visits,HHS,Price,MRP,Discount,Feature,TPR,Display,Wind Speed,Precipitation,Tmax,Tmin,Tavg,Gasoline Price,Diesel Price,Unemployment,Econ_Index,Sales
0,03-01-2009,0.477909,1.916163,2.241061,-1.153256,0.29249,-1.108803,0.029864,-0.894397,9.058333,11.082903,8.854501,8.901679,10.482876,6.172205,6.548854,8.874537,7.97354,2.206254
1,07-01-2009,-0.119089,-1.642738,-1.581899,1.126124,1.26716,-0.307052,0.956087,1.561272,10.218779,7.696473,10.492287,8.636826,9.415671,8.794827,13.701815,9.972008,7.88379,0.868935
2,10-01-2009,-0.066975,0.327336,0.453339,0.002602,-2.050314,0.493035,-0.39288,-1.323589,9.765773,9.395913,6.873221,8.402802,9.077815,12.111822,10.685099,6.473272,10.647103,0.410874
3,11-01-2009,-0.09887,0.449552,0.308392,-0.396971,-1.118038,1.647679,-0.040481,0.129102,11.947505,9.039843,9.396631,7.644043,7.606826,11.622727,12.709938,9.854994,12.005865,1.554823
4,13-01-2009,0.630808,0.84535,1.363512,-0.208058,-0.300008,-0.290472,-0.094031,0.022473,10.171042,9.400105,9.938099,5.914005,9.5597,10.712082,12.953197,8.962563,8.382174,0.176334


In [3]:
# Define column groups for each category
categories = {
    "Point-of-Sales": ["Visits", "HHS", "Price", "MRP"],
    "Promotions": ["Discount", "Feature", "TPR", "Display"],
    "Store": [],  # Add store-related columns if available
    "Weather": ["Wind Speed", "Precipitation", "Tmax", "Tmin", "Tavg"],
    "Economic Indicators": ["Gasoline Price", "Diesel Price", "Unemployment", "Econ_Index"]
}

# Bin the continuous Sales variable into discrete categories
sales_bins = pd.qcut(data["Sales"], q=3, labels=["Low", "Medium", "High"])
data["Sales_Category"] = sales_bins

# Use the binned Sales_Category as the target variable
y = data["Sales_Category"]

# Sample a smaller subset of the dataset for each category to reduce memory usage
smaller_sampled_data = data.sample(n=500, random_state=42)
y_smaller_sampled = smaller_sampled_data["Sales_Category"]

# Initialize an empty dictionary to store PCA and NCA summaries for the smaller sampled data
pca_nca_summaries = {}

for category, columns in categories.items():
    # Check if the category has columns
    if not columns:
        continue

    # Select features for the category
    X_smaller_sampled = smaller_sampled_data[columns]

    # Standardize features
    scaler = StandardScaler()
    X_smaller_sampled_scaled = scaler.fit_transform(X_smaller_sampled)

    # Apply PCA
    pca = PCA(n_components=min(len(columns), 5))
    X_pca = pca.fit_transform(X_smaller_sampled_scaled)
    explained_variance_pca = pca.explained_variance_ratio_
    cumulative_variance_pca = np.cumsum(explained_variance_pca)

    # Store PCA results
    pca_summary = pd.DataFrame({
        "Component": range(1, len(explained_variance_pca) + 1),
        "Total": pca.explained_variance_,
        "% of var": explained_variance_pca,
        "Cumulative %": cumulative_variance_pca
    })


    # Store both summaries
    pca_nca_summaries[category] = {"PCA": pca_summary}

# Display the PCA and NCA summaries for all categories on the smaller sampled data
for category, summaries in pca_nca_summaries.items():
    print(f"\nPCA Summary for {category}:")
    print(summaries["PCA"])




PCA Summary for Point-of-Sales:
   Component     Total  % of var  Cumulative %
0          1  2.344546  0.584964      0.584964
1          2  1.620497  0.404314      0.989278
2          3  0.041232  0.010287      0.999566
3          4  0.001740  0.000434      1.000000

PCA Summary for Promotions:
   Component     Total  % of var  Cumulative %
0          1  1.701650  0.424562      0.424562
1          2  1.116466  0.278558      0.703120
2          3  1.022201  0.255039      0.958159
3          4  0.167698  0.041841      1.000000

PCA Summary for Weather:
   Component     Total  % of var  Cumulative %
0          1  1.109590  0.221474      0.221474
1          2  1.080185  0.215605      0.437079
2          3  1.006924  0.200982      0.638061
3          4  0.955361  0.190690      0.828751
4          5  0.857959  0.171249      1.000000

PCA Summary for Economic Indicators:
   Component     Total  % of var  Cumulative %
0          1  1.066514  0.266095      0.266095
1          2  1.057572  0.26

In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Ensure 'Date' column is in datetime format (handling different formats)
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y', errors='coerce')

# Drop rows where date conversion failed
data = data.dropna(subset=['Date'])

# Drop non-numeric columns before aggregation
numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
weekly_data = data.groupby(data['Date'].dt.to_period('W'))[numeric_columns].mean()

# Define features and target
X = weekly_data.drop(columns=["Sales"]).values  # Only numeric features
y = weekly_data["Sales"].values  # Target variable (continuous)

# Shift target variable by 1 week for next-week prediction
y_shifted = np.roll(y, -1)
y_shifted[-1] = np.nan  # Last week's prediction is undefined (assign NaN)
weekly_data = weekly_data[:-1]  # Drop last row with NaN target
X = X[:-1]
y = y_shifted[:-1]

# Split dataset into training and testing sets (train: earlier weeks, test: later weeks)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA
pca = PCA(n_components=min(X_train_scaled.shape[1], 5))
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Reshape data for LSTM (LSTM expects 3D input: [samples, timesteps, features])
X_train_lstm = X_train_pca.reshape((X_train_pca.shape[0], 1, X_train_pca.shape[1]))
X_test_lstm = X_test_pca.reshape((X_test_pca.shape[0], 1, X_test_pca.shape[1]))

# Build the LSTM model
model = Sequential([
    LSTM(50, activation='relu', return_sequences=True, input_shape=(1, X_train_pca.shape[1])),
    Dropout(0.2),
    LSTM(50, activation='relu'),
    Dropout(0.2),
    Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X_train_lstm, y_train, epochs=100, batch_size=16, validation_data=(X_test_lstm, y_test), verbose=1)

# Make predictions
y_pred = model.predict(X_test_lstm)

# Evaluate model performance
lstm_mae = mean_absolute_error(y_test, y_pred)
lstm_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print evaluation results
print("LSTM Model Performance:")
print(f"MAE: {lstm_mae}")
print(f"RMSE: {lstm_rmse}")


Epoch 1/100


  super().__init__(**kwargs)


[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 105ms/step - loss: 2.2384 - val_loss: 1.9567
Epoch 2/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.8661 - val_loss: 1.4044
Epoch 3/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.2551 - val_loss: 0.6641
Epoch 4/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.7205 - val_loss: 0.4662
Epoch 5/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.5219 - val_loss: 0.4181
Epoch 6/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.4615 - val_loss: 0.3833
Epoch 7/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.3805 - val_loss: 0.3415
Epoch 8/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.4153 - val_loss: 0.3113
Epoch 9/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3