In [234]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [235]:
historical_transactions = pd.read_csv("Historical-transaction-data.csv")
store_info = pd.read_csv("Store-info.csv")

In [236]:
store_info.head()

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile
0,SHOP047,528,Moderate
1,SHOP009,676,High
2,SHOP083,676,Low
3,SHOP117,676,Low
4,SHOP042,676,Low


In [237]:
historical_transactions.head()

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold
0,ORANGE BARLEY 1.5L,2021-12-11T00:00:00.000Z,147.0,BGXA,SHOP008,220,2
1,GINGER BEER 1.5L,2021-10-17T00:00:00.000Z,371.0,IA25,SHOP112,220,2
2,TONIC PET 500ML,2021-12-13T00:00:00.000Z,484.0,VN7V,SHOP008,160,2
3,CREAM SODA 1L,2021-12-13T00:00:00.000Z,484.0,VN7V,SHOP008,150,2
4,STRAWBERRY MILK 180ML,2021-10-23T00:00:00.000Z,1310.0,7S00,SHOP112,210,5


In [238]:
merged_data = pd.merge(historical_transactions, store_info, on="shop_id")

In [239]:
merged_data.head()

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile
0,ORANGE BARLEY 1.5L,2021-12-11T00:00:00.000Z,147.0,BGXA,SHOP008,220,2,678,Moderate
1,TONIC PET 500ML,2021-12-13T00:00:00.000Z,484.0,VN7V,SHOP008,160,2,678,Moderate
2,CREAM SODA 1L,2021-12-13T00:00:00.000Z,484.0,VN7V,SHOP008,150,2,678,Moderate
3,GINGER BEER 1.5L,2021-12-10T00:00:00.000Z,1000053.0,VT9C,SHOP008,220,1,678,Moderate
4,GINGER BEER 1.5L,2021-12-10T00:00:00.000Z,1000057.0,8QLS,SHOP008,440,1,678,Moderate


In [240]:
# Convert transaction_date to datetime format
merged_data['transaction_date'] = pd.to_datetime(merged_data['transaction_date'])

In [241]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 473974 entries, 0 to 473973
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype              
---  ------            --------------   -----              
 0   item_description  438046 non-null  object             
 1   transaction_date  473974 non-null  datetime64[ns, UTC]
 2   invoice_id        467654 non-null  float64            
 3   customer_id       473974 non-null  object             
 4   shop_id           473974 non-null  object             
 5   item_price        473974 non-null  int64              
 6   quantity_sold     473974 non-null  int64              
 7   shop_area_sq_ft   473974 non-null  int64              
 8   shop_profile      387341 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(1), int64(3), object(4)
memory usage: 36.2+ MB


In [242]:
merged_data.isna().sum()

item_description    35928
transaction_date        0
invoice_id           6320
customer_id             0
shop_id                 0
item_price              0
quantity_sold           0
shop_area_sq_ft         0
shop_profile        86633
dtype: int64

In [243]:
# Calculate sales per transaction
merged_data['sales'] = merged_data['item_price'] * merged_data['quantity_sold']


In [244]:
store_metrics = merged_data.groupby('shop_id').agg({
    'sales': 'sum',
    'invoice_id': 'count',
    'customer_id': pd.Series.nunique
}).reset_index()

store_metrics.columns = ['shop_id', 'total_sales', 'transaction_count', 'unique_customers']
store_metrics['avg_transaction_value'] = store_metrics['total_sales'] / store_metrics['transaction_count']
store_metrics['avg_sales_per_customer'] = store_metrics['total_sales'] / store_metrics['unique_customers']

# Merge the store_metrics dataframe with store_info dataframe
store_profile_data = pd.merge(store_info, store_metrics, on="shop_id")

# Calculate sales per square foot
store_profile_data['sales_per_sq_ft'] = store_profile_data['total_sales'] / store_profile_data['shop_area_sq_ft']


In [245]:
store_profile_data.head()

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile,total_sales,transaction_count,unique_customers,avg_transaction_value,avg_sales_per_customer,sales_per_sq_ft
0,SHOP047,528,Moderate,842960,1687,928,499.679905,908.362069,1596.515152
1,SHOP009,676,High,1970870,4521,2498,435.93674,788.979183,2915.488166
2,SHOP083,676,Low,1691985,3583,1900,472.225788,890.518421,2502.936391
3,SHOP117,676,Low,2325980,4023,2037,578.17052,1141.865488,3440.798817
4,SHOP042,676,Low,1340215,3232,1841,414.670483,727.982075,1982.566568


In [246]:
store_profile_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124 entries, 0 to 123
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   shop_id                 124 non-null    object 
 1   shop_area_sq_ft         124 non-null    int64  
 2   shop_profile            100 non-null    object 
 3   total_sales             124 non-null    int64  
 4   transaction_count       124 non-null    int64  
 5   unique_customers        124 non-null    int64  
 6   avg_transaction_value   124 non-null    float64
 7   avg_sales_per_customer  124 non-null    float64
 8   sales_per_sq_ft         124 non-null    float64
dtypes: float64(3), int64(4), object(2)
memory usage: 9.7+ KB


In [247]:
store_profile_data.isna().sum()

shop_id                    0
shop_area_sq_ft            0
shop_profile              24
total_sales                0
transaction_count          0
unique_customers           0
avg_transaction_value      0
avg_sales_per_customer     0
sales_per_sq_ft            0
dtype: int64

In [154]:
store_profile_data = store_profile_data.dropna(subset=['shop_profile'])

In [208]:
store_profile_data.isna().sum()

shop_id                    0
shop_area_sq_ft            0
shop_profile              24
total_sales                0
transaction_count          0
unique_customers           0
avg_transaction_value      0
avg_sales_per_customer     0
sales_per_sq_ft            0
dtype: int64

In [248]:
X = store_profile_data[['total_sales', 'transaction_count', 'unique_customers', 'avg_transaction_value', 'avg_sales_per_customer', 'shop_area_sq_ft', 'sales_per_sq_ft']]
y = store_profile_data['shop_profile']

In [249]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [251]:
merged_data['day_of_week'] = merged_data['transaction_date'].dt.dayofweek


In [252]:
merged_data['month'] = merged_data['transaction_date'].dt.month


In [253]:
def month_to_season(month):
    if month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    elif month in [9, 10, 11]:
        return 'fall'
    else:
        return 'winter'

merged_data['season'] = merged_data['month'].apply(month_to_season)


In [254]:
# One-hot encoding for day_of_week and season
merged_data = pd.get_dummies(merged_data, columns=['day_of_week', 'season'])


In [255]:
merged_data.head()

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile,sales,month,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,season_fall,season_winter
0,ORANGE BARLEY 1.5L,2021-12-11 00:00:00+00:00,147.0,BGXA,SHOP008,220,2,678,Moderate,440,12,0,0,0,0,0,1,0,0,1
1,TONIC PET 500ML,2021-12-13 00:00:00+00:00,484.0,VN7V,SHOP008,160,2,678,Moderate,320,12,1,0,0,0,0,0,0,0,1
2,CREAM SODA 1L,2021-12-13 00:00:00+00:00,484.0,VN7V,SHOP008,150,2,678,Moderate,300,12,1,0,0,0,0,0,0,0,1
3,GINGER BEER 1.5L,2021-12-10 00:00:00+00:00,1000053.0,VT9C,SHOP008,220,1,678,Moderate,220,12,0,0,0,0,1,0,0,0,1
4,GINGER BEER 1.5L,2021-12-10 00:00:00+00:00,1000057.0,8QLS,SHOP008,440,1,678,Moderate,440,12,0,0,0,0,1,0,0,0,1


In [256]:
merged_data.isna().sum()

item_description    35928
transaction_date        0
invoice_id           6320
customer_id             0
shop_id                 0
item_price              0
quantity_sold           0
shop_area_sq_ft         0
shop_profile        86633
sales                   0
month                   0
day_of_week_0           0
day_of_week_1           0
day_of_week_2           0
day_of_week_3           0
day_of_week_4           0
day_of_week_5           0
day_of_week_6           0
season_fall             0
season_winter           0
dtype: int64

In [276]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 473974 entries, 0 to 473973
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype              
---  ------            --------------   -----              
 0   item_description  438046 non-null  object             
 1   transaction_date  473974 non-null  datetime64[ns, UTC]
 2   invoice_id        467654 non-null  float64            
 3   customer_id       473974 non-null  object             
 4   shop_id           473974 non-null  object             
 5   item_price        473974 non-null  int64              
 6   quantity_sold     473974 non-null  int64              
 7   shop_area_sq_ft   473974 non-null  int64              
 8   shop_profile      387341 non-null  object             
 9   sales             473974 non-null  int64              
 10  month             473974 non-null  int64              
 11  day_of_week_0     473974 non-null  uint8              
 12  day_of_week_1     473974 non-null  uint8    

In [275]:
import pandas as pd

# Assuming your DataFrame is named 'data'
grouped_data = merged_data.groupby('shop_id')


In [278]:
# Total sales per shop
total_sales = grouped_data['sales'].sum()

# Average item price per shop
average_item_price = grouped_data['item_price'].mean()

# Total quantity sold per shop
total_quantity_sold = grouped_data['quantity_sold'].sum()

# Average sales per day of the week per shop
day_of_week_columns = [f'day_of_week_{i}' for i in range(7)]
average_sales_day_of_week = grouped_data[day_of_week_columns].mean()

# Seasonal sales (fall and winter) per shop
season_columns = ['season_fall', 'season_winter']
seasonal_sales = grouped_data[season_columns].sum()


In [279]:
shops_data = pd.DataFrame({
    'total_sales': total_sales,
    'average_item_price': average_item_price,
    'total_quantity_sold': total_quantity_sold,
})
shops_data = shops_data.join(average_sales_day_of_week)
shops_data = shops_data.join(seasonal_sales)


In [281]:
# Mode shop_profile per shop (using first mode if multiple modes exist)
shop_profile_mode = grouped_data['shop_profile'].agg(pd.Series.mode).apply(lambda x: x[0] if isinstance(x, pd.Series) else x)

In [282]:
shops_data = shops_data.join(shop_profile_mode)

In [283]:
shops_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 124 entries, SHOP001 to SHOP127
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   total_sales          124 non-null    int64  
 1   average_item_price   124 non-null    float64
 2   total_quantity_sold  124 non-null    int64  
 3   day_of_week_0        124 non-null    float64
 4   day_of_week_1        124 non-null    float64
 5   day_of_week_2        124 non-null    float64
 6   day_of_week_3        124 non-null    float64
 7   day_of_week_4        124 non-null    float64
 8   day_of_week_5        124 non-null    float64
 9   day_of_week_6        124 non-null    float64
 10  season_fall          124 non-null    float64
 11  season_winter        124 non-null    float64
 12  shop_profile         124 non-null    object 
dtypes: float64(10), int64(2), object(1)
memory usage: 17.6+ KB


In [267]:
# Aggregate transaction-level data to store-level
transaction_features = merged_data.groupby('shop_id').agg({
    'month': 'mean',
    'day_of_week_0': 'sum',
    'day_of_week_1': 'sum',
    'day_of_week_2': 'sum',
    'day_of_week_3': 'sum',
    'day_of_week_4': 'sum',
    'day_of_week_5': 'sum',
    'day_of_week_6': 'sum',
    'season_fall': 'sum',
    'season_winter': 'sum',
}).reset_index()

# Merge aggregated transaction-level features into store-level DataFrame
store_profile_data = store_profile_data.merge(transaction_features, on='shop_id')


In [268]:
merged_data.head()

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile,sales,month,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,season_fall,season_winter
0,ORANGE BARLEY 1.5L,2021-12-11 00:00:00+00:00,147.0,BGXA,SHOP008,220,2,678,Moderate,440,12,0,0,0,0,0,1,0,0,1
1,TONIC PET 500ML,2021-12-13 00:00:00+00:00,484.0,VN7V,SHOP008,160,2,678,Moderate,320,12,1,0,0,0,0,0,0,0,1
2,CREAM SODA 1L,2021-12-13 00:00:00+00:00,484.0,VN7V,SHOP008,150,2,678,Moderate,300,12,1,0,0,0,0,0,0,0,1
3,GINGER BEER 1.5L,2021-12-10 00:00:00+00:00,1000053.0,VT9C,SHOP008,220,1,678,Moderate,220,12,0,0,0,0,1,0,0,0,1
4,GINGER BEER 1.5L,2021-12-10 00:00:00+00:00,1000057.0,8QLS,SHOP008,440,1,678,Moderate,440,12,0,0,0,0,1,0,0,0,1


In [272]:
store_profile_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124 entries, 0 to 123
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   shop_id                 124 non-null    object 
 1   shop_area_sq_ft         124 non-null    int64  
 2   shop_profile            100 non-null    object 
 3   total_sales             124 non-null    int64  
 4   transaction_count       124 non-null    int64  
 5   unique_customers        124 non-null    int64  
 6   avg_transaction_value   124 non-null    float64
 7   avg_sales_per_customer  124 non-null    float64
 8   sales_per_sq_ft         124 non-null    float64
 9   month_x                 124 non-null    float64
 10  day_of_week_0_x         124 non-null    float64
 11  day_of_week_1_x         124 non-null    float64
 12  day_of_week_2_x         124 non-null    float64
 13  day_of_week_3_x         124 non-null    float64
 14  day_of_week_4_x         124 non-null    fl

Neural Network Implementation

In [221]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [260]:
# Preprocess the input features by scaling them
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [261]:
# Encode the labels into integers
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

In [262]:
# Define the neural network model
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=X_train_scaled.shape[1]))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(8, activation='relu'))
model.add(Dense(len(encoder.classes_), activation='softmax'))

In [263]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [264]:
# Train the model
history = model.fit(X_train_scaled, y_train_encoded, validation_data=(X_test_scaled, y_test_encoded), epochs=500, batch_size=8)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [265]:
# Evaluate the model
_, test_accuracy = model.evaluate(X_test_scaled, y_test_encoded)
print("Test accuracy: {:.2f}".format(test_accuracy))

Test accuracy: 0.44


In [108]:
# Save the weights
model.save_weights('./acc0.84/my_checkpoint')

In [109]:
store_profile_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124 entries, 0 to 123
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   shop_id                 124 non-null    object 
 1   shop_area_sq_ft         124 non-null    int64  
 2   shop_profile            100 non-null    object 
 3   total_sales             124 non-null    int64  
 4   transaction_count       124 non-null    int64  
 5   unique_customers        124 non-null    int64  
 6   avg_transaction_value   124 non-null    float64
 7   avg_sales_per_customer  124 non-null    float64
 8   sales_per_sq_ft         124 non-null    float64
dtypes: float64(3), int64(4), object(2)
memory usage: 9.7+ KB


In [115]:
store_profile_data.isna().sum()

shop_id                    0
shop_area_sq_ft            0
shop_profile              24
total_sales                0
transaction_count          0
unique_customers           0
avg_transaction_value      0
avg_sales_per_customer     0
sales_per_sq_ft            0
dtype: int64

Bootstrapping

In [110]:
import pandas as pd
import numpy as np

def create_bootstrapped_dataset(df, n_samples):
    bootstrapped_data = df.sample(n=n_samples, replace=True, random_state=np.random.randint(0, 1e5))
    return bootstrapped_data

# Create 10 bootstrapped datasets
n_bootstraps = 10
bootstrapped_datasets = [create_bootstrapped_dataset(store_profile_data, len(store_profile_data)) for _ in range(n_bootstraps)]


In [111]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

def create_neural_network(input_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model




In [112]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.utils import to_categorical

def preprocess_data(df, target_col):
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    le = LabelEncoder()
    y = le.fit_transform(y)

    y = to_categorical(y)

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_val, y_train, y_val, le


In [113]:
def preprocess_data(df, target_col):
    df = df.drop(columns=['shop_id'])  # Drop the shop_id column
    X = df.drop(columns=[target_col])
    y = df[target_col]

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    le = LabelEncoder()
    y = le.fit_transform(y)

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    return X_train, X_val, y_train, y_val, le

In [114]:
n_bootstraps = 10
ensemble_models = []

for i, bootstrapped_data in enumerate(bootstrapped_datasets):
    print(f"Training model {i + 1}/{n_bootstraps}")

    X_train, X_val, y_train, y_val, le = preprocess_data(bootstrapped_data, target_col='shop_profile')

    model = create_neural_network(input_dim=X_train.shape[1])
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32, verbose=0)

    ensemble_models.append(model)


Training model 1/10


InvalidArgumentError: Graph execution error:

Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits' defined at (most recent call last):
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\traitlets\config\application.py", line 1043, in launch_instance
      app.start()
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelapp.py", line 725, in start
      self.io_loop.start()
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 409, in dispatch_shell
      await result
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\ipykernel\ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\ipykernel\zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 2961, in run_cell
      result = self._run_cell(
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3016, in _run_cell
      result = runner(coro)
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3221, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3400, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\naham\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3460, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\naham\AppData\Local\Temp\ipykernel_17788\3809908725.py", line 10, in <module>
      model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32, verbose=0)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1051, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1109, in compute_loss
      return self.compiled_loss(
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\losses.py", line 142, in __call__
      losses = call_fn(y_true, y_pred)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\losses.py", line 268, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\losses.py", line 2078, in sparse_categorical_crossentropy
      return backend.sparse_categorical_crossentropy(
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\backend.py", line 5660, in sparse_categorical_crossentropy
      res = tf.nn.sparse_softmax_cross_entropy_with_logits(
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits'
Received a label value of 3 which is outside the valid range of [0, 3).  Label values: 1 2 0 3 0 3 0 0 2 0 3 1 2 3 1 2 2 0 0 0 1 0 2 3 2 0 0 1 2 2 1 0
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_142838]

In [84]:
def ensemble_predictions(models, X):
    predictions = []

    for model in models:
        pred = model.predict(X)
        predictions.append(pred)

    return np.mean(predictions, axis=0)

In [85]:
from sklearn.metrics import classification_report, f1_score

# Preprocess the original dataset (without bootstrapping)
X_test, _, y_test, _, le = preprocess_data(store_profile_data, target_col='shop_profile')

# Make predictions using the ensemble
predictions = []
for model in ensemble_models:
    pred = model.predict(X_test)
    pred_classes = np.argmax(pred, axis=1)
    predictions.append(pred_classes)

# Combine predictions from all models (majority vote)
ensemble_predictions = np.array(predictions).T
final_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=ensemble_predictions)

# Evaluate the ensemble
print(classification_report(y_test, final_predictions, target_names=le.classes_))
f1 = f1_score(y_test, final_predictions, average='weighted')
print(f"F1 Score: {f1:.2f}")


              precision    recall  f1-score   support

        High       0.65      0.96      0.77        25
         Low       0.72      0.72      0.72        29
    Moderate       0.86      0.46      0.60        26

    accuracy                           0.71        80
   macro avg       0.74      0.72      0.70        80
weighted avg       0.74      0.71      0.70        80

F1 Score: 0.70


In [86]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score

n_bootstraps = 10
ensemble_models_svm = []
ensemble_models_rf = []

# Train SVM and Random Forest models on bootstrapped datasets
for i, bootstrapped_data in enumerate(bootstrapped_datasets):
    print(f"Training SVM and Random Forest {i + 1}/{n_bootstraps}")

    X_train, X_val, y_train, y_val, le = preprocess_data(bootstrapped_data, target_col='shop_profile')

    # Train SVM
    svm = SVC(kernel='linear', C=1, probability=True)
    svm.fit(X_train, y_train)
    ensemble_models_svm.append(svm)

    # Train Random Forest
    rf = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, min_samples_leaf=1)
    rf.fit(X_train, y_train)
    ensemble_models_rf.append(rf)

# Preprocess the original dataset (without bootstrapping)
X_test, _, y_test, _, le = preprocess_data(store_profile_data, target_col='shop_profile')

# Make predictions using the SVM ensemble
predictions_svm = []
for model in ensemble_models_svm:
    pred = model.predict_proba(X_test)
    pred_classes = np.argmax(pred, axis=1)
    predictions_svm.append(pred_classes)

# Make predictions using the Random Forest ensemble
predictions_rf = []
for model in ensemble_models_rf:
    pred = model.predict_proba(X_test)
    pred_classes = np.argmax(pred, axis=1)
    predictions_rf.append(pred_classes)

# Combine predictions from SVM models (majority vote)
ensemble_predictions_svm = np.array(predictions_svm).T
final_predictions_svm = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=ensemble_predictions_svm)

# Combine predictions from Random Forest models (majority vote)
ensemble_predictions_rf = np.array(predictions_rf).T
final_predictions_rf = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=ensemble_predictions_rf)

# Evaluate the SVM ensemble
print("\nSVM Ensemble:")
print(classification_report(y_test, final_predictions_svm, target_names=le.classes_))
f1_svm = f1_score(y_test, final_predictions_svm, average='weighted')
print(f"F1 Score: {f1_svm:.2f}")

# Evaluate the Random Forest ensemble
print("\nRandom Forest Ensemble:")
print(classification_report(y_test, final_predictions_rf, target_names=le.classes_))
f1_rf = f1_score(y_test, final_predictions_rf, average='weighted')
print(f"F1 Score: {f1_rf:.2f}")


Training SVM and Random Forest 1/10
Training SVM and Random Forest 2/10
Training SVM and Random Forest 3/10
Training SVM and Random Forest 4/10
Training SVM and Random Forest 5/10
Training SVM and Random Forest 6/10
Training SVM and Random Forest 7/10
Training SVM and Random Forest 8/10
Training SVM and Random Forest 9/10
Training SVM and Random Forest 10/10

SVM Ensemble:
              precision    recall  f1-score   support

        High       0.67      0.80      0.73        25
         Low       0.56      0.76      0.65        29
    Moderate       0.45      0.19      0.27        26

    accuracy                           0.59        80
   macro avg       0.56      0.58      0.55        80
weighted avg       0.56      0.59      0.55        80

F1 Score: 0.55

Random Forest Ensemble:
              precision    recall  f1-score   support

        High       0.79      0.92      0.85        25
         Low       0.79      0.90      0.84        29
    Moderate       0.94      0.65      0

In [87]:
# Evaluate the Random Forest ensemble
print("\nRandom Forest Ensemble:")
print(classification_report(y_test, final_predictions_rf, target_names=le.classes_))
f1_rf = f1_score(y_test, final_predictions_rf, average='weighted')
print(f"F1 Score: {f1_rf:.2f}")



Random Forest Ensemble:
              precision    recall  f1-score   support

        High       0.79      0.92      0.85        25
         Low       0.79      0.90      0.84        29
    Moderate       0.94      0.65      0.77        26

    accuracy                           0.82        80
   macro avg       0.84      0.82      0.82        80
weighted avg       0.84      0.82      0.82        80

F1 Score: 0.82


In [117]:
store_profile_data.isnull().sum()

shop_id                    0
shop_area_sq_ft            0
shop_profile              24
total_sales                0
transaction_count          0
unique_customers           0
avg_transaction_value      0
avg_sales_per_customer     0
sales_per_sq_ft            0
dtype: int64

In [119]:
def preprocess_data(df, target_col=None):
    df = df.drop(columns=['shop_id'])  # Drop the shop_id column
    if target_col is not None:
        X = df.drop(columns=[target_col])
        y = df[target_col]
    else:
        X = df
        y = None

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    if y is not None:
        le = LabelEncoder()
        y = le.fit_transform(y)
    else:
        le = None

    return X, y, scaler, le


In [120]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.utils import resample

# Preprocess the data
# Split the dataset into train (with shop profile) and test (missing shop profile) datasets
train_data = store_profile_data[store_profile_data['shop_profile'].notnull()]
test_data = store_profile_data[store_profile_data['shop_profile'].isnull()]

# Prepare train dataset for bootstrapping
X_train, _, y_train, _, le = preprocess_data(train_data, target_col='shop_profile')

# Bootstrapping
n_bootstraps = 10
f1_scores = []
rf_models = []

for i in range(n_bootstraps):
    X_train_boot, y_train_boot = resample(X_train, y_train)
    rf = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, min_samples_leaf=1)
    rf.fit(X_train_boot, y_train_boot)
    rf_models.append(rf)
    
    # Evaluate the model on the original train dataset
    y_pred = rf.predict(X_train)
    f1 = f1_score(y_train, y_pred, average='weighted')
    f1_scores.append(f1)

# Choose the best model
best_rf = rf_models[np.argmax(f1_scores)]

# Make predictions on the test dataset
X_test, _, _, _, _ = preprocess_data(test_data.drop(columns='shop_profile'), target_col=None)
predicted_profiles = best_rf.predict(X_test)

# Replace missing shop profile values with the predicted ones
test_data['shop_profile'] = le.inverse_transform(predicted_profiles)

# Store predicted shop profiles in a separate DataFrame
predicted_profiles_df = test_data[['shop_id', 'shop_profile']]

print(predicted_profiles_df)


ValueError: not enough values to unpack (expected 5, got 4)

In [118]:
# Create a DataFrame for the testing data
test_data = pd.DataFrame({
    'shop_id': ['SHOP046', 'SHOP024', 'SHOP023', 'SHOP097', 'SHOP044', 'SHOP030', 'SHOP038', 'SHOP029', 'SHOP096', 'SHOP092', 'SHOP081', 'SHOP076', 'SHOP080', 'SHOP074', 'SHOP107', 'SHOP108', 'SHOP019', 'SHOP002', 'SHOP114', 'SHOP087', 'SHOP050', 'SHOP061', 'SHOP056', 'SHOP070'],
    'shop_profile': [np.nan] * 24
})

# Merge the testing data with the original dataset to get the other features
test_data = test_data.merge(store_profile_data, on='shop_id', how='left')

# Preprocess the testing data
X_test, _, _, _ = preprocess_data(test_data.drop(columns='shop_profile'), target_col=None)

# Make predictions on the test dataset
predicted_profiles = best_rf.predict(X_test)

# Create a DataFrame with the predicted shop profiles
predicted_df = pd.DataFrame({
    'shop_id': test_data['shop_id'],
    'shop_profile': le.inverse_transform(predicted_profiles)
})

print(predicted_df)


KeyError: "['shop_profile'] not found in axis"

In [121]:
store_profile_data.head()

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile,total_sales,transaction_count,unique_customers,avg_transaction_value,avg_sales_per_customer,sales_per_sq_ft
0,SHOP047,528,Moderate,842960,1687,928,499.679905,908.362069,1596.515152
1,SHOP009,676,High,1970870,4521,2498,435.93674,788.979183,2915.488166
2,SHOP083,676,Low,1691985,3583,1900,472.225788,890.518421,2502.936391
3,SHOP117,676,Low,2325980,4023,2037,578.17052,1141.865488,3440.798817
4,SHOP042,676,Low,1340215,3232,1841,414.670483,727.982075,1982.566568


In [123]:
from sklearn.utils import resample

def bootstrap_samples(data, n_samples):
    bootstrapped_datasets = []
    for _ in range(n_samples):
        bootstrapped_data = resample(data, replace=True, n_samples=len(data), random_state=None)
        bootstrapped_datasets.append(bootstrapped_data)
    return bootstrapped_datasets


In [232]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Assuming store_profile_data is already prepared

def preprocess_data(df, target_col=None):
    df = df.drop(columns=['shop_id'])  # Drop the shop_id column
    if target_col is not None:
        X = df.drop(columns=[target_col])
        y = df[target_col]
    else:
        X = df
        y = None

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    if y is not None:
        le = LabelEncoder()
        y = le.fit_transform(y)
    else:
        le = None

    # Split the data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_val, y_train, y_val, le


# Train the random forest model using the bootstrapped datasets
n_bootstraps = 10
bootstrapped_datasets = bootstrap_samples(store_profile_data, n_samples=n_bootstraps)

rf_models = []
f1_scores = []

for i, bootstrapped_data in enumerate(bootstrapped_datasets):
    print(f"Training model {i + 1}/{n_bootstraps}")

    X_train, X_val, y_train, y_val, le = preprocess_data(bootstrapped_data, target_col='shop_profile')

    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)

    f1 = f1_score(y_val, y_pred, average='weighted')
    print(f"F1-score: {f1:.3f}")

    rf_models.append(rf)
    f1_scores.append(f1)

# Select the best random forest model
best_rf = rf_models[np.argmax(f1_scores)]

# Create a DataFrame for the testing data
test_data = pd.DataFrame({
    'shop_id': ['SHOP046', 'SHOP024', 'SHOP023', 'SHOP097', 'SHOP044', 'SHOP030', 'SHOP038', 'SHOP029', 'SHOP096', 'SHOP092', 'SHOP081', 'SHOP076', 'SHOP080', 'SHOP074', 'SHOP107', 'SHOP108', 'SHOP019', 'SHOP002', 'SHOP114', 'SHOP087', 'SHOP050', 'SHOP061', 'SHOP056', 'SHOP070'],
    'shop_profile': [np.nan] * 24
})

# Merge the testing data with the original dataset to get the other features
test_data = test_data.merge(store_profile_data, on='shop_id', how='left')

# Preprocess the testing data
X_test, _, _, _ = preprocess_data(test_data.drop(columns='shop_profile'), target_col=None)

# Make predictions on the test dataset
predicted_profiles = best_rf.predict(X_test)

# Create a DataFrame with the predicted shop profiles
predicted_df = pd.DataFrame({
    'shop_id': test_data['shop_id'],
    'shop_profile': le.inverse_transform(predicted_profiles)
})

print(predicted_df)


Training model 1/10
F1-score: 0.794
Training model 2/10
F1-score: 0.848
Training model 3/10
F1-score: 0.948
Training model 4/10
F1-score: 0.900
Training model 5/10
F1-score: 0.691
Training model 6/10
F1-score: 0.950
Training model 7/10
F1-score: 0.796
Training model 8/10
F1-score: 0.900
Training model 9/10
F1-score: 0.852
Training model 10/10
F1-score: 0.950


KeyError: "['shop_profile'] not found in axis"

In [233]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

def preprocess_data(df, target_col=None):
    df = df.drop(columns=['shop_id'])  # Drop the shop_id column
    if target_col is not None:
        X = df.drop(columns=[target_col])
        y = df[target_col]
    else:
        X = df
        y = None

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    if y is not None:
        le = LabelEncoder()
        y = le.fit_transform(y)
    else:
        le = None

    # Split the data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_val, y_train, y_val, le

def bootstrap_samples(data, n_samples):
    sample_indices = np.random.randint(0, len(data), (n_samples, len(data)))
    samples = [data.iloc[indices] for indices in sample_indices]
    return samples

# Train the random forest model using the bootstrapped datasets
n_bootstraps = 10
bootstrapped_datasets = bootstrap_samples(store_profile_data.dropna(), n_samples=n_bootstraps)

rf_models = []
f1_scores = []

for i, bootstrapped_data in enumerate(bootstrapped_datasets):
    print(f"Training model {i + 1}/{n_bootstraps}")
    
    X_train, X_val, y_train, y_val, le = preprocess_data(bootstrapped_data, target_col='shop_profile')
    
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='weighted')
    print(f"F1-score: {f1:.3f}")
    
    rf_models.append(rf)
    f1_scores.append(f1)

# Choose the best model based on the F1-score
best_rf = rf_models[np.argmax(f1_scores)]




Training model 1/10
F1-score: 0.845
Training model 2/10
F1-score: 0.670
Training model 3/10
F1-score: 0.654
Training model 4/10
F1-score: 0.836
Training model 5/10
F1-score: 0.848
Training model 6/10
F1-score: 0.848
Training model 7/10
F1-score: 0.592
Training model 8/10
F1-score: 0.655
Training model 9/10
F1-score: 0.718
Training model 10/10
F1-score: 0.900


In [130]:
print(best_rf)

RandomForestClassifier(random_state=42)


In [131]:
from sklearn.model_selection import train_test_split

# Find rows with missing shop profiles
missing_shop_profiles = store_profile_data[store_profile_data['shop_profile'].isnull()][['shop_id']]

# Preprocess the data
X_missing = missing_shop_profiles.merge(store_profile_data, on='shop_id', how='left').drop(columns=['shop_id', 'shop_profile'])

scaler = StandardScaler()
X_missing = scaler.fit_transform(X_missing)

# Make predictions on the test dataset
predicted_profiles = best_rf.predict(X_missing)
predicted_profiles = le.inverse_transform(predicted_profiles)  # Convert the encoded labels back to the original labels

# Replace missing shop profile values with the predicted ones
missing_shop_profiles['shop_profile'] = predicted_profiles


In [133]:
missing_shop_profiles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24 entries, 100 to 123
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   shop_id       24 non-null     object
 1   shop_profile  24 non-null     object
dtypes: object(2)
memory usage: 576.0+ bytes


In [134]:
missing_shop_profiles.to_csv("predicted_missing_shop_profiles.csv", index=False)


In [229]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

def preprocess_data(df, target_col=None):
    df = df.drop(columns=['shop_id'])
    if target_col is not None:
        X = df.drop(columns=[target_col])
        y = df[target_col]
    else:
        X = df
        y = None

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    if y is not None:
        le = LabelEncoder()
        y = le.fit_transform(y)
    else:
        le = None

    return X, y, scaler, le

def create_neural_network(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(64, input_dim=input_dim, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model

# Preprocess the data and split it into training and validation sets
store_profile_data = store_profile_data.dropna()
X, y, _, le = preprocess_data(store_profile_data, target_col='shop_profile')
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the neural network
input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))
model = create_neural_network(input_dim, output_dim)
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=200, batch_size=16)




Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x270bcdd9070>

In [231]:
store_profile_data.isnull().sum()

shop_id                   0
shop_area_sq_ft           0
shop_profile              0
total_sales               0
transaction_count         0
unique_customers          0
avg_transaction_value     0
avg_sales_per_customer    0
sales_per_sq_ft           0
dtype: int64

In [230]:
# Prepare the test data
test_data = store_data[store_data['shop_profile'].isnull()]
X_test, _, _, _ = preprocess_data(test_data.drop(columns='shop_profile'), target_col=None)

# Make predictions on the test dataset
predicted_profiles = model.predict_classes(X_test)
predicted_profiles = le.inverse_transform(predicted_profiles)

# Combine the shop_id and the predicted shop profiles
missing_shop_profiles = pd.DataFrame({'shop_id': test_data['shop_id'], 'shop_profile': predicted_profiles})

# Save the missing shop profiles as a CSV file
missing_shop_profiles.to_csv("predicted_missing_shop_profiles_nn.csv", index=False)


NameError: name 'store_data' is not defined

In [193]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def preprocess_data(df, target_col=None):
    df = df.drop(columns=['shop_id'])
    if target_col is not None:
        X = df.drop(columns=[target_col])
        y = df[target_col]
    else:
        X = df
        y = None

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    if y is not None:
        le = LabelEncoder()
        y = le.fit_transform(y)
    else:
        le = None

    return X, y, scaler, le

def create_neural_network(input_dim, output_dim):
   model = Sequential()
   model.add(Dense(32, activation='relu', input_dim=X_train_scaled.shape[1]))
   model.add(Dropout(0.2))
   model.add(Dense(16, activation='relu'))
   model.add(Dropout(0.2))
   model.add(Dense(8, activation='relu'))
   model.add(Dense(len(encoder.classes_), activation='softmax'))
   model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   return model

# Load the saved model
input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))
model = create_neural_network(input_dim, output_dim)
model.load_weights('./acc0.84/my_checkpoint')

# Prepare the test data
test_data = store_profile_data[store_profile_data['shop_profile'].isnull()]
X_test, _, _, _ = preprocess_data(test_data.drop(columns='shop_profile'), target_col=None)

# Make predictions on the test dataset
predicted_profiles = model.predict_classes(X_test)
predicted_profiles = le.inverse_transform(predicted_profiles)

# Combine the shop_id and the predicted shop profiles
missing_shop_profiles = pd.DataFrame({'shop_id': test_data['shop_id'], 'shop_profile': predicted_profiles})


ValueError: Received incompatible tensor with shape (17, 32) when attempting to restore variable with shape (16, 32) and name dense_76/kernel:0.

In [194]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

def create_neural_network(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.2))
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    
    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model


In [195]:
input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))
model = create_neural_network(input_dim, output_dim)
model.load_weights('./acc0.84/my_checkpoint')

# Prepare the test data
test_data = store_profile_data[store_profile_data['shop_profile'].isnull()]
X_test = test_data.drop(columns=['shop_id', 'shop_profile'])

# Preprocess the test data
X_test_scaled = scaler.transform(X_test)

# Make predictions on the test dataset
predicted_profiles = model.predict(X_test_scaled)
predicted_profiles = np.argmax(predicted_profiles, axis=1)
predicted_labels = encoder.inverse_transform(predicted_profiles)

# Create a DataFrame with the shop_id and predicted shop_profile
missing_shop_profiles = pd.DataFrame({'shop_id': test_data['shop_id'], 'shop_profile': predicted_labels})

# Save the missing_shop_profiles DataFrame to a CSV file
missing_shop_profiles.to_csv('missing_shop_profiles.csv', index=False)


ValueError: Received incompatible tensor with shape (17, 32) when attempting to restore variable with shape (16, 32) and name dense_80/kernel:0.