In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [27]:
historical_transactions = pd.read_csv("Historical-transaction-data.csv")
store_info = pd.read_csv("Store-info.csv")

In [28]:
store_info.head()

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile
0,SHOP047,528,Moderate
1,SHOP009,676,High
2,SHOP083,676,Low
3,SHOP117,676,Low
4,SHOP042,676,Low


In [29]:
historical_transactions.head()

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold
0,ORANGE BARLEY 1.5L,2021-12-11T00:00:00.000Z,147.0,BGXA,SHOP008,220,2
1,GINGER BEER 1.5L,2021-10-17T00:00:00.000Z,371.0,IA25,SHOP112,220,2
2,TONIC PET 500ML,2021-12-13T00:00:00.000Z,484.0,VN7V,SHOP008,160,2
3,CREAM SODA 1L,2021-12-13T00:00:00.000Z,484.0,VN7V,SHOP008,150,2
4,STRAWBERRY MILK 180ML,2021-10-23T00:00:00.000Z,1310.0,7S00,SHOP112,210,5


In [30]:
merged_data = pd.merge(historical_transactions, store_info, on="shop_id")

In [31]:
merged_data.head()

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile
0,ORANGE BARLEY 1.5L,2021-12-11T00:00:00.000Z,147.0,BGXA,SHOP008,220,2,678,Moderate
1,TONIC PET 500ML,2021-12-13T00:00:00.000Z,484.0,VN7V,SHOP008,160,2,678,Moderate
2,CREAM SODA 1L,2021-12-13T00:00:00.000Z,484.0,VN7V,SHOP008,150,2,678,Moderate
3,GINGER BEER 1.5L,2021-12-10T00:00:00.000Z,1000053.0,VT9C,SHOP008,220,1,678,Moderate
4,GINGER BEER 1.5L,2021-12-10T00:00:00.000Z,1000057.0,8QLS,SHOP008,440,1,678,Moderate


In [32]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 473974 entries, 0 to 473973
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   item_description  438046 non-null  object 
 1   transaction_date  473974 non-null  object 
 2   invoice_id        467654 non-null  float64
 3   customer_id       473974 non-null  object 
 4   shop_id           473974 non-null  object 
 5   item_price        473974 non-null  int64  
 6   quantity_sold     473974 non-null  int64  
 7   shop_area_sq_ft   473974 non-null  int64  
 8   shop_profile      387341 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 36.2+ MB


In [33]:
# Convert transaction_date to datetime format
merged_data['transaction_date'] = pd.to_datetime(merged_data['transaction_date'])

In [34]:
merged_data.isna().sum()

item_description    35928
transaction_date        0
invoice_id           6320
customer_id             0
shop_id                 0
item_price              0
quantity_sold           0
shop_area_sq_ft         0
shop_profile        86633
dtype: int64

In [35]:
# Calculate sales per transaction
merged_data['sales'] = merged_data['item_price'] * merged_data['quantity_sold']


In [36]:
store_metrics = merged_data.groupby('shop_id').agg({
    'sales': 'sum',
    'invoice_id': 'count',
    'customer_id': pd.Series.nunique
}).reset_index()

store_metrics.columns = ['shop_id', 'total_sales', 'transaction_count', 'unique_customers']
store_metrics['avg_transaction_value'] = store_metrics['total_sales'] / store_metrics['transaction_count']
store_metrics['avg_sales_per_customer'] = store_metrics['total_sales'] / store_metrics['unique_customers']

# Merge the store_metrics dataframe with store_info dataframe
store_profile_data = pd.merge(store_info, store_metrics, on="shop_id")

# Calculate sales per square foot
store_profile_data['sales_per_sq_ft'] = store_profile_data['total_sales'] / store_profile_data['shop_area_sq_ft']


In [37]:
store_profile_data.head()

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile,total_sales,transaction_count,unique_customers,avg_transaction_value,avg_sales_per_customer,sales_per_sq_ft
0,SHOP047,528,Moderate,842960,1687,928,499.679905,908.362069,1596.515152
1,SHOP009,676,High,1970870,4521,2498,435.93674,788.979183,2915.488166
2,SHOP083,676,Low,1691985,3583,1900,472.225788,890.518421,2502.936391
3,SHOP117,676,Low,2325980,4023,2037,578.17052,1141.865488,3440.798817
4,SHOP042,676,Low,1340215,3232,1841,414.670483,727.982075,1982.566568


In [38]:
store_profile_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124 entries, 0 to 123
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   shop_id                 124 non-null    object 
 1   shop_area_sq_ft         124 non-null    int64  
 2   shop_profile            100 non-null    object 
 3   total_sales             124 non-null    int64  
 4   transaction_count       124 non-null    int64  
 5   unique_customers        124 non-null    int64  
 6   avg_transaction_value   124 non-null    float64
 7   avg_sales_per_customer  124 non-null    float64
 8   sales_per_sq_ft         124 non-null    float64
dtypes: float64(3), int64(4), object(2)
memory usage: 9.7+ KB


In [39]:
store_profile_data.describe()

Unnamed: 0,shop_area_sq_ft,total_sales,transaction_count,unique_customers,avg_transaction_value,avg_sales_per_customer,sales_per_sq_ft
count,124.0,124.0,124.0,124.0,124.0,124.0,124.0
mean,619.991935,2152114.0,3771.403226,2015.637097,573.318033,1074.564327,3621.636246
std,126.770165,795323.9,1097.712632,569.06722,142.996928,296.598633,1584.463586
min,298.0,131245.0,272.0,189.0,378.430138,606.746753,193.576696
25%,553.25,1639405.0,3100.0,1629.25,491.111453,899.888381,2671.583084
50%,617.0,2062812.0,3754.5,2034.0,548.185185,1025.434367,3349.57848
75%,676.0,2617150.0,4435.25,2429.0,624.249322,1156.456908,4366.959107
max,1077.0,6051205.0,6555.0,3501.0,1358.704979,2785.271444,10976.912752


In [40]:
unknown_shop_profile.head()

Unnamed: 0,shop_area_sq_ft,shop_profile,total_sales,transaction_count,unique_customers,avg_transaction_value,avg_sales_per_customer,sales_per_sq_ft,shop_id_SHOP002,shop_id_SHOP003,...,shop_id_SHOP118,shop_id_SHOP119,shop_id_SHOP120,shop_id_SHOP121,shop_id_SHOP122,shop_id_SHOP123,shop_id_SHOP124,shop_id_SHOP125,shop_id_SHOP126,shop_id_SHOP127
100,545,High,2607865,3398,1736,767.470571,1502.226382,4785.073394,0,0,...,0,0,0,0,0,0,0,0,0,0
101,676,Low,1308795,2869,1641,456.185082,797.559415,1936.087278,0,0,...,0,0,0,0,0,0,0,0,0,0
102,617,Moderate,2570425,4043,2227,635.771704,1154.209699,4166.004862,0,0,...,0,0,0,0,0,0,0,0,0,0
103,310,High,2642380,3109,1669,849.913155,1583.211504,8523.806452,0,0,...,0,0,0,0,0,0,0,0,0,0
104,715,Moderate,1310440,2631,1552,498.076777,844.35567,1832.783217,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Read the data
# df = pd.read_csv('store_profile_data.csv')

# Assuming your data is already in a DataFrame called df
# Split the dataset into known and unknown shop_profile
known_shop_profile = store_profile_data[store_profile_data['shop_profile'].notna()]
unknown_shop_profile = store_profile_data[store_profile_data['shop_profile'].isna()]

# Encode categorical variables for the known_shop_profile dataset
le = LabelEncoder()
known_shop_profile['shop_profile'] = le.fit_transform(known_shop_profile['shop_profile'])

# Split the dataset
X = known_shop_profile.drop('shop_profile', axis=1)
y = known_shop_profile['shop_profile']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose and train the classification algorithm
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Fine-tune the model if needed

# Predict shop profiles for the unknown_shop_profile dataset
X_unknown = unknown_shop_profile.drop('shop_profile', axis=1)
unknown_shop_profiles_pred = clf.predict(X_unknown)

# Decode the predicted labels back to the original categories
unknown_shop_profiles_pred_decoded = le.inverse_transform(unknown_shop_profiles_pred)

# Fill the missing shop_profile values in the original dataset
unknown_shop_profile['shop_profile'] = unknown_shop_profiles_pred_decoded
store_profile_data.loc[store_profile_data['shop_profile'].isna(), 'shop_profile'] = unknown_shop_profile['shop_profile']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  known_shop_profile['shop_profile'] = le.fit_transform(known_shop_profile['shop_profile'])


ValueError: could not convert string to float: 'SHOP086'

In [None]:
print(unknown_shop_profiles_pred)

[0 1 2 0 2 0 1 1 0 1 0 2 1 0 0 1 2 2 1 1 2 0 0 1]


In [None]:
unknown_shop_profile.head()

Unnamed: 0,shop_area_sq_ft,shop_profile,total_sales,transaction_count,unique_customers,avg_transaction_value,avg_sales_per_customer,sales_per_sq_ft,shop_id_SHOP002,shop_id_SHOP003,...,shop_id_SHOP118,shop_id_SHOP119,shop_id_SHOP120,shop_id_SHOP121,shop_id_SHOP122,shop_id_SHOP123,shop_id_SHOP124,shop_id_SHOP125,shop_id_SHOP126,shop_id_SHOP127
100,545,High,2607865,3398,1736,767.470571,1502.226382,4785.073394,0,0,...,0,0,0,0,0,0,0,0,0,0
101,676,Low,1308795,2869,1641,456.185082,797.559415,1936.087278,0,0,...,0,0,0,0,0,0,0,0,0,0
102,617,Moderate,2570425,4043,2227,635.771704,1154.209699,4166.004862,0,0,...,0,0,0,0,0,0,0,0,0,0
103,310,High,2642380,3109,1669,849.913155,1583.211504,8523.806452,0,0,...,0,0,0,0,0,0,0,0,0,0
104,715,Moderate,1310440,2631,1552,498.076777,844.35567,1832.783217,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Read the data
df = store_profile_data

# Assuming your data is already in a DataFrame called df
# Split the dataset into known and unknown shop_profile
known_shop_profile = df[df['shop_profile'].notna()]
unknown_shop_profile = df[df['shop_profile'].isna()]

# Encode categorical variables for the known_shop_profile dataset
le = LabelEncoder()
known_shop_profile['shop_profile'] = le.fit_transform(known_shop_profile['shop_profile'])

# Split the dataset
X = known_shop_profile.drop(['shop_profile', 'shop_id'], axis=1)
y = known_shop_profile['shop_profile']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose and train the classification algorithm
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Fine-tune the model if needed

# Predict shop profiles for the unknown_shop_profile dataset
X_unknown = unknown_shop_profile.drop(['shop_profile', 'shop_id'], axis=1)
unknown_shop_profiles_pred = clf.predict(X_unknown)

# Decode the predicted labels back to the original categories
unknown_shop_profiles_pred_decoded = le.inverse_transform(unknown_shop_profiles_pred)

# Fill the missing shop_profile values in the original dataset
unknown_shop_profile['shop_profile'] = unknown_shop_profiles_pred_decoded
df.loc[df['shop_profile'].isna(), 'shop_profile'] = unknown_shop_profile['shop_profile']


              precision    recall  f1-score   support

           0       0.71      0.71      0.71         7
           1       0.60      0.43      0.50         7
           2       0.50      0.67      0.57         6

    accuracy                           0.60        20
   macro avg       0.60      0.60      0.60        20
weighted avg       0.61      0.60      0.60        20



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  known_shop_profile['shop_profile'] = le.fit_transform(known_shop_profile['shop_profile'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_shop_profile['shop_profile'] = unknown_shop_profiles_pred_decoded


In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Read the data
df = store_profile_data

# Assuming your data is already in a DataFrame called df
# Split the dataset into known and unknown shop_profile
known_shop_profile = df[df['shop_profile'].notna()]
unknown_shop_profile = df[df['shop_profile'].isna()]

# Encode categorical variables for the known_shop_profile dataset
le = LabelEncoder()
known_shop_profile['shop_profile'] = le.fit_transform(known_shop_profile['shop_profile'])

# Split the dataset
X = known_shop_profile.drop(['shop_profile', 'shop_id'], axis=1)
y = known_shop_profile['shop_profile']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=500, random_state=42)
mlp.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = mlp.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

# Fine-tune the model if needed

# Predict shop profiles for the unknown_shop_profile dataset
X_unknown = unknown_shop_profile.drop(['shop_profile', 'shop_id'], axis=1)
X_unknown_scaled = scaler.transform(X_unknown)

unknown_shop_profiles_pred = mlp.predict(X_unknown_scaled)

# Decode the predicted labels back to the original categories
unknown_shop_profiles_pred_decoded = le.inverse_transform(unknown_shop_profiles_pred)

# Fill the missing shop_profile values in the original dataset
unknown_shop_profile['shop_profile'] = unknown_shop_profiles_pred_decoded
df.loc[df['shop_profile'].isna(), 'shop_profile'] = unknown_shop_profile['shop_profile']


              precision    recall  f1-score   support

           0       0.80      0.44      0.57         9
           1       0.46      1.00      0.63         6
           2       0.43      0.30      0.35        10

    accuracy                           0.52        25
   macro avg       0.56      0.58      0.52        25
weighted avg       0.57      0.52      0.50        25





ValueError: Found array with 0 sample(s) (shape=(0, 7)) while a minimum of 1 is required by StandardScaler.

In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Read the data
# df = pd.read_csv('store_profile_data.csv')

# Assuming your data is already in a DataFrame called df
# Split the dataset into known and unknown shop_profile
known_shop_profile = df[df['shop_profile'].notna()]
unknown_shop_profile = df[df['shop_profile'].isna()]

# Encode categorical variables for the known_shop_profile dataset
le = LabelEncoder()
known_shop_profile['shop_profile'] = le.fit_transform(known_shop_profile['shop_profile'])

# Split the dataset
X = known_shop_profile.drop(['shop_profile', 'shop_id'], axis=1)
y = known_shop_profile['shop_profile']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One-hot encode the target variable
y_train_one_hot = tf.keras.utils.to_categorical(y_train)
y_test_one_hot = tf.keras.utils.to_categorical(y_test)

from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

# Create the neural network model
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(len(le.classes_), activation='softmax'))


# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train_one_hot, epochs=200, batch_size=8, validation_data=(X_test_scaled, y_test_one_hot))

# Evaluate the model
_, accuracy = model.evaluate(X_test_scaled, y_test_one_hot)
print('Test accuracy:', accuracy)



Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124 entries, 0 to 123
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   shop_id                 124 non-null    object 
 1   shop_area_sq_ft         124 non-null    int64  
 2   shop_profile            124 non-null    object 
 3   total_sales             124 non-null    int64  
 4   transaction_count       124 non-null    int64  
 5   unique_customers        124 non-null    int64  
 6   avg_transaction_value   124 non-null    float64
 7   avg_sales_per_customer  124 non-null    float64
 8   sales_per_sq_ft         124 non-null    float64
dtypes: float64(3), int64(4), object(2)
memory usage: 9.7+ KB


In [55]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Assuming your data is already in a DataFrame called df
# Encode categorical variables for the known_shop_profile dataset
le = LabelEncoder()
df['shop_profile'] = le.fit_transform(df['shop_profile'])

# Separate features and target variable
X = df.drop(['shop_profile', 'shop_id'], axis=1)
y = df['shop_profile']

# Apply SMOTE to generate synthetic data points
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine the resampled features and target into a new DataFrame
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['shop_profile'] = y_resampled

# Get the shop_id column from the original DataFrame
shop_id_column = df['shop_id'].reset_index(drop=True)

# Add the shop_id column to the resampled DataFrame
df_resampled = pd.concat([shop_id_column, df_resampled], axis=1)

# Decode the shop_profile labels back to the original categories
df_resampled['shop_profile'] = le.inverse_transform(df_resampled['shop_profile'])

# Check the new size of the dataset
print(df_resampled.shape)


(135, 9)


In [56]:
df_resampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   shop_id                 124 non-null    object 
 1   shop_area_sq_ft         135 non-null    int64  
 2   total_sales             135 non-null    int64  
 3   transaction_count       135 non-null    int64  
 4   unique_customers        135 non-null    int64  
 5   avg_transaction_value   135 non-null    float64
 6   avg_sales_per_customer  135 non-null    float64
 7   sales_per_sq_ft         135 non-null    float64
 8   shop_profile            135 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 9.6+ KB


In [57]:
import numpy as np

def generate_synthetic_data(df, n_samples=1000, sigma=0.1):
    df_numeric = df.select_dtypes(include=[np.number])
    df_non_numeric = df.select_dtypes(exclude=[np.number])
    synthetic_data = []
    
    for _ in range(n_samples):
        random_sample = df_numeric.sample(n=1)
        noise = np.random.normal(0, sigma, random_sample.shape)
        synthetic_sample = random_sample + noise
        synthetic_data.append(synthetic_sample)
        
    synthetic_data = pd.concat(synthetic_data)
    synthetic_data = synthetic_data.reset_index(drop=True)
    synthetic_data = pd.concat([df_non_numeric, synthetic_data], axis=1)
    
    return synthetic_data

# Generate synthetic data
n_samples = 1000
synthetic_data = generate_synthetic_data(df, n_samples=n_samples, sigma=0.1)

# Combine the original and synthetic data
combined_data = pd.concat([df, synthetic_data], ignore_index=True)


In [58]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1124 entries, 0 to 1123
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   shop_id                 248 non-null    object 
 1   shop_area_sq_ft         1124 non-null   float64
 2   shop_profile            1124 non-null   float64
 3   total_sales             1124 non-null   float64
 4   transaction_count       1124 non-null   float64
 5   unique_customers        1124 non-null   float64
 6   avg_transaction_value   1124 non-null   float64
 7   avg_sales_per_customer  1124 non-null   float64
 8   sales_per_sq_ft         1124 non-null   float64
dtypes: float64(8), object(1)
memory usage: 79.2+ KB


In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Read the data
df = combined_data
# Assuming your data is already in a DataFrame called df
# Split the dataset into known and unknown shop_profile
known_shop_profile = df[df['shop_profile'].notna()]
unknown_shop_profile = df[df['shop_profile'].isna()]

# Encode categorical variables for the known_shop_profile dataset
le = LabelEncoder()
known_shop_profile['shop_profile'] = le.fit_transform(known_shop_profile['shop_profile'])

# Split the dataset
X = known_shop_profile.drop(['shop_profile', 'shop_id'], axis=1)
y = known_shop_profile['shop_profile']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One-hot encode the target variable
y_train_one_hot = tf.keras.utils.to_categorical(y_train)
y_test_one_hot = tf.keras.utils.to_categorical(y_test)

from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

# Create the neural network model
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(len(le.classes_), activation='softmax'))


# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train_one_hot, epochs=200, batch_size=8, validation_data=(X_test_scaled, y_test_one_hot))

# Evaluate the model
_, accuracy = model.evaluate(X_test_scaled, y_test_one_hot)
print('Test accuracy:', accuracy)



Epoch 1/200

ValueError: in user code:

    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1852, in test_function  *
        return step_function(self, iterator)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1836, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1824, in run_step  **
        outputs = model.test_step(data)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1790, in test_step
        self.compute_loss(x, y, y_pred, sample_weight)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1109, in compute_loss
        return self.compiled_loss(
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\losses.py", line 1984, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "c:\Users\naham\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\backend.py", line 5559, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 999) and (None, 1003) are incompatible
