In [130]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
import random
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
import xgboost
from datetime import datetime
from itertools import product

In [92]:
df_original = pd.read_csv(r"Dataset\sales_train.csv\sales_train.csv")

In [93]:
df_original.columns



Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
       'item_cnt_day'],
      dtype='object')

In [94]:
df_sub = df_original[['date_block_num','shop_id', 'item_id','item_cnt_day']]

In [95]:
# Adding 'item_price' to the new dataframe
df_sub['item_price'] = df_original['item_price']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['item_price'] = df_original['item_price']


In [96]:
# Calculating the total sales for each month for each store
# Total sales is item_cnt_day * item_price
df_sub['total_sales'] = df_sub['item_cnt_day'] * df_sub['item_price']

In [97]:
# Grouping by 'date_block_num' and 'shop_id' to calculate the total sales per store per month
monthly_sales = df_sub.groupby(['date_block_num', 'shop_id'])['total_sales'].sum().reset_index()

In [98]:
# To calculate the previous month's total sales, we shift the total sales within each group
monthly_sales['prev_month_sales'] = monthly_sales.groupby('shop_id')['total_sales'].shift(1)

In [99]:
# Merging this information back into the original new_df
df_sub = pd.merge(df_sub, monthly_sales[['date_block_num', 'shop_id', 'prev_month_sales']], on=['date_block_num', 'shop_id'], how='left')

In [100]:
df_sub = df_sub.fillna(0)

df_sub.isna().sum()

date_block_num      0
shop_id             0
item_id             0
item_cnt_day        0
item_price          0
total_sales         0
prev_month_sales    0
dtype: int64

In [101]:
feature_names = ['item_cnt_day','prev_month_sales', 'date_block_num']
label = ['total_sales']
X = np.c_[df_sub[feature_names]]
y = np.c_[df_sub[label]]


In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [103]:
model_LR = LinearRegression()
model_LR.fit(X=X, y=y)

In [104]:
y_train_pred = model_LR.predict(X_train)
y_train_pred

array([[1128.2728914 ],
       [ 765.88182324],
       [ 783.57186809],
       ...,
       [1201.67396248],
       [ 932.41409412],
       [1325.14775667]])

In [105]:
y_test_pred = model_LR.predict(X_test)
y_test_pred

array([[1360.09544736],
       [1199.36844722],
       [1051.63502007],
       ...,
       [ 651.46292473],
       [ 881.60737424],
       [ 682.76325816]])

In [106]:
y_test_pred_naiv = np.full(y_train_pred.shape, np.mean(y_train))
y_test_pred_naiv

array([[1156.22236698],
       [1156.22236698],
       [1156.22236698],
       ...,
       [1156.22236698],
       [1156.22236698],
       [1156.22236698]])

In [107]:
mae_train_LR = mean_absolute_error(y_train, y_train_pred)
mse_train_LR = mean_squared_error(y_train, y_train_pred)

print(f'MAE train: {mae_train_LR}')
print(f'MSE train: {mse_train_LR}')

MAE train: 1024.8538890602715
MSE train: 25172722.73076022


In [108]:
mae_test_LR = mean_absolute_error(y_test, y_test_pred)
mse_test_LR = mean_squared_error(y_test, y_test_pred)

print(f'MAE train: {mae_test_LR}')
print(f'MSE train: {mse_test_LR}')

MAE train: 1035.449396771751
MSE train: 33732418.120576955


In [109]:
model_GB = GradientBoostingRegressor(max_depth=2, n_estimators=100)


In [110]:
model_GB.fit(X,y)

  y = column_or_1d(y, warn=True)


In [111]:
GB_pred_test = model_GB.predict(X_test)

In [112]:
GB_pred_train = model_GB.predict(X_train)

In [113]:
GB_mae_train = mean_absolute_error(y_train, GB_pred_train)
GB_mse_train = mean_squared_error(y_train, GB_pred_train)
r2_train = r2_score(y_train, GB_pred_train)

print(f'GB MAE train: {GB_mae_train}')
print(f'GB MSE train: {GB_mse_train}')
print(f'R2 train: {r2_train}')

GB MAE train: 1004.2240870935858
GB MSE train: 20409792.46203244
R2 train: 0.323248312696815


In [114]:
GB_mae_test = mean_absolute_error(y_test, GB_pred_test)
GB_mse_test = mean_squared_error(y_test, GB_pred_test)
r2_test = r2_score(y_test, GB_pred_test)

print(f'GB MAE test: {GB_mae_test}')
print(f'GB MSE test: {GB_mse_test}')
print(f'R2 test: {r2_test}')

GB MAE test: 1019.2751269221653
GB MSE test: 29225005.228095304
R2 test: 0.2851513308932191


RESULTATER GB M/ ESTIMATORS = 3 (FØR ADDED 0-VALUES)

GB MAE train: 1120.0629133333473
GB MSE train: 28565756.57911241

GB MAE test: 1116.13616235441
GB MSE test: 1116.13616235441



RESULTATER GB M/ ESTIMATORS = 10 (FØR ADDED 0-VALUES)

GB MAE train: 1064.926812397981
GB MSE train: 26888915.07355382

GB MAE test: 1065.5809749928003
GB MSE test: 1065.5809749928003

RESULTATER GB M/ ESTIMATORS = 100 (FØR ADDED 0-VALUES)

GB MAE train: 1004.2240870935858
GB MSE train: 20409792.46203244
R2 train: 0.323248312696815

GB MAE test: 1019.2751269221653
GB MSE test: 29225005.228095304
R2 test: 0.2851513308932191

In [121]:
df_original['date'].max()

'31.12.2014'

In [123]:
# Unique shops and items
unique_shops = 60
unique_items = 21807

# Date range
start_date = datetime(2013, 1, 1)
end_date = datetime(2015, 10, 31)

# Calculate total days in the date range
total_days = (end_date - start_date).days + 1  # +1 to include the end date

# Calculate total combinations
total_combinations = total_days * unique_shops * unique_items
total_combinations

1352906280

In [124]:
total_items_sold_per_shop = df_sub.groupby('shop_id')['item_cnt_day'].sum().sort_values(ascending=False)

In [126]:
top_10_shops = total_items_sold_per_shop.head(10)
top_10_shops

shop_id
31    310777.0
25    241920.0
54    185790.0
28    184557.0
42    144934.0
57    141107.0
27    136657.0
6     100489.0
58     81734.0
46     78990.0
Name: item_cnt_day, dtype: float64

In [129]:
total_items_sold_per_shop = df_sub.groupby('shop_id')['total_sales'].sum().sort_values(ascending=False)
total_items_sold_per_shop.head(10)

shop_id
31    2.352170e+08
25    2.164806e+08
28    1.597463e+08
42    1.519170e+08
54    1.419397e+08
27    1.406325e+08
57    1.129998e+08
12    1.122051e+08
6     9.806745e+07
18    8.339270e+07
Name: total_sales, dtype: float64

In [None]:
top_10_most_selling_shops = [31,25,28,42,54,27,57,12,6,18]

In [131]:
required_columns = ['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']
df_crossjoin = df_sub[required_columns]

In [132]:
# Create a DataFrame with all combinations
combinations = pd.DataFrame(product(df_crossjoin['date_block_num'].unique(), 
                                    df_crossjoin['shop_id'].unique(), 
                                    df_crossjoin['item_id'].unique()), 
                            columns=['date_block_num', 'shop_id', 'item_id'])

In [133]:
# Merge the combinations with the original data
merged_df = pd.merge(combinations, df_crossjoin, on=['date_block_num', 'shop_id', 'item_id'], how='left')

In [134]:
# Replace NaN in 'item_cnt_day' with 0
merged_df['item_cnt_day'] = merged_df['item_cnt_day'].fillna(0)