# getting the data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Location of the training CSV on the local machine.
pathtodata = r"C:\Users\user\Desktop\sml\train.csv"
raw = pd.read_csv(pathtodata)

# Report how many missing values each column of the raw file contains.
print(raw.isnull().sum())

# Parse the dates, order the rows chronologically, discard the unused
# 'store' column and total the sales of every (item, date) pair.
df = (
    raw.assign(date=pd.to_datetime(raw['date']))
    .sort_values('date')
    .drop(columns='store')
    .groupby(['item', 'date'], as_index=False)['sales']
    .sum()
)

# Preview the aggregated frame.
print(df.head())


plot sales over time

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of one item's daily sales inside a chosen date window.
plt.figure(figsize=(12, 6))

t = df[df['item'] == 1]  # rows of item 1 only

print(t.shape)
start_date = '2016-01-01'  # inclusive start of the window
end_date = '2016-02-01'    # exclusive end of the window

# Restrict item 1's rows to the selected window.
tf = t[(t['date'] >= start_date) & (t['date'] < end_date)]

# Total sales per date. Both axes are taken from this single series so the
# x and y values always have the same length and order, even if `tf` were
# to contain several rows for one date.
s = tf.groupby('date')['sales'].sum()
plt.scatter(s.index, s.values)
plt.title('Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Total Sales')
plt.show()

In [None]:
# Total sales for every (item, date) pair, as a Series with a two-level index.
item_date_totals = df.groupby(['item', 'date'])['sales'].sum()

# Pivot the 'date' level into columns: one row per item, one column per date.
# (Unstacking the 'item' level instead would swap rows and columns.)
sales = item_date_totals.unstack(level='date')
sales.head()


plot all item sales with respect to date

In [None]:

plt.figure(figsize=(12, 6))
# Draw every item's sales history on the same axes: the columns of `sales`
# (dates) are on the x axis and the values of each row (one item) on the y axis.
for j in sales.index:
    plt.plot(sales.columns, sales.loc[j])
plt.title('Sales Over Time by Item')
plt.xlabel('Date')
plt.ylabel('Total Sales')
# Show once, after the loop: calling plt.show() inside the loop closes the
# 12x6 figure after the first line and puts every later line on a new
# default-sized figure.
plt.show()

select the items you would like to plot

In [None]:

plt.figure(figsize=(12, 6))
# Draw the sales history of a hand-picked set of items on the same axes: the
# columns of `sales` (dates) are on the x axis and the values of each row
# (one item) on the y axis.
for j in [1, 2, 3, 4, 6]:
    # sales.loc[j] selects the whole row of item j (all dates)
    plt.plot(sales.columns, sales.loc[j], label=f'item {j}')
plt.title('Sales Over Time for Selected Items')
plt.xlabel('Date')
plt.ylabel('Total Sales')
plt.legend()
# Show once, after the loop: calling plt.show() inside the loop closes the
# 12x6 figure after the first line and puts every later line on a new
# default-sized figure.
plt.show()


# add features so the model learns better

the lag feature

In [None]:
# Order rows by item and then by date so that shifting inside an item group
# walks backwards through that item's history.
df = df.sort_values(['item', 'date'])

# Pair every row with the sales value found 7 rows earlier for the same item.
# NOTE(review): this is a 7-day lag only if each item has exactly one row per
# consecutive day - confirm against the data.
lagged_sales = df.groupby('item')['sales'].shift(periods=7)

# Attach the lag column and discard rows containing missing values (the
# first 7 rows of each item have no lag value).
df = df.assign(lag_7=lagged_sales).dropna()
df.head()

date columns

In [None]:
df['date'] = pd.to_datetime(df['date'])  # make sure 'date' has a datetime dtype

# Calendar components extracted from the date.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
# isocalendar() returns a nullable UInt32 column; cast it to a plain int64 so
# the feature has an ordinary NumPy dtype that every estimator accepts.
df['week'] = df['date'].dt.isocalendar().week.astype('int64')
df['day'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek  # Monday = 0, Sunday = 6

# Weekend flag: 1 for Saturday/Sunday (day_of_week >= 5), else 0.
# A vectorised comparison replaces the element-wise apply(lambda ...).
df['is_weekend'] = (df['day_of_week'] >= 5).astype('int64')

print(df.head())  # Check the new features


rolling mean

In [None]:
df = df.sort_values(by=['item', 'date'])

# Rolling means of each item's PREVIOUS sales.
# A rolling window slides down the column one row at a time and averages the
# values it currently covers; values that have left the window are ignored.
# The series is shifted by one row first so that the window ends on the row
# before the current one. Without the shift the window contains the current
# row's 'sales', which is the value the models are trained to predict, so the
# feature would leak the target.
df['rolling_mean_7'] = df.groupby('item')['sales'].transform(
    lambda x: x.shift(1).rolling(7, min_periods=1).mean()
)
df['rolling_mean_30'] = df.groupby('item')['sales'].transform(
    lambda x: x.shift(1).rolling(30, min_periods=1).mean()
)

# The first row of every item has no earlier row to average, so its rolling
# means are NaN; drop those rows so later cells receive no missing values.
df = df.dropna(subset=['rolling_mean_7', 'rolling_mean_30'])

print(df[['sales', 'rolling_mean_7', 'rolling_mean_30']].head(10))


one hot encoding

In [None]:
# Keep a reference to the 'item' column in `t` (this rebinds the `t` used by
# the earlier plotting cell).
# NOTE(review): `t` is not read again in the visible part of the notebook -
# confirm whether this cell is still needed.
t = df['item']

In [None]:

# Replace the 'item' column with one indicator column per item value.
# drop_first=True omits the indicator of the first item level, so that level
# is represented by all indicator columns being 0.
encoded = pd.get_dummies(data=df, columns=['item'], drop_first=True)
df = encoded
df.head()


reduce outliers effect

In [None]:
import numpy as np
# Compress large sales values with log(1 + x).
# Re-running this cell transforms the already transformed column again.
df = df.assign(sales=np.log1p(df['sales']))


split dataset

In [None]:
# Order rows chronologically so the positional split below puts the earliest
# 80% of rows in the training set and the latest 20% in the test set.
df = df.sort_values('date')
# dropna() returns a new frame; the result has to be assigned back, otherwise
# rows with missing values silently stay in `df`.
df = df.dropna()
train_size = int(len(df) * 0.8)
train = df.iloc[:train_size]
test = df.iloc[train_size:]

# Every column except the target ('sales') and the raw timestamp ('date') is
# used as a model feature.
feature_cols = [col for col in df.columns if col not in ['sales', 'date']]
X_train = train[feature_cols]
y_train = train['sales']
X_test = test[feature_cols]
y_test = test['sales']



# the models

linear regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit a linear regression on the training features (fit() returns the
# fitted estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Root of the mean squared error between held-out targets and predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = mse ** 0.5
print(f'Linear Regression RMSE: {rmse}')



evaluation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# List the columns so the one-hot encoded item column names can be looked up.
print(test.columns)

# One-hot column identifying the item to inspect.
item_column = 'item_2'  # swap in another column name from the list above

# Test rows that belong to that item, taken as an independent copy (avoids
# SettingWithCopyWarning on the assignment below), with 'date' as datetime.
item_mask = test[item_column] == 1
test_item = test[item_mask].copy()
test_item['date'] = pd.to_datetime(test_item['date'])

# Give the prediction array the index of `test` so predictions can be looked
# up by row label, then pick the selected item's actual and predicted values.
y_pred_series = pd.Series(y_pred, index=test.index)
y_test_item = y_test.loc[test_item.index]
y_pred_item = y_pred_series.loc[test_item.index]

# Line plot of actual vs. predicted values for the selected item.
plt.figure(figsize=(12, 6))
item_dates = test_item['date']
plt.plot(item_dates, y_test_item, label='Actual Sales', color='orange')
plt.plot(item_dates, y_pred_item, label='Predicted Sales', color='blue', linestyle='--')
plt.title(f"Actual vs Predicted Sales for {item_column}")
plt.xlabel("Date")
plt.ylabel("Sales")
plt.legend()
plt.xticks(rotation=45)
plt.show()


In [None]:
# Display the first rows of the test subset selected for the chosen item.
test_item.head()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Half-open window [start_date, end_date) to zoom in on.
start_date = '2017-01-01'  # inclusive start; adjust as needed
end_date = '2017-02-01'    # exclusive end; adjust as needed

# Keep only the selected item's rows that fall inside the window.
on_or_after_start = test_item['date'] >= start_date
before_end = test_item['date'] < end_date
test_item_filtered = test_item[on_or_after_start & before_end]

# Actual and predicted values for exactly those rows.
window_index = test_item_filtered.index
y_test_filtered = y_test.loc[window_index]
y_pred_filtered = y_pred_series.loc[window_index]

# Scatter plot of actual vs. predicted values inside the window.
plt.figure(figsize=(12, 6))
window_dates = test_item_filtered['date']
plt.scatter(window_dates, y_test_filtered, label='Actual Sales', color='orange')
plt.scatter(window_dates, y_pred_filtered, label='Predicted Sales', color='blue', linestyle='--')
plt.title(f"Actual vs Predicted Sales for {item_column} ({start_date} to {end_date})")
plt.xlabel("Date")
plt.ylabel("Sales")
plt.legend()
plt.xticks(rotation=45)
plt.show()


In [None]:
import numpy as np

# Undo the log1p transform so the plot shows sales on the original scale.
# The values are recomputed from the untouched sources (`y_test`,
# `y_pred_series`) rather than from `y_test_filtered` / `y_pred_filtered`
# themselves, so re-running this cell does not apply expm1 a second time.
y_test_filtered = np.expm1(y_test.loc[test_item_filtered.index])
y_pred_filtered = np.expm1(y_pred_series.loc[test_item_filtered.index])

# Plot Actual vs Predicted Sales for the selected item within the date range
plt.figure(figsize=(12, 6))
plt.scatter(test_item_filtered['date'], y_test_filtered, label='Actual Sales', color='orange')
plt.scatter(test_item_filtered['date'], y_pred_filtered, label='Predicted Sales', color='blue', linestyle='--')
plt.xlabel("Date")
plt.ylabel("Sales")
plt.title(f"Actual vs Predicted Sales for {item_column} ({start_date} to {end_date})")
plt.legend()
plt.xticks(rotation=45)
plt.show()


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np


In [None]:
# Hyper-parameters of the gradient-boosted regressor.
xgb_params = {
    'objective': 'reg:squarederror',  # squared-error regression objective
    'n_estimators': 100,              # number of trees
    'learning_rate': 0.1,             # step size shrinkage
    'max_depth': 6,                   # maximum depth of each tree
    'random_state': 42,               # fixed seed for reproducibility
}
xgb_model = xgb.XGBRegressor(**xgb_params)

# Fit on the training data.
xgb_model.fit(X_train, y_train)


In [None]:
# Predict the target for the held-out feature rows with the fitted XGBoost model.
y_pred_xgb = xgb_model.predict(X_test)


In [None]:
# Root of the mean squared error between held-out targets and XGBoost predictions.
mse_xgb = mean_squared_error(y_true=y_test, y_pred=y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
print(f'XGBoost RMSE: {rmse_xgb}')


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Give the XGBoost prediction array the index of X_test so predictions can be
# looked up by row label.
y_pred_xgb_series = pd.Series(y_pred_xgb, index=X_test.index)

# Half-open window [start_date, end_date) to visualise; adjust as needed.
start_date = '2017-01-01'
end_date = '2017-03-01'

# One-hot column identifying the item to inspect.
item_column = 'item_3'  # swap in another one-hot column name if needed

# Test rows that belong to the item and fall inside the window.
is_item = test[item_column] == 1
in_window = (test['date'] >= start_date) & (test['date'] < end_date)
test_item = test[is_item & in_window]

# Actual and predicted values for those rows, converted back from the
# log1p scale to the original scale (applied exactly once).
y_test_item = np.expm1(y_test.loc[test_item.index])
y_pred_item = np.expm1(y_pred_xgb_series.loc[test_item.index])

# Line plot of actual vs. predicted sales for the item within the window.
plt.figure(figsize=(12, 6))
item_dates = test_item['date']
plt.plot(item_dates, y_test_item, label='Actual Sales', color='orange')
plt.plot(item_dates, y_pred_item, label='Predicted Sales', color='blue', linestyle='--')
plt.title(f"XGBoost: Actual vs. Predicted Sales for {item_column} ({start_date} to {end_date})")
plt.xlabel("Date")
plt.ylabel("Sales")
plt.legend()
plt.xticks(rotation=45)
plt.show()
