In [1]:
#enter here all those 'from .... import ....'


# numpy as np
try:
    import numpy as np
    print('NumPy already installed, only imported')
except:
    !pip install numpy
    import numpy as np
    print('NumPy was not installed, installed and imported')
# pandas as pd
try:
    import pandas as pd
    print('pandas already installed, only imported')
except:
    !pip install pandas
    import pandas as pd
    print('pandas was not installed, installed and imported')   
      
# pyplot as plt
try:
    import matplotlib.pyplot as plt
    print('PyPlot already installed, only imported')
except:
    !pip install matplotlib
    import matplotlib.pyplot as plt
    print('PyPlot was not installed, installed and imported')

# statsmodels as sm    
try:
    import statsmodels.api as sm
    print('statsmodels already installed, only imported')
except:
    !pip install statsmodels
    import statsmodels.api as sm
    print('statsmodels was not installed, installed and imported')    

NumPy already installed, only imported
pandas already installed, only imported
PyPlot already installed, only imported
statsmodels already installed, only imported


In [2]:
df = pd.read_csv('Walmart Data Analysis and Forcasting.csv')
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# Stap 1: Data laden en voorbewerking
data = pd.read_csv("Walmart Data Analysis and Forcasting.csv")

# Selecteren van features en target
features = data.drop(["Weekly_Sales"], axis=1)
target = data["Weekly_Sales"]

# Opsplitsen van de dataset in trainings- en testset
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Preprocessing pipelines voor numerieke en categorische features
numeric_features = ["Temperature", "Fuel_Price", "CPI", "Unemployment"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_features = ["Store", "Holiday_Flag"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# Stap 2: ML-model opzetten in een pipeline
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

# Stap 3: Trainen en evalueren van het model
model.fit(X_train, y_train)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print("Trainingscore:", train_score)
print("Testscore:", test_score)

# Stap 4: Voorspellen van de verkoop voor de volgende week
next_week_data = pd.DataFrame({
    "Store": [1],
    "Date": ["02-06-2010"],
    "Holiday_Flag": [0],
    "Temperature": [44.5],
    "Fuel_Price": [2.75],
    "CPI": [211.42],
    "Unemployment": [8.0]
})

next_week_sales = model.predict(next_week_data)
print("Voorspelde verkoop voor volgende week:", next_week_sales)


Trainingscore: 0.9197415751696963
Testscore: 0.9208487115819616
Voorspelde verkoop voor volgende week: [1570114.74990685]


In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Stap 1: Data laden
data = pd.read_csv("Walmart Data Analysis and Forcasting.csv")

# Datumkolom omzetten naar datetime-formaat
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y')

# Selecteren van features en target
features = data.drop(["Weekly_Sales"], axis=1)
target = data["Weekly_Sales"]

# Methode 1: Correlatie-analyse
correlation_scores = features.corrwith(target)
top_features_correlation = correlation_scores.abs().nlargest(3).index.tolist()

# Methode 2: Feature Importance Score
numeric_features = ["Temperature", "Fuel_Price", "CPI", "Unemployment"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_features = ["Store", "Holiday_Flag"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(random_state=42))
])

model.fit(features, target)
feature_importances = model.named_steps["regressor"].feature_importances_
all_features = numeric_features + list(model.named_steps["preprocessor"].transformers_[1][1].get_feature_names_out(categorical_features))
top_features_importance = np.array(all_features)[np.argsort(feature_importances)[::-1][:3]].tolist()

# Methode 3: L1-regularisatie
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", Lasso(alpha=0.01, random_state=42))
])

model.fit(features, target)
coefficients = model.named_steps["regressor"].coef_
top_features_lasso = np.array(all_features)[np.argsort(np.abs(coefficients))[::-1][:3]].tolist()

print("Top 3 voorspellende kenmerken volgens correlatie-analyse:")
print(top_features_correlation)
print("\nTop 3 voorspellende kenmerken volgens feature importance score:")
print(top_features_importance)
print("\nTop 3 voorspellende kenmerken volgens L1-regularisatie:")
print(top_features_lasso)


Top 3 voorspellende kenmerken volgens correlatie-analyse:
['Store', 'Unemployment', 'CPI']

Top 3 voorspellende kenmerken volgens feature importance score:
['Store_20', 'Store_4', 'Store_14']

Top 3 voorspellende kenmerken volgens L1-regularisatie:
['Store_4', 'Store_13', 'Store_10']


  model = cd_fast.sparse_enet_coordinate_descent(


In [23]:
from sklearn.metrics import r2_score

predicted_sales = model.predict(X_test)
r2 = r2_score(y_test, predicted_sales)
print("R2-score: ", r2)

R2-score:  0.921981558055986


In [21]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, predicted_sales)
print("Mean Absolute Error (MAE): ", mae)

Mean Absolute Error (MAE):  90485.96067380511
