In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the processed Rossmann sales table and discard the 'Date' column in one
# step, so the frame bound to `df` never carries it.
sales_csv = '../data/processed/rossmann_sales_df.csv'
df = pd.read_csv(sales_csv).drop(columns='Date')

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,Day,Month,Quarter,...,MedianSales_LastWeek,MinSales_LastWeek,MaxSales_LastWeek,StdSales_LastWeek,TotalCustomers_LastWeek,MeanCustomers_LastWeek,MedianCustomers_LastWeek,MinCustomers_LastWeek,MaxCustomers_LastWeek,StdCustomers_LastWeek
0,1,4,4709,571,1,0,0,31,1,1,...,4601.0,0.0,5598.0,1951.6814,3592.0,513.142857,560.0,0.0,676.0,233.408389
1,1,5,5681,630,1,1,0,8,2,1,...,5970.0,0.0,7032.0,2340.568592,4167.0,595.285714,678.0,0.0,762.0,264.578731
2,1,6,5370,656,1,0,0,9,2,1,...,5970.0,0.0,7032.0,2342.155946,4139.0,591.285714,678.0,0.0,762.0,263.68271
3,1,7,0,0,0,0,0,10,2,1,...,5681.0,0.0,7032.0,2319.977258,4094.0,584.857143,675.0,0.0,762.0,261.097956
4,1,1,4409,599,1,0,0,11,2,1,...,5681.0,0.0,7032.0,2319.977258,4094.0,584.857143,675.0,0.0,762.0,261.097956


In [3]:
# Summarise each column's dtype and non-null count for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 882294 entries, 0 to 882293
Data columns (total 52 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Store                      882294 non-null  int64  
 1   DayOfWeek                  882294 non-null  int64  
 2   Sales                      882294 non-null  int64  
 3   Customers                  882294 non-null  int64  
 4   Open                       882294 non-null  int64  
 5   Promo                      882294 non-null  int64  
 6   SchoolHoliday              882294 non-null  int64  
 7   Day                        882294 non-null  int64  
 8   Month                      882294 non-null  int64  
 9   Quarter                    882294 non-null  int64  
 10  CompetitionDistance        882294 non-null  float64
 11  CompetitionOpenSinceMonth  882294 non-null  float64
 12  CompetitionOpenSinceYear   882294 non-null  float64
 13  Promo2                     88

In [4]:
# Select numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Downcast int64 -> int32 and float64 -> float32 to shrink the frame's memory
# footprint. A plain astype('int32') silently wraps values that do not fit, so
# an int64 column is only narrowed when every value lies inside the int32
# range; a column that does not fit is left as int64.
INT32_MIN, INT32_MAX = -2**31, 2**31 - 1

downcast_dtypes = {}
for col in numerical_cols:
    if df[col].dtype == 'float64':
        # float32 keeps roughly 7 significant decimal digits.
        downcast_dtypes[col] = 'float32'
    elif df[col].between(INT32_MIN, INT32_MAX).all():
        downcast_dtypes[col] = 'int32'

# Build the downcast frame in one call; columns missing from the mapping keep
# their current dtype, so re-running this cell is a no-op.
df = df.astype(downcast_dtypes)

In [5]:
# Re-inspect the column dtypes after the downcast step (int32 / float32 expected).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 882294 entries, 0 to 882293
Data columns (total 52 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Store                      882294 non-null  int32  
 1   DayOfWeek                  882294 non-null  int32  
 2   Sales                      882294 non-null  int32  
 3   Customers                  882294 non-null  int32  
 4   Open                       882294 non-null  int32  
 5   Promo                      882294 non-null  int32  
 6   SchoolHoliday              882294 non-null  int32  
 7   Day                        882294 non-null  int32  
 8   Month                      882294 non-null  int32  
 9   Quarter                    882294 non-null  int32  
 10  CompetitionDistance        882294 non-null  float32
 11  CompetitionOpenSinceMonth  882294 non-null  float32
 12  CompetitionOpenSinceYear   882294 non-null  float32
 13  Promo2                     88

In [6]:
# Column to predict; every other column is used as a feature.
target = 'Sales'

# Build the feature matrix and target vector, then keep only the "train" half
# of a seeded 50/50 split as a reproducible subsample. The other half is
# discarded.
X, _, y, _ = train_test_split(
    df.drop(columns=[target]),
    df[target],
    test_size=0.5,
    random_state=42,
)

In [7]:
# Train a Random Forest model.
# 'Sales' is a continuous quantity, so this is a regression problem. A
# classifier would treat every distinct sales value as its own class, which
# gives importances that say nothing about predicting the sales amount and
# makes fitting far more expensive than necessary.
rf_model = RandomForestRegressor(n_jobs=-1, random_state=42)
rf_model.fit(X, y)

# Get feature importances (impurity-based, one value per column of X)
feature_importances = rf_model.feature_importances_

# Create a DataFrame to visualize feature importances, largest first
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
# barh draws the first row at the bottom; flip the axis so the most important
# feature is at the top of the chart.
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance')
plt.show()


: 

In [7]:
# Train a linear model to get signed, per-feature coefficients.
# 'Sales' is continuous, so a linear *regression* is the appropriate linear
# model. Logistic regression would treat each distinct sales value as a class,
# and coef_[0] would then expose the coefficients of a single class only.
# Features are standardized first so that coefficient magnitudes are
# comparable across columns measured on different scales.
X_scaled = StandardScaler().fit_transform(X)
lr_model = LinearRegression()
lr_model.fit(X_scaled, y)

# Get coefficients (one standardized coefficient per column of X)
coefficients = lr_model.coef_

# Create a DataFrame to visualize feature importances, largest coefficient first
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': coefficients})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
# barh draws the first row at the bottom; flip the axis so the largest
# coefficient is at the top of the chart.
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance (Linear Regression)')
plt.show()


: 

In [None]:
# Calculate permutation importance for the fitted random forest.
# Each feature is shuffled n_repeats times and the model re-scored, so this is
# one of the most expensive cells in the notebook; n_jobs=-1 spreads the
# per-feature work across all cores.
# NOTE(review): X / y are the same rows the model was fitted on, so these
# scores describe what the model relies on rather than how it generalizes --
# consider scoring on held-out rows instead.
result = permutation_importance(
    rf_model, X, y, n_repeats=10, random_state=42, n_jobs=-1
)

# Get feature importances (mean score drop over the repeats)
perm_importance = result.importances_mean

# Create a DataFrame to visualize permutation importance, largest first
perm_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': perm_importance})
perm_importance_df = perm_importance_df.sort_values(by='Importance', ascending=False)

# Plot permutation importance
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(perm_importance_df['Feature'], perm_importance_df['Importance'])
# barh draws the first row at the bottom; flip the axis so the most important
# feature is at the top of the chart.
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Permutation Importance')
plt.show()
