### Inspecting DataFrame 

In [29]:
#Load libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

In [30]:
#Import sales dataset
sales = pd.read_csv("sales_subset.csv", sep=',', index_col="Unnamed: 0")
print(sales.head())

   store type  department        date  weekly_sales  is_holiday  \
0      1    A           1  2010-02-05      24924.50       False   
1      1    A           1  2010-03-05      21827.90       False   
2      1    A           1  2010-04-02      57258.43       False   
3      1    A           1  2010-05-07      17413.94       False   
4      1    A           1  2010-06-04      17558.09       False   

   temperature_c  fuel_price_usd_per_l  unemployment  
0       5.727778              0.679451         8.106  
1       8.055556              0.693452         8.106  
2      16.816667              0.718284         7.808  
3      22.527778              0.748928         7.808  
4      27.050000              0.714586         7.808  


In [31]:
# Print the info about the sales DataFrame
print(sales.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10774 entries, 0 to 10773
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   store                 10774 non-null  int64  
 1   type                  10774 non-null  object 
 2   department            10774 non-null  int64  
 3   date                  10774 non-null  object 
 4   weekly_sales          10774 non-null  float64
 5   is_holiday            10774 non-null  bool   
 6   temperature_c         10774 non-null  float64
 7   fuel_price_usd_per_l  10774 non-null  float64
 8   unemployment          10774 non-null  float64
dtypes: bool(1), float64(4), int64(2), object(2)
memory usage: 768.1+ KB
None


### Statistical Analysis

In [32]:
# Print the mean of weekly_sales
print("The mean of weekly_sales is {:.3f}".format(sales["weekly_sales"].mean()), end=("\n\n"))
print('--------'*5,end=("\n\n"))
# Print the median of weekly_sales
print("The median of weekly_sales is {:.3f}".format(sales["weekly_sales"].median()))


The mean of weekly_sales is 23843.950

----------------------------------------

The median of weekly_sales is 12049.065


In [33]:
# Convert 'date' column to datetime type
sales['date'] = pd.to_datetime(sales['date'])

# Print the maximum of the date column
print(f"The recent date is {sales['date'].max().strftime('%Y-%m-%d')}")

# Print the minimum of the date column
print(f"The oldest date is {sales['date'].min().strftime('%Y-%m-%d')}")

The recent date is 2012-10-26
The oldest date is 2010-02-05


In [41]:
# measure the difference between the first quartile (25th percentile) and the third quartile (75th percentile) of a temperature_c
#Method 1
per_25th = sales["temperature_c"].quantile(0.25)
print("First Quartile (25th percentile): {:.3f}".format(per_25th))

per_75th = sales["temperature_c"].quantile(0.75)
print("Third Quartile (75th percentile): {:.3f}".format(per_75th))

diff_per = per_75th - per_25th
print("Difference between quartiles (IQR): {:.3f}".format(diff_per))

First Quartile (25th percentile): 7.583
Third Quartile (75th percentile): 24.167
Difference between quartiles (IQR): 16.583


In [42]:
#Method 2
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)
    
# Print IQR of the temperature_c column
print(sales["temperature_c"].agg(iqr))

16.583333333333336


In [43]:
# A custom IQR function
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

# Update to print IQR of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))

temperature_c           16.583333
fuel_price_usd_per_l     0.073176
unemployment             0.565000
dtype: float64


In [44]:
# Import NumPy and create custom IQR function
import numpy as np
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

# Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, np.median]))

        temperature_c  fuel_price_usd_per_l  unemployment
iqr         16.583333              0.073176         0.565
median      16.966667              0.743381         8.099


In [54]:
# Sort sales_1_1 by date
sales_1_1 = sales.sort_values("date")
print(sales_1_1)

       store type  department       date  weekly_sales  is_holiday  \
0          1    A           1 2010-02-05      24924.50       False   
6437      19    A          13 2010-02-05      38597.52       False   
1249       2    A          31 2010-02-05       3840.21       False   
6449      19    A          14 2010-02-05      17590.59       False   
6461      19    A          16 2010-02-05       4929.87       False   
...      ...  ...         ...        ...           ...         ...   
3592       6    A          99 2012-10-05        440.00       False   
8108      20    A          99 2012-10-05        660.00       False   
10773     39    A          99 2012-10-05        915.00       False   
6257      14    A          96 2012-10-12          3.00       False   
3384       6    A          77 2012-10-26        -21.63       False   

       temperature_c  fuel_price_usd_per_l  unemployment  
0           5.727778              0.679451         8.106  
6437       -6.133333              0.78036

In [57]:
# Get the cumulative sum of weekly_sales, add as cum_weekly_sales col
sales_1_1["cum_weekly_sales"] = sales_1_1["weekly_sales"].cumsum()
print(sales_1_1.head())

      store type  department       date  weekly_sales  is_holiday  \
0         1    A           1 2010-02-05      24924.50       False   
6437     19    A          13 2010-02-05      38597.52       False   
1249      2    A          31 2010-02-05       3840.21       False   
6449     19    A          14 2010-02-05      17590.59       False   
6461     19    A          16 2010-02-05       4929.87       False   

      temperature_c  fuel_price_usd_per_l  unemployment  cum_weekly_sales  \
0          5.727778              0.679451         8.106          24924.50   
6437      -6.133333              0.780365         8.350          63522.02   
1249       4.550000              0.679451         8.324          67362.23   
6449      -6.133333              0.780365         8.350          84952.82   
6461      -6.133333              0.780365         8.350          89882.69   

      cum_max_sales  
0          24924.50  
6437       38597.52  
1249       38597.52  
6449       38597.52  
6461       3

In [56]:
sales_1_1["cum_max_sales"] = sales_1_1["weekly_sales"].cummax()

# See the columns you calculated
print(sales_1_1.head())


      store type  department       date  weekly_sales  is_holiday  \
0         1    A           1 2010-02-05      24924.50       False   
6437     19    A          13 2010-02-05      38597.52       False   
1249      2    A          31 2010-02-05       3840.21       False   
6449     19    A          14 2010-02-05      17590.59       False   
6461     19    A          16 2010-02-05       4929.87       False   

      temperature_c  fuel_price_usd_per_l  unemployment  cum_weekly_sales  \
0          5.727778              0.679451         8.106          24924.50   
6437      -6.133333              0.780365         8.350          63522.02   
1249       4.550000              0.679451         8.324          67362.23   
6449      -6.133333              0.780365         8.350          84952.82   
6461      -6.133333              0.780365         8.350          89882.69   

      cum_max_sales  
0          24924.50  
6437       38597.52  
1249       38597.52  
6449       38597.52  
6461       3

### Dropping duplicates

In [58]:
# Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=["store","type"])
print(store_types.head())

      store type  department       date  weekly_sales  is_holiday  \
0         1    A           1 2010-02-05      24924.50       False   
901       2    A           1 2010-02-05      35034.06       False   
1798      4    A           1 2010-02-05      38724.42       False   
2699      6    A           1 2010-02-05      25619.00       False   
3593     10    B           1 2010-02-05      40212.84       False   

      temperature_c  fuel_price_usd_per_l  unemployment  
0          5.727778              0.679451         8.106  
901        4.550000              0.679451         8.324  
1798       6.533333              0.686319         8.623  
2699       4.683333              0.679451         7.259  
3593      12.411111              0.782478         9.765  


In [59]:
# Drop duplicate store/department combinations
store_depts = sales.drop_duplicates(subset=["store","department"])
print(store_depts.head())

    store type  department       date  weekly_sales  is_holiday  \
0       1    A           1 2010-02-05      24924.50       False   
12      1    A           2 2010-02-05      50605.27       False   
24      1    A           3 2010-02-05      13740.12       False   
36      1    A           4 2010-02-05      39954.04       False   
48      1    A           5 2010-02-05      32229.38       False   

    temperature_c  fuel_price_usd_per_l  unemployment  
0        5.727778              0.679451         8.106  
12       5.727778              0.679451         8.106  
24       5.727778              0.679451         8.106  
36       5.727778              0.679451         8.106  
48       5.727778              0.679451         8.106  


In [60]:
# Subset the rows where is_holiday is True and drop duplicate dates
holiday_dates = sales[sales["is_holiday"]==True].drop_duplicates(subset="date")
# Print date col of holiday_dates
print(holiday_dates)

      store type  department       date  weekly_sales  is_holiday  \
498       1    A          45 2010-09-10         11.47        True   
691       1    A          77 2011-11-25       1431.00        True   
2315      4    A          47 2010-02-12        498.00        True   
6735     19    A          39 2012-09-07         13.41        True   
6810     19    A          47 2010-12-31       -449.00        True   
6815     19    A          47 2012-02-10         15.00        True   
6820     19    A          48 2011-09-09        197.00        True   

      temperature_c  fuel_price_usd_per_l  unemployment  
498       25.938889              0.677602         7.787  
691       15.633333              0.854861         7.866  
2315      -1.755556              0.679715         8.623  
6735      22.333333              1.076766         8.193  
6810      -1.861111              0.881278         8.067  
6815       0.338889              1.010723         7.943  
6820      20.155556              1.038197