In [27]:
import pandas as pd
import plotly.express as px
import statsmodels.api as sm
import plotly.graph_objects as go
from statsmodels.stats.outliers_influence import OLSInfluence


In [4]:
def load_csv_to_dataframe(file_path):
    """Read a latin1-encoded CSV into a DataFrame, reporting the outcome on stdout.

    Parameters
    ----------
    file_path
        Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame on success (a confirmation line and ``head()`` are
        printed first). On any failure a message is printed and ``None`` is
        returned; no exception is propagated to the caller.
    """
    try:
        loaded = pd.read_csv(file_path, encoding='latin1')
        print(f"Successfully loaded data from {file_path}")
        print(loaded.head())
    except Exception as err:
        # Choose the message matching the specific failure; anything that is
        # not recognised falls through to the generic text.
        if isinstance(err, FileNotFoundError):
            message = f"Error: The file {file_path} was not found."
        elif isinstance(err, pd.errors.EmptyDataError):
            message = f"Error: The file {file_path} is empty."
        elif isinstance(err, pd.errors.ParserError):
            message = f"Error: The file {file_path} could not be parsed."
        elif isinstance(err, UnicodeDecodeError):
            message = f"Encoding error while reading the file: {err}"
        else:
            message = f"An unexpected error occurred: {err}"
        print(message)
        return None
    return loaded

In [5]:
if __name__ == "__main__":
    # Relative path: resolved against the current working directory.
    file_path = 'sales_data_sample.csv'
    df = load_csv_to_dataframe(file_path)
    # load_csv_to_dataframe reports failures by printing a message and
    # returning None. Stop here with a clear error so later cells do not
    # fail with an unrelated exception when they index into `df`.
    if df is None:
        raise RuntimeError(f"Could not load {file_path}; see the message printed above.")

Successfully loaded data from sales_data_sample.csv
   ORDERNUMBER  QUANTITYORDERED  PRICEEACH  ORDERLINENUMBER    SALES  \
0        10107               30      95.70                2  2871.00   
1        10121               34      81.35                5  2765.90   
2        10134               41      94.74                2  3884.34   
3        10145               45      83.26                6  3746.70   
4        10159               49     100.00               14  5205.27   

         ORDERDATE   STATUS  QTR_ID  MONTH_ID  YEAR_ID  ...  \
0   2/24/2003 0:00  Shipped       1         2     2003  ...   
1    5/7/2003 0:00  Shipped       2         5     2003  ...   
2    7/1/2003 0:00  Shipped       3         7     2003  ...   
3   8/25/2003 0:00  Shipped       3         8     2003  ...   
4  10/10/2003 0:00  Shipped       4        10     2003  ...   

                    ADDRESSLINE1  ADDRESSLINE2           CITY STATE  \
0        897 Long Airport Avenue           NaN            NYC    

In [15]:
# Design matrix: the two predictor columns plus the constant column added by statsmodels.
design_matrix = sm.add_constant(df[['PRICEEACH', 'QUANTITYORDERED']])

# Ordinary least squares fit of SALES on the design matrix.
model = sm.OLS(df['SALES'], design_matrix).fit()

# Influence diagnostics computed from the fitted model.
influence = OLSInfluence(model)

# Only element 0 of `influence.dffits` (the per-observation DFFITS values) is kept;
# the remaining part of that result is not used in this notebook.
dffits = influence.dffits[0]
print(dffits)



0      -0.022117
1      -0.010745
2      -0.024088
3      -0.022837
4      -0.029082
          ...   
2818   -0.028878
2819    0.002231
2820    0.002037
2821   -0.001276
2822   -0.022455
Length: 2823, dtype: float64


In [18]:
# Identify the outliers.
# DFFITS is signed, so the cutoff is applied to its magnitude: a one-sided
# comparison would never flag observations with a large negative DFFITS.
# This is the same mask the plotting cells use to highlight outliers.
dffits_cutoff = 2 * dffits.std()
outliers_dffits = df[abs(dffits) > dffits_cutoff]

print("Outliers based on DFFITS:\n", outliers_dffits)

Outliers based on DFFITS:
       ORDERNUMBER  QUANTITYORDERED  PRICEEACH  ORDERLINENUMBER     SALES  \
20          10341               41     100.00                9   7737.93   
22          10375               21      34.91               12    733.11   
27          10112               29     100.00                1   7209.11   
30          10150               45     100.00                8  10993.50   
32          10174               34     100.00                4   8014.82   
...           ...              ...        ...              ...       ...   
2295        10324               48     100.00                4   8209.44   
2356        10156               20      41.02                1    820.40   
2430        10395               45     100.00                3   8977.05   
2505        10388               46     100.00                2  10066.60   
2634        10336               46     100.00                2   9558.80   

            ORDERDATE   STATUS  QTR_ID  MONTH_ID  YEAR_ID  .

In [30]:
# Plotting DFFITS
fig_dffits = px.scatter(x=dffits, y=model.resid, title='DFFITS vs Residuals', labels={'x': 'DFFITS', 'y': 'Residuals'})
fig_dffits.add_scatter(x=dffits, y=[0]*len(dffits), mode='lines', name='Zero Line', line=dict(color='red', dash='dash'))
# Add the outlier markers to this cell's own figure, so the cell does not
# depend on a figure object that is created further down the notebook.
is_outlier = abs(dffits) > 2 * dffits.std()
fig_dffits.add_scatter(x=dffits[is_outlier], y=model.resid[is_outlier], mode='markers', name='Outliers', marker=dict(color='orange', size=10))
fig_dffits.show()

In [20]:
# Plotting DFFITS with outliers highlighted
fig_dffits_outliers = px.scatter(x=dffits, y=model.resid, title='DFFITS vs Residuals with Outliers Highlighted', labels={'x': 'DFFITS', 'y': 'Residuals'})
fig_dffits_outliers.add_scatter(x=dffits, y=[0]*len(dffits), mode='lines', name='Zero Line', line=dict(color='red', dash='dash'))
# The figure's axes are DFFITS (x) and residuals (y), so the highlighted points
# must use those coordinates too. Look them up by the row labels of the
# already-identified outliers rather than plotting PRICEEACH/SALES values.
outlier_rows = outliers_dffits.index
fig_dffits_outliers.add_scatter(x=dffits.loc[outlier_rows], y=model.resid.loc[outlier_rows], mode='markers', name='Outliers', marker=dict(color='orange', size=10))
fig_dffits_outliers.show()

In [21]:
# Plotting DFFITS with lower and upper bounds
# Bounds are placed two standard deviations either side of the mean DFFITS.
lower_bound_dffits = dffits.mean() - 2 * dffits.std()
upper_bound_dffits = dffits.mean() + 2 * dffits.std()
fig_dffits_bounds = px.scatter(x=dffits, y=model.resid, title='DFFITS vs Residuals with Bounds', labels={'x': 'DFFITS', 'y': 'Residuals'})
fig_dffits_bounds.add_scatter(x=dffits, y=[0]*len(dffits), mode='lines', name='Zero Line', line=dict(color='red', dash='dash'))
# The bounds are DFFITS values and DFFITS is on the x-axis, so each bound is
# drawn as a vertical line spanning the observed residual range (not as a
# horizontal line on the residual axis).
resid_span = [model.resid.min(), model.resid.max()]
fig_dffits_bounds.add_scatter(x=[lower_bound_dffits, lower_bound_dffits], y=resid_span, mode='lines', name='Lower Bound', line=dict(color='blue', dash='dash'))
fig_dffits_bounds.add_scatter(x=[upper_bound_dffits, upper_bound_dffits], y=resid_span, mode='lines', name='Upper Bound', line=dict(color='green', dash='dash'))
fig_dffits_bounds.show()

In [32]:
# Plotting DFFITS with outliers highlighted and lower and upper bounds
# Uses lower_bound_dffits / upper_bound_dffits computed in the preceding bounds cell.
fig_dffits_outliers_bounds = px.scatter(x=dffits, y=model.resid, title='DFFITS vs Residuals with Outliers Highlighted and Bounds', labels={'x': 'DFFITS', 'y': 'Residuals'})
fig_dffits_outliers_bounds.add_scatter(x=dffits, y=[0]*len(dffits), mode='lines', name='Zero Line', line=dict(color='red', dash='dash'))
# The bounds are DFFITS values and DFFITS is on the x-axis, so each bound is
# drawn as a vertical line spanning the observed residual range.
resid_span = [model.resid.min(), model.resid.max()]
fig_dffits_outliers_bounds.add_scatter(x=[lower_bound_dffits, lower_bound_dffits], y=resid_span, mode='lines', name='Lower Bound', line=dict(color='blue', dash='dash'))
fig_dffits_outliers_bounds.add_scatter(x=[upper_bound_dffits, upper_bound_dffits], y=resid_span, mode='lines', name='Upper Bound', line=dict(color='green', dash='dash'))
# Highlight outliers on the same axes (DFFITS vs Residuals); the mask is built
# once and reused for both coordinates.
is_outlier = abs(dffits) > 2 * dffits.std()
fig_dffits_outliers_bounds.add_scatter(x=dffits[is_outlier], y=model.resid[is_outlier], mode='markers', name='Outliers', marker=dict(color='orange', size=10))
fig_dffits_outliers_bounds.show()