# Preliminaries

## Import Packages

In [26]:
#--Basics---------------
import pandas as pd
import numpy as np

#--Data Visualization----
import matplotlib.pyplot as plt
import seaborn as sns

#---Scikit-Learn--------
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, MaxAbsScaler, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression, Lasso
from sklearn.impute import SimpleImputer
from sklearn.utils import estimator_html_repr
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

#---Misc------------------
from mlxtend.feature_selection import SequentialFeatureSelector as MLSeqFeatSelector
from patsy import dmatrix
import statsmodels.api as sm

## Define Data

In [36]:
# Load the sales data from sales_data.csv, then write a copy to df.csv.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra leading column -- confirm that is intended.
data = pd.read_csv(r"sales_data.csv")
data.to_csv("df.csv")
def numeric_categorical_attributes(data):
    """Split the columns of ``data`` into numeric and categorical labels.

    A column counts as numeric when ``pd.to_numeric`` can convert it without
    raising; every other column is treated as categorical.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple of (set, set)
        ``(numeric, categorical)`` sets of column labels.
    """
    numeric, categorical = set(), set()
    for col in data.columns:
        try:
            # One vectorised conversion per column instead of converting
            # value by value; the converted values themselves are discarded.
            pd.to_numeric(data[col])
        except (ValueError, TypeError):
            # Only a failed conversion means "categorical"; unrelated errors
            # (e.g. KeyboardInterrupt) must propagate instead of being hidden.
            categorical.add(col)
        else:
            numeric.add(col)
    return (numeric, categorical)

# Classify every column of the loaded frame as numeric or categorical.
cols_numeric, cols_categorical = numeric_categorical_attributes(data)

# Data Cleaning

In [37]:
def clean_out_nulls(data, drop_cols, cols_numeric, cols_categorical):
    """Drop the requested columns and impute missing values in the rest.

    Numeric columns are filled with their median and categorical columns
    with their most frequent value. ``data`` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame.
    drop_cols : iterable
        Labels of the columns to remove from the result.
    cols_numeric : set
        Labels of the numeric columns of ``data``.
    cols_categorical : set
        Labels of the categorical columns of ``data``.

    Returns
    -------
    tuple
        ``(result, new_cols_numeric, new_cols_categorical)`` where the two
        sets no longer contain any label listed in ``drop_cols``.
    """
    # The columns to drop are chosen by the caller, not by a null count.
    result = data.drop(columns=drop_cols)
    new_cols_numeric = cols_numeric.difference(drop_cols)
    new_cols_categorical = cols_categorical.difference(drop_cols)

    # Impute missing numeric values with the median of the respective column.
    for col in new_cols_numeric:
        result[col] = data[col].fillna(data[col].median())

    # Impute missing categorical values with the mode of the respective column.
    for col in new_cols_categorical:
        modes = data[col].mode()
        if modes.empty:
            # The column has no non-null value, so there is no mode to
            # impute with; leave it as is instead of failing on modes[0].
            continue
        result[col] = data[col].fillna(modes.iloc[0])

    return (result, new_cols_numeric, new_cols_categorical)

# Remove the 'Order ID' and 'Order Date' columns and impute the missing
# values in the remaining columns.
drop_cols = ['Order ID', 'Order Date']
data_1, cols_numeric, cols_categorical = clean_out_nulls(data, drop_cols, cols_numeric, cols_categorical)

In [39]:
def remake_column(data, cols_numeric, cols_categorical, col_number, newName, newValues=None, myFunc=None):
    """Rename one column and optionally recode its values.

    The column is selected by position. If ``newValues`` is given, the sorted
    distinct values of the column are paired, in order, with ``newValues``
    and every cell is replaced through that pairing. Otherwise, if ``myFunc``
    is given, it is applied to every cell. If neither is given the column is
    only renamed. ``data`` and the two input sets are not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame.
    cols_numeric : set
        Labels of the numeric columns of ``data``.
    cols_categorical : set
        Labels of the categorical columns of ``data``.
    col_number : int
        Position of the column to remake.
    newName : hashable
        New label for the column.
    newValues : sequence, optional
        Replacement values, aligned with the sorted distinct old values.
        Pairing stops at the shorter of the two sequences, so old values
        left without a partner become missing in the result.
    myFunc : callable, optional
        Function applied to each cell when ``newValues`` is ``None``.

    Returns
    -------
    tuple
        ``(new_data, cols_numeric, cols_categorical)`` with ``newName``
        replacing the old label in whichever set is updated.
    """
    oldName = data.columns[col_number]
    new_data = data.rename(columns={oldName: newName})

    if newValues is not None:
        oldValues = np.sort(data.loc[:, oldName].unique())
        new_data[newName] = new_data[newName].map(dict(zip(oldValues, newValues)))
    elif myFunc is not None:
        new_data[newName] = new_data[newName].map(myFunc)
    # With neither newValues nor myFunc there is nothing to recode; calling
    # .map(None) would only raise, so the column is just renamed.

    # A label that is not registered as numeric is tracked as categorical.
    if oldName in cols_numeric:
        cols_numeric = cols_numeric.difference({oldName}).union({newName})
    else:
        cols_categorical = cols_categorical.difference({oldName}).union({newName})

    return new_data, cols_numeric, cols_categorical

# Relabel the column at position 2 as "Category"; its sorted distinct values
# are replaced, in order, by the four labels below.
data_2, cols_numeric, cols_categorical = remake_column(
    data_1,
    cols_numeric,
    cols_categorical,
    2,
    "Category",
    newValues=["Groceries", "Sports", "Clothing", "Electronics"],
)
# Relabel the column at position 3 as "Zip Code", keeping only the last five
# characters of every value.
data_2, cols_numeric, cols_categorical = remake_column(
    data_2,
    cols_numeric,
    cols_categorical,
    3,
    "Zip Code",
    myFunc=lambda text: text[-5:],
)
data_2


Unnamed: 0,Product,Product_ean,Category,Zip Code,Quantity Ordered,Price Each,Cost price,turnover,margin
0,iPhone,5.638009e+12,Clothing,02215,1,700.00,231.0000,700.00,469.0000
1,Lightning Charging Cable,5.563320e+12,Groceries,97035,1,14.95,7.4750,14.95,7.4750
2,Wired Headphones,2.113973e+12,Clothing,94016,2,11.99,5.9950,23.98,11.9900
3,27in FHD Monitor,3.069157e+12,Sports,90001,1,149.99,97.4935,149.99,52.4965
4,Wired Headphones,9.692681e+12,Electronics,73301,1,11.99,5.9950,11.99,5.9950
...,...,...,...,...,...,...,...,...,...
185945,Lightning Charging Cable,6.545974e+12,Electronics,94016,1,14.95,7.4750,14.95,7.4750
185946,AA Batteries (4-pack),5.352480e+12,Electronics,90001,2,3.84,1.9200,7.68,3.8400
185947,Vareebadd Phone,2.674213e+12,Groceries,98101,1,400.00,132.0000,400.00,268.0000
185948,Wired Headphones,5.216304e+12,Groceries,75001,1,11.99,5.9950,11.99,5.9950
