In [32]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [33]:
# Load in the datasets
# Every CSV is read from the same directory; keeping that directory in one
# constant means the location only has to be edited in a single place.
DATA_DIR = '../data'

df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_oil = pd.read_csv(f'{DATA_DIR}/oil.csv')
df_holidays_events = pd.read_csv(f'{DATA_DIR}/holidays_events.csv')
df_stores = pd.read_csv(f'{DATA_DIR}/stores.csv')
df_transactions = pd.read_csv(f'{DATA_DIR}/transactions.csv')

In [34]:
# Parse the 'date' column of each frame that is later joined on 'date' into
# pandas datetimes (the column is overwritten in place on each frame).
# NOTE(review): df_test is not converted here — confirm whether it also carries
# a 'date' column that should be parsed.
for _frame in (df_train, df_oil, df_holidays_events, df_transactions):
    _frame['date'] = pd.to_datetime(_frame['date'])

In [35]:
# Build the analysis frame by left-joining each auxiliary table onto the
# training rows, so a training row is kept even when a lookup has no match.
# Both df_stores and df_holidays_events contribute a 'type' column; in the
# merged result they appear as 'type_x' (stores) and 'type_y' (holidays).
# NOTE(review): if df_holidays_events holds several rows for one date, the
# final join repeats the matching training rows — confirm the row count.
df = (
    df_train
    .merge(df_transactions, on=['date', 'store_nbr'], how='left')
    .merge(df_stores, on='store_nbr', how='left')
    .merge(df_oil, on='date', how='left')
    .merge(df_holidays_events, on='date', how='left')
)

In [36]:
# Render floats with two decimal places in every DataFrame/Series display.
pd.options.display.float_format = '{:.2f}'.format

# Task 1. Data Exploration

## a. Explore the dataset by displaying the first few rows, summary statistics, and data types of each column.

### Display the first 5 rows

In [37]:
# Preview the first five rows of the merged frame.
df.head(n=5)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,transactions,city,state,type_x,cluster,dcoilwtico,type_y,locale,locale_name,description,transferred
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,,Quito,Pichincha,D,13,,Holiday,National,Ecuador,Primer dia del ano,False
1,1,2013-01-01,1,BABY CARE,0.0,0,,Quito,Pichincha,D,13,,Holiday,National,Ecuador,Primer dia del ano,False
2,2,2013-01-01,1,BEAUTY,0.0,0,,Quito,Pichincha,D,13,,Holiday,National,Ecuador,Primer dia del ano,False
3,3,2013-01-01,1,BEVERAGES,0.0,0,,Quito,Pichincha,D,13,,Holiday,National,Ecuador,Primer dia del ano,False
4,4,2013-01-01,1,BOOKS,0.0,0,,Quito,Pichincha,D,13,,Holiday,National,Ecuador,Primer dia del ano,False


### Display the summary statistics for numerical columns

In [None]:
# Summary statistics of the merged frame, using describe()'s default settings.
df.describe()

### Display the data type of each column

In [None]:
# Data type of every column in the merged frame.
df.dtypes

## b. Identify missing values, outliers, and unique values in categorical columns.

### Check for missing values

In [None]:
# Number of missing values in each column of the merged frame.
df.isna().sum()

### Check for outliers
### Start by visualizing the data to see how its values are distributed

In [None]:
# Restrict the analysis to rows where at least one item is on promotion, so
# the zero rows do not take part in the quartile computation.
non_zero_onpromotion = df.loc[df['onpromotion'] > 0]
promo_counts = non_zero_onpromotion['onpromotion']

# Quartiles and inter-quartile range of the non-zero promotion counts
Q1, Q3 = promo_counts.quantile(0.25), promo_counts.quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier when its count falls outside the 1.5 * IQR fences
below_lower_fence = promo_counts < (Q1 - 1.5 * IQR)
above_upper_fence = promo_counts > (Q3 + 1.5 * IQR)
outliers_onpromotion = non_zero_onpromotion[below_lower_fence | above_upper_fence]

# Report the flagged rows
print("Outliers in 'onpromotion' for non-zero values:")
print(outliers_onpromotion)

In [None]:
plt.figure(figsize=(12, 8))

# One row of the 2x2 grid per column: boxplot on the left, histogram on the right.
panel_specs = [
    ('sales', 'Boxplot of Sales', 'Histogram of Sales'),
    ('onpromotion', 'Boxplot of Onpromotion', 'Histogram of Onpromotion'),
]

for row, (column_name, box_title, hist_title) in enumerate(panel_specs):
    first_slot = 2 * row + 1  # subplot slots are numbered from 1, row by row

    plt.subplot(2, 2, first_slot)
    sns.boxplot(x=df[column_name])
    plt.title(box_title)

    plt.subplot(2, 2, first_slot + 1)
    sns.histplot(df[column_name], bins=100)
    plt.title(hist_title)

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Sample DataFrame
# df = pd.read_csv('your_data.csv')  # Load your DataFrame

# Function to clean data using IQR
def iqr_outlier_removal(column, whisker=1.5):
    """Return the values of ``column`` that lie inside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of ``column``. Both fences are
    inclusive.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to filter.
    whisker : float, default 1.5
        Multiplier applied to the IQR to position the fences. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.Series
        The subset of ``column`` within the fences, keeping the original
        index labels. Missing values are not returned because NaN fails both
        comparisons.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr
    within_fences = (column >= lower_bound) & (column <= upper_bound)
    return column[within_fences]

# Function to plot IQR results
def plot_iqr_results(df, columns=('sales', 'onpromotion')):
    """Plot each column before and after IQR-based outlier removal.

    One 16x10 figure is shown per column, laid out as a 2x2 grid: the top row
    holds the boxplot and histogram of the raw column, the bottom row holds
    the same two plots for the values kept by ``iqr_outlier_removal``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``columns``.
    columns : iterable of str, default ('sales', 'onpromotion')
        Columns to plot. Each name is capitalised for use in the plot titles.
    """
    # Clean every column first so all filtering is done before any figure is drawn.
    cleaned_by_column = {name: iqr_outlier_removal(df[name]) for name in columns}

    for name, cleaned in cleaned_by_column.items():
        label = name.capitalize()
        # (values, histogram colour, title suffix) for the top and bottom rows
        rows = [
            (df[name], 'blue', f'{label} Before Cleaning'),
            (cleaned, 'green', f'{label} After IQR Cleaning'),
        ]

        plt.figure(figsize=(16, 10))
        for row, (values, color, suffix) in enumerate(rows):
            plt.subplot(2, 2, 2 * row + 1)
            sns.boxplot(x=values)
            plt.title(f'Boxplot of {suffix}')

            plt.subplot(2, 2, 2 * row + 2)
            plt.hist(values, bins=50, color=color, alpha=0.7)
            plt.title(f'Histogram of {suffix}')

        plt.tight_layout()
        plt.show()

# Draw the before/after IQR comparison figures for the merged frame `df`
plot_iqr_results(df)


In [None]:
from scipy import stats

# Function to clean data using Z-score
def z_score_outlier_removal(column, threshold=3):
    """Return the values of ``column`` whose absolute z-score is below ``threshold``.

    Z-scores are computed with ``scipy.stats.zscore`` on the non-missing
    values only. Missing values are dropped up front: a single NaN would
    otherwise turn every z-score into NaN and the result would be empty.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to filter.
    threshold : float, default 3
        Values with ``|z| < threshold`` are kept.

    Returns
    -------
    pandas.Series
        The retained values with their original index labels. If the
        non-missing values are empty or all identical (zero spread), they are
        returned unchanged, since no z-score can be computed and nothing can
        be called an outlier.
    """
    non_missing = column.dropna()

    # Zero spread makes the z-score 0/0 = NaN, which would discard every value.
    if non_missing.empty or non_missing.min() == non_missing.max():
        return non_missing

    z_scores = np.abs(stats.zscore(non_missing))
    return non_missing[z_scores < threshold]

# Function to plot Z-score results
def plot_z_score_results(df, columns=('sales', 'onpromotion')):
    """Plot each column before and after Z-score-based outlier removal.

    One 16x10 figure is shown per column, laid out as a 2x2 grid: the top row
    holds the boxplot and histogram of the raw column, the bottom row holds
    the same two plots for the values kept by ``z_score_outlier_removal``
    (called with its default threshold).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``columns``.
    columns : iterable of str, default ('sales', 'onpromotion')
        Columns to plot. Each name is capitalised for use in the plot titles.
    """
    # Clean every column first so all filtering is done before any figure is drawn.
    cleaned_by_column = {name: z_score_outlier_removal(df[name]) for name in columns}

    for name, cleaned in cleaned_by_column.items():
        label = name.capitalize()
        # (values, histogram colour, title suffix) for the top and bottom rows
        rows = [
            (df[name], 'blue', f'{label} Before Cleaning'),
            (cleaned, 'green', f'{label} After Z-score Cleaning'),
        ]

        plt.figure(figsize=(16, 10))
        for row, (values, color, suffix) in enumerate(rows):
            plt.subplot(2, 2, 2 * row + 1)
            sns.boxplot(x=values)
            plt.title(f'Boxplot of {suffix}')

            plt.subplot(2, 2, 2 * row + 2)
            plt.hist(values, bins=50, color=color, alpha=0.7)
            plt.title(f'Histogram of {suffix}')

        plt.tight_layout()
        plt.show()

# Draw the before/after Z-score comparison figures for the merged frame `df`
plot_z_score_results(df)


In [None]:
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Function to clean data using Isolation Forest
def isolation_forest_outlier_removal(column, contamination=0.05, random_state=42):
    """Return the values of ``column`` that an Isolation Forest labels as inliers.

    The column is reshaped into a single-feature matrix, an
    ``IsolationForest`` is fitted on it, and only the rows predicted as ``1``
    (inlier) are kept.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to filter.
    contamination : float, default 0.05
        Passed to ``IsolationForest``; the expected share of outliers.
    random_state : int, default 42
        Passed to ``IsolationForest`` so repeated runs give the same result.

    Returns
    -------
    pandas.Series
        The retained values with their original index labels.
    """
    forest = IsolationForest(contamination=contamination, random_state=random_state)
    # fit_predict expects a 2-D array, hence the (n_samples, 1) reshape
    labels = forest.fit_predict(column.values.reshape(-1, 1))
    return column[labels == 1]  # Keep inliers

# Function to plot Isolation Forest results
def plot_isolation_forest_results(df, columns=('sales', 'onpromotion')):
    """Plot each column before and after Isolation Forest outlier removal.

    One 16x10 figure is shown per column, laid out as a 2x2 grid: the top row
    holds the boxplot and histogram of the raw column, the bottom row holds
    the same two plots for the values kept by
    ``isolation_forest_outlier_removal`` (re-indexed from 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``columns``.
    columns : iterable of str, default ('sales', 'onpromotion')
        Columns to plot. Each name is capitalised for use in the plot titles.
    """
    # Clean every column first so all model fitting is done before any figure is drawn.
    cleaned_by_column = {
        name: isolation_forest_outlier_removal(df[name]).reset_index(drop=True)
        for name in columns
    }

    for name, cleaned in cleaned_by_column.items():
        label = name.capitalize()
        # (values, histogram colour, title suffix) for the top and bottom rows
        rows = [
            (df[name], 'blue', f'{label} Before Cleaning'),
            (cleaned, 'green', f'{label} After Isolation Forest Cleaning'),
        ]

        plt.figure(figsize=(16, 10))
        for row, (values, color, suffix) in enumerate(rows):
            plt.subplot(2, 2, 2 * row + 1)
            sns.boxplot(x=values)
            plt.title(f'Boxplot of {suffix}')

            plt.subplot(2, 2, 2 * row + 2)
            plt.hist(values, bins=50, color=color, alpha=0.7)
            plt.title(f'Histogram of {suffix}')

        plt.tight_layout()
        plt.show()

# Draw the before/after Isolation Forest comparison figures for the merged frame `df`
plot_isolation_forest_results(df)


In [None]:
from sklearn.neighbors import LocalOutlierFactor
import matplotlib.pyplot as plt
import seaborn as sns

# Function to clean data using Local Outlier Factor with optimizations
def local_outlier_factor_removal(column, n_neighbors=1, contamination=0.05, n_jobs=-1):
    """Return the values of ``column`` that Local Outlier Factor labels as inliers.

    The column is reshaped into a single-feature matrix, a
    ``LocalOutlierFactor`` is fitted on it, and only the rows predicted as
    ``1`` (inlier) are kept.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to filter.
    n_neighbors : int, default 1
        Passed to ``LocalOutlierFactor``. The default of 1 is kept so existing
        calls behave as before.
        NOTE(review): scikit-learn's own default is 20; a single neighbour on
        a column with many repeated values may give unreliable scores —
        consider raising it if runtime allows.
    contamination : float, default 0.05
        Passed to ``LocalOutlierFactor``; the expected share of outliers.
    n_jobs : int, default -1
        Passed to ``LocalOutlierFactor``; -1 runs the neighbour search in parallel.

    Returns
    -------
    pandas.Series
        The retained values with their original index labels.
    """
    lof = LocalOutlierFactor(
        n_neighbors=n_neighbors,
        contamination=contamination,
        n_jobs=n_jobs,
    )
    # fit_predict expects a 2-D array, hence the (n_samples, 1) reshape
    labels = lof.fit_predict(column.values.reshape(-1, 1))
    return column[labels == 1]  # Keep inliers

# Function to plot LOF results
def plot_local_outlier_factor_results(df, columns=('sales', 'onpromotion')):
    """Plot each column before and after Local Outlier Factor outlier removal.

    One 16x10 figure is shown per column, laid out as a 2x2 grid: the top row
    holds the boxplot and histogram of the raw column, the bottom row holds
    the same two plots for the values kept by
    ``local_outlier_factor_removal`` (re-indexed from 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``columns``.
    columns : iterable of str, default ('sales', 'onpromotion')
        Columns to plot. Each name is capitalised for use in the plot titles.
    """
    # Clean every column first so all model fitting is done before any figure is drawn.
    cleaned_by_column = {
        name: local_outlier_factor_removal(df[name]).reset_index(drop=True)
        for name in columns
    }

    for name, cleaned in cleaned_by_column.items():
        label = name.capitalize()
        # (values, histogram colour, title suffix) for the top and bottom rows
        rows = [
            (df[name], 'blue', f'{label} Before Cleaning'),
            (cleaned, 'green', f'{label} After LOF Cleaning'),
        ]

        plt.figure(figsize=(16, 10))
        for row, (values, color, suffix) in enumerate(rows):
            plt.subplot(2, 2, 2 * row + 1)
            sns.boxplot(x=values)
            plt.title(f'Boxplot of {suffix}')

            plt.subplot(2, 2, 2 * row + 2)
            plt.hist(values, bins=50, color=color, alpha=0.7)
            plt.title(f'Histogram of {suffix}')

        plt.tight_layout()
        plt.show()

# Draw the before/after LOF comparison figures for the merged frame `df`
plot_local_outlier_factor_results(df)
