In [1]:
import pandas as pd
import re
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import squarify
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso



In [2]:
# Profiling helper: per-column statistics and an overview of the data, used to
# understand a freshly loaded DataFrame.
def summarize_dataframe(df):
    """Build a one-row-per-column profile of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to profile.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with these fields:
        'Attribute', 'Data Type', 'Row', 'Unique Values', 'Missing Values',
        'Range (Min - Max)', 'Mean', 'Medium' (the median), 'STD',
        'Lower Outliers Count', 'Upper Outliers Count' and 'Values'
        (up to five of the most frequent values).

    Notes
    -----
    * Statistics are rounded to the nearest integer and rendered as text with
      the ``.10g`` format, so large numbers are not shown in scientific
      notation.
    * Statistics are ``None`` for non-numeric columns and whenever the
      underlying value is undefined (e.g. an all-NaN column, or the standard
      deviation of a single observation). Calling ``round`` on such a NaN
      would raise ``ValueError`` and abort the whole summary.
    * Outliers are counted with the 1.5 * IQR rule.
    * The 'Medium' label is kept as-is because code reading the summary may
      rely on that column name.
    """

    def _rounded_text(value):
        # NaN cannot be rounded to an integer; report it as "no value".
        if pd.isna(value):
            return None
        return f"{round(value):.10g}"

    summary_data = []
    row_count = len(df)

    for column in df.columns:
        series = df[column]
        value_counts = series.value_counts()

        # Defaults used for non-numeric columns.
        range_value = mean_value = median_value = std_value = None
        count_lower_outliers = count_upper_outliers = None

        if pd.api.types.is_numeric_dtype(series):
            min_text = _rounded_text(series.min())
            max_text = _rounded_text(series.max())
            if min_text is not None and max_text is not None:
                range_value = f"{min_text} - {max_text}"

            mean_value = _rounded_text(series.mean())
            median_value = _rounded_text(series.median())
            std_value = _rounded_text(series.std())

            # IQR-based outlier fences.
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            count_lower_outliers = int((series < lower_bound).sum())
            count_upper_outliers = int((series > upper_bound).sum())

        summary_data.append({
            'Attribute': column,
            'Data Type': series.dtype,
            'Row': row_count,
            'Unique Values': series.nunique(),
            'Missing Values': series.isnull().sum(),
            'Range (Min - Max)': range_value,
            'Mean': mean_value,
            'Medium': median_value,
            'STD': std_value,
            'Lower Outliers Count': count_lower_outliers,
            'Upper Outliers Count': count_upper_outliers,
            # Sample values: up to five, most frequent first.
            'Values': value_counts.index.tolist()[:5],
        })

    return pd.DataFrame(summary_data)

# Referential-consistency check between two tables: returns the inconsistent records.
def get_non_matching_records(df1, attrA, df2, attrB):
    """Return the rows of ``df1`` whose ``attrA`` value never occurs in ``df2[attrB]``.

    The result keeps the original index and columns of ``df1``.
    """
    has_match = df1[attrA].isin(df2[attrB])
    return df1.loc[~has_match]

#หาpattern ข้อมูลค่าใน Attr และหา Error
def detect_patterns(df, column):
    # เก็บรูปแบบที่พบในคอลัมน์
    patterns = []

    for value in df[column].dropna():
        # แปลงค่าทั้งหมดให้เป็นสตริง
        value_str = str(value)
        
        # หารูปแบบทั่วไปโดยการแทนที่ตัวเลขด้วย "D" และตัวอักษรด้วย "A"
        pattern = re.sub(r'\d', 'D', value_str)
        pattern = re.sub(r'[a-zA-Z]', 'A', pattern)
        pattern = re.sub(r'\s+', ' ', pattern)  # ลบช่องว่างที่เกินมา
        
        patterns.append(pattern)
    
    # นับความถี่ของแต่ละรูปแบบ
    pattern_counts = Counter(patterns)
    
    # สร้าง DataFrame ที่มีคอลัมน์ 'patterns' และ 'count'
    pattern_df = pd.DataFrame(pattern_counts.items(), columns=['patterns', 'count'])
    
    return pattern_df


#คืน records ที่มีต่าตรกับ Pattern ที่ใส่เข้าไป
def filter_by_pattern(df, column, pattern):
    # สร้างฟังก์ชันภายในเพื่อแปลงค่าของคอลัมน์เป็นรูปแบบเดียวกันกับ detect_patterns
    def revert_pattern(value):
        value_str = str(value)
        revert_pattern = re.sub(r'\d', 'D', value_str)
        revert_pattern = re.sub(r'[a-zA-Z]', 'A', revert_pattern)
        revert_pattern = re.sub(r'\s+', ' ', revert_pattern)
        return revert_pattern
    
    # กรอง records ที่มี pattern ตรงกับที่กำหนด
    matching_records = df[df[column].apply(revert_pattern) == pattern]
    
    return matching_records


def convert_time_of_transaction(df, column):
    """Rewrite numeric HHMM values in ``df[column]`` as "HH:MM" strings.

    For example 1100 becomes "11:00" and 17 becomes "00:17". Missing values
    are left untouched; converting them with ``int`` would raise ValueError.

    The column is replaced in place on ``df`` and the same ``df`` object is
    returned. The function expects numeric input, so it should not be run
    twice on the same column.
    """
    def _format_hhmm(value):
        if pd.isna(value):
            return value
        return f"{int(value // 100):02}:{int(value % 100):02}"

    df[column] = df[column].apply(_format_hhmm)
    return df

# Data Collection

In [3]:
# Load the raw tables from CSV files in the working directory.
causal = pd.read_csv('dh_causal_lookup.csv')
store = pd.read_csv('dh_store_lookup.csv')
transactions = pd.read_csv('dh_transactions.csv')

# Product lookup table; this file carries the brand_commodity column used by the merge cell.
product = pd.read_csv('222.csv')

In [4]:
# Preview the product lookup table (the notebook truncates the display of long frames).
product

Unnamed: 0,upc,product_description,commodity,brand,product_size,brand_commodity,brand_commodity_unique,brand_commodity_size,brand_commodity_size_unique
0,111112360,VINCENT S ORIG MARINARA S,pasta sauce,Vincent's,25 OZ,Vincent's pasta sauce,brand_commodity,Vincent's pasta sauce 25 OZ,brand_commodity_size
1,566300023,PINE MOUNTAIN SYRUP,syrups,Pine Mountain,40 OZ,Pine Mountain syrups,Vincent's pasta sauce,Pine Mountain syrups 40 OZ,Vincent's pasta sauce 25 OZ
2,566300028,MILLER CANE SYRUP,syrups,Miller,19 OZ,Miller syrups,Pine Mountain syrups,Miller syrups 19 OZ,Pine Mountain syrups 40 OZ
3,566300029,MILLER CANE SYRUP,syrups,Miller,12 OZ,Miller syrups,Miller syrups,Miller syrups 12 OZ,Miller syrups 19 OZ
4,566300035,PINE MOUNTAIN SYRUP,syrups,Pine Mountain,19 OZ,Pine Mountain syrups,Barilla pasta sauce,Pine Mountain syrups 19 OZ,Miller syrups 12 OZ
...,...,...,...,...,...,...,...,...,...
922,9999985217,PRIVATE LABEL ALPHABETS,pasta,Private Label,16 OZ,Private Label pasta,,Private Label pasta 16 OZ,
923,9999985260,PRIVATE LABEL COMPLETE PANCAKE MIX,pancake mixes,Private Label,32 OZ,Private Label pancake mixes,,Private Label pancake mixes 32 OZ,
924,9999985261,PRIVATE LABEL COMPLETE PANCAKE MIX,pancake mixes,Private Label,2 LB,Private Label pancake mixes,,Private Label pancake mixes 2 LB,
925,9999985488,PRIVATE LABEL ITAL NESTED ANGEL HAIR,pasta,Private Label Premium,16 OZ,Private Label Premium pasta,,Private Label Premium pasta 16 OZ,


In [5]:
# Attach the brand/commodity label of each product to every transaction row.
# A left join keeps every transaction, including those whose UPC is absent from
# the product table (their brand_commodity is then missing).
transactions_m = pd.merge(
    transactions,
    product.loc[:, ['upc', 'brand_commodity']],
    how='left',
    on='upc',
)
transactions_m

Unnamed: 0,upc,dollar_sales,units,time_of_transaction,geography,week,household,store,basket,day,coupon,brand_commodity
0,7680850106,0.80,1,1100,2,1,125434,244,1,1,0,Barilla pasta
1,3620000470,3.59,1,1100,2,1,125434,244,1,1,0,Bertolli pasta sauce
2,1800028064,2.25,1,1137,2,1,108320,244,2,1,0,Hungry Jack pancake mixes
3,9999985067,0.85,1,1148,2,1,162016,244,3,1,0,Private Label pasta
4,9999985131,2.19,1,1323,2,1,89437,244,4,1,0,Private Label Premium pasta
...,...,...,...,...,...,...,...,...,...,...,...,...
5197676,9999985001,0.39,1,2354,1,104,435465,199,3316346,728,0,Private Label pasta
5197677,9999966720,1.05,1,2354,1,104,435465,199,3316346,728,0,Private Label pasta sauce
5197678,9999985027,0.99,1,2311,1,104,352690,218,3316347,728,0,Private Label pasta
5197679,3620000300,1.53,1,2245,1,104,55530,93,3316348,728,0,Ragu pasta sauce


In [6]:
# Profile each raw table with summarize_dataframe (one summary frame per table).
causal_summarize = causal.pipe(summarize_dataframe)
product_summarize = product.pipe(summarize_dataframe)
store_summarize = store.pipe(summarize_dataframe)
transactions_summarize = transactions.pipe(summarize_dataframe)

In [7]:
# Uncomment one of the lines below to display the corresponding summary table.
#causal_summarize
#product_summarize
#store_summarize
#transactions_summarize

In [8]:
# Restrict the analysis to the transactions recorded at store 270.
transactions_m = transactions_m.loc[transactions_m["store"].eq(270)]
transactions_m

Unnamed: 0,upc,dollar_sales,units,time_of_transaction,geography,week,household,store,basket,day,coupon,brand_commodity
999,7680851917,0.80,1,17,2,1,389021,270,642,1,0,Barilla pasta
1000,1510000163,3.57,3,918,2,1,125111,270,643,1,0,Creamette pasta
1001,3620001397,2.50,1,918,2,1,125111,270,643,1,0,Ragu pasta sauce
1002,3620000050,1.49,1,1025,2,1,420298,270,644,1,0,Ragu pasta sauce
1003,9999985054,0.75,1,1109,2,1,144820,270,645,1,0,Private Label Premium pasta
...,...,...,...,...,...,...,...,...,...,...,...,...
5181352,5100001214,1.67,1,1859,2,104,439587,270,3306316,728,0,Prego pasta sauce
5181353,1510000043,0.80,1,2055,2,104,125416,270,3306317,728,0,Creamette pasta
5181354,9999985070,0.67,1,2215,2,104,142793,270,3306318,728,0,Private Label pasta
5188742,9999985020,0.50,1,2239,2,104,414440,270,3310882,728,0,Private Label pasta


# Feature Engineering

In [9]:
# Keep only what market-basket analysis needs: the basket id and the item label.
# Rows without a brand_commodity (UPCs with no match in the left merge) are
# dropped before the cast; otherwise astype('str') would turn the missing value
# into the literal text 'nan', which would then be treated as a real product.
transactions_bu = (
    transactions_m[["basket", "brand_commodity"]]
    .dropna(subset=["brand_commodity"])
    .astype('str')
)
transactions_bu

Unnamed: 0,basket,brand_commodity
999,642,Barilla pasta
1000,643,Creamette pasta
1001,643,Ragu pasta sauce
1002,644,Ragu pasta sauce
1003,645,Private Label Premium pasta
...,...,...
5181352,3306316,Prego pasta sauce
5181353,3306317,Creamette pasta
5181354,3306318,Private Label pasta
5188742,3310882,Private Label pasta


# Model

In [13]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Market-basket analysis: mine association rules between brand/commodity labels
# that are bought together in the same basket.
df = transactions_bu

# Reshape the data into transactions: one list of purchased items per basket.
grouped_data = df.groupby('basket')['brand_commodity'].apply(list)

# Use TransactionEncoder to one-hot encode the baskets (one column per item) for Apriori.
te = TransactionEncoder()
te_ary = te.fit(grouped_data).transform(grouped_data)
df_trans = pd.DataFrame(te_ary, columns=te.columns_)

# Build the frequent itemsets with the Apriori algorithm (minimum support 0.01).
frequent_itemsets = apriori(df_trans, min_support=0.01, use_colnames=True)

# Derive association rules (with support, confidence and lift), keeping rules with lift of at least 1.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Ragu pasta sauce),(Creamette pasta),0.158766,0.065963,0.01201,0.075645,1.146774,0.001537,1.010474
1,(Creamette pasta),(Ragu pasta sauce),0.065963,0.158766,0.01201,0.182069,1.146774,0.001537,1.02849
2,(Private Label pasta),(Private Label pasta sauce),0.265035,0.033709,0.0111,0.041881,1.24242,0.002166,1.008529
3,(Private Label pasta sauce),(Private Label pasta),0.033709,0.265035,0.0111,0.329285,1.24242,0.002166,1.095793
