In [45]:
# Frequent Patterns: Combinations of Features that occur frequently in the dataset

# Apriori Algorithm
# FP-Growth Algorithm
# We can realize more about relationships between different features

# Steps:
#      Pre-Process, missing values, remove irrelevant
#      Encode Dataset, each row a specific format of accident ( ex binary ) and each columnt a featuer
#      Choose sutable FP-Growth, Apriori Algorithm
#      Setting Minimum Support Threshold: define the minimum occurance
#      Extract Patterns: run algorithm, filter result that support threshold, 
#                      => output: list of frequent itemsets or patterns, combination of features occur frequently
#      Post-Processing, Analysis: analyze extracted patterns, Explore Association Rules
#      Interpret and Visulization


In [46]:
# Frequent Item se in a dataset

# Frequent Itemset = Association Rules
# A frequent itemset = A set of items that occur together frequently in dataset
# Support Count = Number of occurance that set in dataset

# Association Rule Mining Algoirhtms = such as : Apriori, FP-Growth -> output = frequent itemset
# they are iteratively generation algorithm -> output : {left-hand set} -> {candidate set}
# confidence = Number of occurance candidate / Number of occurance left-hand set
# these algorithms work by on-off of an item

# applications: in cross-selling and recommendation systems

# it’s good to put them together in stores or provide some discount offer on 
# one item on purchase of another item


In [79]:
# Defenitions:

# Support = ex 5% support means 5% of all transactions follow this rule
# Support ( A -> B ) = Support ( people who buy A and B ) / Support_count ( All Transactions )

# Confidence ( A -> B ) = Support_count ( A U B ) / Support_count ( A )
# ex Confidence ( A -> B ) = 60% means: 
#     Support_count ( people who buy A and B ) / Support_count ( people who buy A )

# If a rule satisfy both minimum support and minimum confidence its a strong rule


# Support_count ( X ) : Number of transactions that X occur 
# Support_count ( X U Y ) : Number of transactions that X and Y occur
    
# Maximal Itemset : an itemset is maximal if none of its superset are frequent

# Closed Itemset : none of its immediate supersets have same support count same as itemset


# PROS:
#     efficient
#     easy to interpret
#     can be used in wide range of application: finance, healthcare
#     handing large dataset

# CONS:
#     large number of rules
#     limited in detecting complex relationships
#     computationally expensive
#     need to define minimum support and confidence threshold
    


In [80]:
from pathlib import Path
import pandas as pd

In [81]:
# Load Dataset

abs_path = Path.cwd()
data_address = abs_path.parent.parent / 'data/' / 'Cleaned_US.csv'
df = pd.read_csv(data_address)

In [82]:
# pip install mlxtend

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


In [83]:
# One-Shot Encoding on dataset

te = TransactionEncoder()
te

TransactionEncoder()

In [84]:
df

Unnamed: 0,Severity,Start_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),City,County,State,...,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Weekday,Hour,Month
0,3,2016-02-08 00:37:08,40.108910,-83.092860,40.112060,-83.031870,3.230,Dublin,Franklin,OH,...,False,False,False,Night,Night,Night,Night,Monday,0,2
1,2,2016-02-08 05:56:20,39.865420,-84.062800,39.865010,-84.048730,0.747,Dayton,Montgomery,OH,...,False,False,False,Night,Night,Night,Night,Monday,5,2
2,2,2016-02-08 06:15:39,39.102660,-84.524680,39.102090,-84.523960,0.055,Cincinnati,Hamilton,OH,...,False,False,False,Night,Night,Night,Day,Monday,6,2
3,2,2016-02-08 06:51:45,41.062130,-81.537840,41.062170,-81.535470,0.123,Akron,Summit,OH,...,False,False,False,Night,Night,Day,Day,Monday,6,2
4,3,2016-02-08 07:53:43,39.172393,-84.492792,39.170476,-84.501798,0.500,Cincinnati,Hamilton,OH,...,False,False,False,Day,Day,Day,Day,Monday,7,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2774570,2,2021-07-19 16:15:41,25.953810,-80.205651,25.942621,-80.205285,0.773,Miami,Miami-Dade,FL,...,False,False,False,Day,Day,Day,Day,Monday,16,7
2774571,2,2021-12-16 21:54:00,38.426953,-120.588087,38.426690,-120.587706,0.027,Pioneer,Amador,CA,...,False,False,False,Night,Night,Night,Night,Thursday,21,12
2774572,2,2021-12-12 16:12:00,39.750573,-84.191238,39.749632,-84.190979,0.066,Dayton,Montgomery,OH,...,False,False,False,Day,Day,Day,Day,Sunday,16,12
2774573,2,2021-05-01 11:28:00,39.754339,-84.192260,39.753289,-84.191977,0.074,Dayton,Montgomery,OH,...,False,False,False,Day,Day,Day,Day,Saturday,11,5


In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774575 entries, 0 to 2774574
Data columns (total 39 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Severity               int64  
 1   Start_Time             object 
 2   Start_Lat              float64
 3   Start_Lng              float64
 4   End_Lat                float64
 5   End_Lng                float64
 6   Distance(mi)           float64
 7   City                   object 
 8   County                 object 
 9   State                  object 
 10  Temperature(F)         float64
 11  Wind_Chill(F)          float64
 12  Humidity(%)            float64
 13  Pressure(in)           float64
 14  Visibility(mi)         float64
 15  Wind_Direction         object 
 16  Wind_Speed(mph)        float64
 17  Precipitation(in)      float64
 18  Weather_Condition      object 
 19  Amenity                bool   
 20  Bump                   bool   
 21  Crossing               bool   
 22  Give_Way          

In [86]:
df = df[['Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 
       'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 
       'Traffic_Signal', 'Turning_Loop']]


In [87]:
df

Unnamed: 0,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2774570,False,False,False,False,False,False,False,False,True,False,False,False,False
2774571,False,False,False,False,False,False,False,False,False,False,False,False,False
2774572,False,False,False,False,False,False,False,False,False,False,False,False,False
2774573,False,False,False,False,False,False,False,False,False,False,False,False,False


In [88]:
df

Unnamed: 0,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2774570,False,False,False,False,False,False,False,False,True,False,False,False,False
2774571,False,False,False,False,False,False,False,False,False,False,False,False,False
2774572,False,False,False,False,False,False,False,False,False,False,False,False,False
2774573,False,False,False,False,False,False,False,False,False,False,False,False,False


In [100]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Convert boolean columns to integers
bool_columns = ['Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
                'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming',
                'Traffic_Signal', 'Turning_Loop']
df[bool_columns] = df[bool_columns].astype(int)


# Apply Apriori algorithm
frequent_itemsets = apriori(df, min_support=0.02, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Show frequent itemsets
print("Frequent Itemsets:")
print(frequent_itemsets)

# Show association rules
print("\nAssociation Rules:")
print(rules)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Frequent Itemsets:
    support                    itemsets
0  0.071072                  (Crossing)
1  0.102037                  (Junction)
2  0.024013                   (Station)
3  0.093829            (Traffic_Signal)
4  0.038292  (Crossing, Traffic_Signal)

Association Rules:
        antecedents       consequents  antecedent support  consequent support  \
0        (Crossing)  (Traffic_Signal)            0.071072            0.093829   
1  (Traffic_Signal)        (Crossing)            0.093829            0.071072   

    support  confidence      lift  leverage  conviction  zhangs_metric  
0  0.038292    0.538784  5.742183  0.031624    1.964743       0.889036  
1  0.038292    0.408107  5.742183  0.031624    1.569420       0.911363  


In [101]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.071072,(Crossing)
1,0.102037,(Junction)
2,0.024013,(Station)
3,0.093829,(Traffic_Signal)
4,0.038292,"(Crossing, Traffic_Signal)"


In [91]:
df

Unnamed: 0,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop
0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2774570,0,0,0,0,0,0,0,0,1,0,0,0,0
2774571,0,0,0,0,0,0,0,0,0,0,0,0,0
2774572,0,0,0,0,0,0,0,0,0,0,0,0,0
2774573,0,0,0,0,0,0,0,0,0,0,0,0,0
