In [1]:
import pandas as pd

# File paths
input_file = 'mooc_dataset/crop_yield.csv'  # Path to the original CSV file
output_file = 'mooc_dataset/output_truncated.csv'  # Path to save the truncated CSV file

# Load the CSV file
df = pd.read_csv(input_file)

# Truncate to the first 100,000 rows
df_truncated = df.head(10000)

# Save the truncated file
df_truncated.to_csv(output_file, index=False)

print(f"Truncated CSV file saved to {output_file}")

Truncated CSV file saved to mooc_dataset/output_truncated.csv


In [2]:
import pandas as pd

data = pd.read_csv('mooc_dataset/crop_yield.csv')

data = data.head(500000)
# Binarize numerical columns into categories
data['Rainfall_Category'] = pd.cut(
    data['Rainfall_mm'], bins=[0, 500, 1000, float('inf')], labels=['Low Rainfall', 'Moderate Rainfall', 'High Rainfall']
)
data['Temperature_Category'] = pd.cut(
    data['Temperature_Celsius'], bins=[-float('inf'), 20, 30, float('inf')], labels=['Low Temp', 'Moderate Temp', 'High Temp']
)
data['Harvest_Time_Category'] = pd.cut(
    data['Days_to_Harvest'], bins=[0, 100, 150, float('inf')], labels=['Short Harvest', 'Medium Harvest', 'Long Harvest']
)
data['Yield_Category'] = pd.cut(data['Yield_tons_per_hectare'], bins=[0, 2, 5, 8, 15], labels=['Low', 'Medium', 'High', 'Very High'], include_lowest=True)
# Drop original numerical columns as they are no longer needed for transactional encoding
data_preprocessed = data.drop(columns=['Rainfall_mm', 'Temperature_Celsius', 'Days_to_Harvest', 'Yield_tons_per_hectare'])

# Display the first few rows of the preprocessed dataset
data_preprocessed.head()


Unnamed: 0,Region,Soil_Type,Crop,Fertilizer_Used,Irrigation_Used,Weather_Condition,Rainfall_Category,Temperature_Category,Harvest_Time_Category,Yield_Category
0,West,Sandy,Cotton,False,True,Cloudy,Moderate Rainfall,Moderate Temp,Medium Harvest,High
1,South,Clay,Rice,True,True,Rainy,Moderate Rainfall,Low Temp,Medium Harvest,Very High
2,North,Loam,Barley,False,False,Sunny,Low Rainfall,Moderate Temp,Medium Harvest,Low
3,North,Sandy,Soybean,False,True,Rainy,Moderate Rainfall,Low Temp,Medium Harvest,High
4,South,Silt,Wheat,True,True,Cloudy,Moderate Rainfall,High Temp,Medium Harvest,High


In [3]:
# Select categorical columns for one-hot encoding
categorical_columns = ['Region', 'Soil_Type', 'Crop', 'Fertilizer_Used', 'Irrigation_Used', 'Weather_Condition', 'Rainfall_Category', 'Temperature_Category', 'Yield_Category']

# Perform one-hot encoding
one_hot_encoded = pd.get_dummies(data[categorical_columns], drop_first=False)

# Display the first few rows of the one-hot encoded dataset
one_hot_encoded.head()

Unnamed: 0,Fertilizer_Used,Irrigation_Used,Region_East,Region_North,Region_South,Region_West,Soil_Type_Chalky,Soil_Type_Clay,Soil_Type_Loam,Soil_Type_Peaty,...,Rainfall_Category_Low Rainfall,Rainfall_Category_Moderate Rainfall,Rainfall_Category_High Rainfall,Temperature_Category_Low Temp,Temperature_Category_Moderate Temp,Temperature_Category_High Temp,Yield_Category_Low,Yield_Category_Medium,Yield_Category_High,Yield_Category_Very High
0,False,True,False,False,False,True,False,False,False,False,...,False,True,False,False,True,False,False,False,True,False
1,True,True,False,False,True,False,False,True,False,False,...,False,True,False,True,False,False,False,False,False,True
2,False,False,False,True,False,False,False,False,True,False,...,True,False,False,False,True,False,True,False,False,False
3,False,True,False,True,False,False,False,False,False,False,...,False,True,False,True,False,False,False,False,True,False
4,True,True,False,False,True,False,False,False,False,False,...,False,True,False,False,False,True,False,False,True,False


In [4]:
# Convert the one-hot encoded DataFrame into a binary transactional format
transactions = one_hot_encoded.applymap(lambda x: 1 if x else 0)

# Display the first few rows of the transactional dataset
transactions.head()

  transactions = one_hot_encoded.applymap(lambda x: 1 if x else 0)


Unnamed: 0,Fertilizer_Used,Irrigation_Used,Region_East,Region_North,Region_South,Region_West,Soil_Type_Chalky,Soil_Type_Clay,Soil_Type_Loam,Soil_Type_Peaty,...,Rainfall_Category_Low Rainfall,Rainfall_Category_Moderate Rainfall,Rainfall_Category_High Rainfall,Temperature_Category_Low Temp,Temperature_Category_Moderate Temp,Temperature_Category_High Temp,Yield_Category_Low,Yield_Category_Medium,Yield_Category_High,Yield_Category_Very High
0,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
1,1,1,0,0,1,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,1,0,...,1,0,0,0,1,0,1,0,0,0
3,0,1,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
4,1,1,0,0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0


In [5]:
transactions.shape

(500000, 31)

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

# Step 1: Apply the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(transactions, min_support=0.0675, use_colnames=True)

frequent_itemsets



In [None]:
# Step 2: Extract association rules from the frequent itemsets
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8, num_itemsets=len(frequent_itemsets))

# Display the association rules
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Yield_Category_High),(Rainfall_Category_Moderate Rainfall),0.40344,0.555344,0.356188,0.882877,1.589784,1.0,0.13214,3.796496,0.621872,0.591089,0.736599,0.76213
1,"(Fertilizer_Used, Yield_Category_Medium)",(Rainfall_Category_Low Rainfall),0.2029,0.444656,0.173552,0.855357,1.923638,1.0,0.083331,3.83942,0.602373,0.36614,0.739544,0.622832
2,"(Fertilizer_Used, Yield_Category_High)",(Rainfall_Category_Moderate Rainfall),0.273718,0.555344,0.227348,0.830592,1.495635,1.0,0.07534,2.624765,0.45628,0.377834,0.619014,0.619987
3,"(Fertilizer_Used, Rainfall_Category_Moderate R...",(Yield_Category_High),0.276824,0.40344,0.227348,0.821273,2.035675,1.0,0.115666,3.337823,0.703511,0.501965,0.700404,0.692398
4,"(Yield_Category_High, Irrigation_Used)",(Rainfall_Category_Moderate Rainfall),0.257628,0.555344,0.213522,0.8288,1.492408,1.0,0.07045,2.597285,0.444443,0.356197,0.614983,0.606643
5,"(Yield_Category_High, Region_East)",(Rainfall_Category_Moderate Rainfall),0.100682,0.555344,0.088596,0.879959,1.584529,1.0,0.032683,3.704191,0.410197,0.156136,0.730036,0.519746
6,"(Yield_Category_High, Region_North)",(Rainfall_Category_Moderate Rainfall),0.10146,0.555344,0.089556,0.882673,1.589417,1.0,0.033211,3.789886,0.412712,0.157878,0.73614,0.521968
7,"(Region_South, Yield_Category_High)",(Rainfall_Category_Moderate Rainfall),0.100102,0.555344,0.088594,0.885037,1.593674,1.0,0.033003,3.867827,0.413957,0.156291,0.741457,0.522284
8,"(Yield_Category_High, Region_West)",(Rainfall_Category_Moderate Rainfall),0.101196,0.555344,0.089442,0.883849,1.591535,1.0,0.033243,3.828263,0.413522,0.157719,0.738785,0.522453
9,"(Weather_Condition_Cloudy, Yield_Category_High)",(Rainfall_Category_Moderate Rainfall),0.13432,0.555344,0.118828,0.884663,1.593001,1.0,0.044234,3.855293,0.430013,0.208165,0.740616,0.549318


In [24]:
def get_union_of_matching_consequents(antecedents, rules_df, top_n=5):
    """
    Returns a union of unique consequent items from the rules that best match the given antecedents.

    Parameters:
    - antecedents (list): A list of antecedents to match.
    - rules_df (pd.DataFrame): Dataframe containing the rules.
    - top_n (int): Number of top rules to consider based on confidence.

    Returns:
    - list: Union of unique consequent items.
    """
    # Convert the input antecedents to a frozenset
    antecedents_set = frozenset(antecedents)

    # Filter rules where the antecedents in the rule are a superset of the input antecedents
    rules_df['antecedents_set'] = rules_df['antecedents'].apply(lambda x: frozenset(eval(x)))
    matched_rules = rules_df[rules_df['antecedents_set'].apply(lambda x: antecedents_set.issubset(x))]

    # If no matches, return an empty list
    if matched_rules.empty:
        return []

    # Drop duplicate consequents, keeping the highest confidence for each unique consequent
    matched_rules['consequents_set'] = matched_rules['consequents'].apply(lambda x: frozenset(eval(x)))
    unique_rules = matched_rules.sort_values(by='confidence', ascending=False).drop_duplicates(subset='consequents_set')

    # Truncate to the top N rules by confidence
    top_rules = unique_rules.head(top_n)

    # Perform a union of all unique consequent sets
    consequents_union = set()
    for consequents_set in top_rules['consequents_set']:
        consequents_union.update(consequents_set)

    return list(consequents_union)


In [25]:
rules = pd.read_csv('rules/haithem_rules.csv')
# Test the function with sample antecedents
sample_antecedents = ['Soil_Type_Peaty', 'Yield_Category_Very High']
unique_consequents = get_union_of_matching_consequents(sample_antecedents, rules)

# Show the results to the user
unique_consequents


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched_rules['consequents_set'] = matched_rules['consequents'].apply(lambda x: frozenset(eval(x)))


['Rainfall_Category_Very High', 'Fertilizer_Used', 'Irrigation_Used']