In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Read each raw CSV from the shared data directory into its own DataFrame.
# The paths are consumed in the listed order, and every table is bound to a
# module-level name so the following steps can refer to it directly.
(
    aisles_df,
    departments_df,
    order_products_prior_df,
    order_products_train_df,
    orders_df,
    products_df,
    sample_submission_df,
) = map(
    pd.read_csv,
    (
        '../../CSVs/aisles.csv',
        '../../CSVs/departments.csv',
        '../../CSVs/order_products__prior.csv',
        '../../CSVs/order_products__train.csv',
        '../../CSVs/orders.csv',
        '../../CSVs/products.csv',
        '../../CSVs/sample_submission.csv',
    ),
)

# One encoder instance is shared and re-fitted per table, so once this section
# has run it holds the categories learned from the departments table.
encoder = OneHotEncoder()

# Aisles: expand the aisle name into a dense indicator matrix and label the
# columns with the feature names generated by the encoder.
aisle_names = aisles_df[['aisle']]
aisle_encoded = encoder.fit_transform(aisle_names).toarray()
aisle_feature_names = encoder.get_feature_names_out(['aisle'])
aisles_df_encoded = pd.DataFrame(data=aisle_encoded, columns=aisle_feature_names)

# Departments: same treatment, re-fitting the shared encoder on department names.
department_names = departments_df[['department']]
department_encoded = encoder.fit_transform(department_names).toarray()
department_feature_names = encoder.get_feature_names_out(['department'])
departments_df_encoded = pd.DataFrame(data=department_encoded, columns=department_feature_names)

# Key each indicator table by its id column so it can be looked up from products_df.
# NOTE(review): relies on products_df carrying aisle_id / department_id columns - confirm.
aisle_lookup = aisles_df_encoded.set_index(aisles_df['aisle_id'])
department_lookup = departments_df_encoded.set_index(departments_df['department_id'])

# Attach the aisle indicators first, then the department indicators, to every product row.
products_df = (
    products_df
    .join(aisle_lookup, on='aisle_id')
    .join(department_lookup, on='department_id')
)

# Per-product reorder rate for the prior and train order sets.
# The rate is the mean of the `reordered` column over all order lines of a product
# (assumes `reordered` is a 0/1 flag and `product_id` exists in both frames - confirm).
product_reorder_prior = order_products_prior_df.groupby('product_id')['reordered'].mean().to_frame('reorder_rate_prior')
product_reorder_train = order_products_train_df.groupby('product_id')['reordered'].mean().to_frame('reorder_rate_train')

# Left-merge the rates so every row of products_df is kept, including products
# that have no order lines in one of the sets.
products_df = products_df.merge(product_reorder_prior, on='product_id', how='left')
products_df = products_df.merge(product_reorder_train, on='product_id', how='left')

# A NaN rate here means the product does not appear in that order set at all
# (products that appear but were never reordered already have a rate of 0).
# Such products are given a rate of 0.
# The filled column is assigned back rather than using fillna(inplace=True) on
# products_df[col]: that form is a chained assignment, which warns on recent
# pandas versions and does not modify products_df when Copy-on-Write is enabled.
products_df['reorder_rate_prior'] = products_df['reorder_rate_prior'].fillna(0)
products_df['reorder_rate_train'] = products_df['reorder_rate_train'].fillna(0)

# Write the two indicator tables and the enriched products table back to the
# data directory, omitting the DataFrame index from each file.
for frame, destination in (
    (aisles_df_encoded, '../../CSVs/modified_aisles.csv'),
    (departments_df_encoded, '../../CSVs/modified_departments.csv'),
    (products_df, '../../CSVs/modified_products.csv'),
):
    frame.to_csv(destination, index=False)

# Signal that every output file has been written.
print("Feature extraction completed and data saved.")


Feature extraction completed and data saved.
