In [None]:
import joblib
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


# ---------------------------------------------------------------------------
# Load the raw CSV exports (paths are relative to the working directory).
# ---------------------------------------------------------------------------
products_df = pd.read_csv('olist_products_dataset.csv')
reviews_df = pd.read_csv('olist_order_reviews_dataset.csv')
sellers_df = pd.read_csv('olist_sellers_dataset.csv')
category_translation_df = pd.read_csv('product_category_name_translation.csv')
orders_df = pd.read_csv('olist_orders_dataset.csv')
order_items_df = pd.read_csv('olist_order_items_dataset.csv')
order_payments_df = pd.read_csv('olist_order_payments_dataset.csv')

# ---------------------------------------------------------------------------
# Join everything into one item-level frame.
# ---------------------------------------------------------------------------
# Products + English category names (left join keeps products whose category
# has no translation row).
merged_products_df = pd.merge(
    products_df, category_translation_df, on='product_category_name', how='left'
)

# Order items + order header + payments, all keyed on order_id.
# NOTE(review): a left join emits one output row per matching right-hand row,
# so if an order has several payment (or, below, review) records, each item
# row is repeated once per record. Confirm this duplication is intended.
merged_order_info_df = pd.merge(order_items_df, orders_df, on='order_id', how='left')
merged_order_info_df = pd.merge(
    merged_order_info_df, order_payments_df, on='order_id', how='left'
)

# Attach product attributes, then the seller's state and the review score.
merged_df = pd.merge(merged_order_info_df, merged_products_df, on='product_id', how='left')
merged_df = pd.merge(
    merged_df, sellers_df[['seller_id', 'seller_state']], on='seller_id', how='left'
)
merged_df = pd.merge(
    merged_df, reviews_df[['order_id', 'review_score']], on='order_id', how='left'
)

# Convert 'order_purchase_timestamp' to datetime format
merged_df['order_purchase_timestamp'] = pd.to_datetime(merged_df['order_purchase_timestamp'])

# ---------------------------------------------------------------------------
# Keep only purchases made in the Black Friday windows (both bounds inclusive).
# ---------------------------------------------------------------------------
start_date_2016 = '2016-11-18'  # Start date of the week before Black Friday 2016
end_date_2016 = '2016-11-25'    # End date of Black Friday 2016
start_date_2017 = '2017-11-17'  # Start date of the week before Black Friday 2017
end_date_2017 = '2017-11-24'    # End date of Black Friday 2017

black_friday_windows = [
    (start_date_2016, end_date_2016),
    (start_date_2017, end_date_2017),
]

# Truncate to midnight so the comparison is on calendar days: a purchase at
# any time on the end date is still inside the window.
purchase_date = merged_df['order_purchase_timestamp'].dt.normalize()

in_black_friday_window = pd.Series(False, index=merged_df.index)
for window_start, window_end in black_friday_windows:
    in_black_friday_window |= purchase_date.between(
        pd.Timestamp(window_start), pd.Timestamp(window_end)
    )

# Explicit copy: the result is modified by later steps, and a bare boolean-mask
# slice of merged_df would raise SettingWithCopyWarning when written to.
black_friday_deals_within_range_df = merged_df[in_black_friday_window].copy()

# Display first few rows of the filtered dataframe
print("Filtered Black Friday Deals Dataset within the specified range:")
print(black_friday_deals_within_range_df.head())


In [None]:
# ---------------------------------------------------------------------------
# Data cleaning and preprocessing
# ---------------------------------------------------------------------------
# Build the modelling frame as an independent copy of the Black Friday slice,
# with incomplete rows removed. Working on a copy (instead of aliasing the
# slice and mutating it in place) avoids SettingWithCopyWarning and leaves
# black_friday_deals_within_range_df untouched.
# NOTE(review): dropna() discards a row when ANY column is missing, including
# columns that are not used as model features. Confirm that is intended.
final_df = black_friday_deals_within_range_df.dropna().copy()

# Make sure the purchase timestamp is datetime (no-op if it already is).
final_df['order_purchase_timestamp'] = pd.to_datetime(final_df['order_purchase_timestamp'])

# Extract year, month, day, hour, and minute components
purchase_ts = final_df['order_purchase_timestamp']
final_df['purchase_year'] = purchase_ts.dt.year
final_df['purchase_month'] = purchase_ts.dt.month
final_df['purchase_day'] = purchase_ts.dt.day
final_df['purchase_hour'] = purchase_ts.dt.hour
final_df['purchase_minute'] = purchase_ts.dt.minute

# Features used to predict the item price (the target below is `price`).
selected_columns = [
    'product_category_name_english',
    'purchase_year',
    'purchase_month',
    'purchase_day',
    'review_score',
    'seller_state'
]

# Create feature matrix (X) and target variable (y). X is copied because
# encoded columns are added to it below.
X = final_df[selected_columns].copy()
y = final_df['price']

# Encode each categorical feature with its OWN encoder and keep the fitted
# encoders, keyed by source column. Re-fitting one shared encoder would
# overwrite the first column's mapping, making it impossible to encode new
# data consistently at prediction time.
categorical_to_encoded = {
    'product_category_name_english': 'product_category_encoded',
    'seller_state': 'seller_state_encoded',
}
label_encoders = {}
for source_column, encoded_column in categorical_to_encoded.items():
    column_encoder = LabelEncoder()
    X[encoded_column] = column_encoder.fit_transform(X[source_column])
    label_encoders[source_column] = column_encoder

# Backward compatibility: this name previously ended up bound to the encoder
# fitted on 'seller_state'.
label_encoder = label_encoders['seller_state']

# Drop original categorical features
X = X.drop(columns=list(categorical_to_encoded))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------------------------------------------------------------
# Model development
# ---------------------------------------------------------------------------
# Fixed seed (same as the split) so the reported metrics are reproducible.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model's performance on the testing dataset
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)


In [None]:
# Persist the fitted estimator to disk with joblib so it can be reloaded
# later (joblib.load) without retraining.
joblib.dump(value=model, filename='trained_model.pkl')