# Step 4: Feature Correlation

**4.1 Import necessary libraries**

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

**4.2  Read the dataset** 

In [5]:
# Load the feature table from Feature.csv.
df = pd.read_csv("Feature.csv")

# Exclude columns of type 'object'.
drop_colmns = ['payment_types', 'order_status', 'product_category_name']
df = df.drop(columns=drop_colmns)

# Convert the five-point scale to binary: 1 for high score group (score >= 4)
# and 0 for low score group. The comparison is vectorized rather than applied
# row by row; a missing score compares False and therefore maps to 0.
df['review_score'] = (df['review_score'] >= 4).astype('int64')

# Number of missing values per remaining column (shown as the cell output).
df.isnull().sum()

review_score                 0
is_review                    0
price                        0
freight_value                0
payment_value                0
product_photos_qty           0
approve_time                 0
seller_shipping_time         0
logistics_time               0
total_waiting_time           0
late_early_arrival_time      0
delivery_accuracy            0
sellers_count                0
products_in_order_count      0
total_orders_per_customer    0
total_orders_per_seller      0
purchase_month               0
is_weekend                   0
order_freight_ratio          0
on_time_delivery             0
distance_km                  0
same_state                   0
same_city                    0
dtype: int64

**4.3 Correlation between features and review scores, as well as inter-feature correlation** 

In [7]:
# Calculate the correlation matrix
corr_matrix = df.corr()
# Replace the diagonal (each variable's correlation with itself) with NaN so
# it is ignored by the sign filters, rankings and thresholds applied to this
# matrix. mask() builds a new frame instead of writing through
# corr_matrix.values, which is a read-only array when pandas Copy-on-Write is
# active and would make an in-place fill raise.
corr_matrix = corr_matrix.mask(np.eye(len(corr_matrix), dtype=bool))
# Get the correlation coefficients for the target variable 'review_score'
review_corr = corr_matrix['review_score']

**4.3.1 Positive correlations**

In [9]:
# List the features whose correlation with review_score is above zero,
# strongest first.
print(
    "Variables with positive correlation:",
    review_corr.loc[review_corr.gt(0)].sort_values(ascending=False),
    sep="\n",
)

Variables with positive correlation:
on_time_delivery             0.322206
late_early_arrival_time      0.237680
same_state                   0.056598
purchase_month               0.028902
same_city                    0.025441
product_photos_qty           0.013694
order_freight_ratio          0.012324
total_orders_per_customer    0.003239
price                        0.002554
Name: review_score, dtype: float64


**4.3.2 Negative correlations**

In [11]:
# List the features whose correlation with review_score is below zero,
# most negative first.
print(
    "Variables with negative correlation:",
    review_corr.loc[review_corr.lt(0)].sort_values(ascending=True),
    sep="\n",
)

Variables with negative correlation:
total_waiting_time        -0.296597
delivery_accuracy         -0.289144
logistics_time            -0.265425
is_review                 -0.240210
distance_km               -0.049640
total_orders_per_seller   -0.032055
freight_value             -0.031648
payment_value             -0.030855
seller_shipping_time      -0.023694
approve_time              -0.021186
products_in_order_count   -0.010225
sellers_count             -0.004336
is_weekend                -0.002735
Name: review_score, dtype: float64


**4.3.3 Select the top 11 variables with the strongest correlations (absolute values):**

In [13]:
# Rank the features by the magnitude of their correlation with review_score
# (sign ignored) and keep the 11 largest.
top_features = (
    review_corr
    .abs()
    .sort_values(ascending=False)
    .iloc[:11]
)
print(
    "Top variables with the strongest correlations (absolute values):",
    top_features,
    sep="\n",
)

Top variables with the strongest correlations (absolute values):
on_time_delivery           0.322206
total_waiting_time         0.296597
delivery_accuracy          0.289144
logistics_time             0.265425
is_review                  0.240210
late_early_arrival_time    0.237680
same_state                 0.056598
distance_km                0.049640
total_orders_per_seller    0.032055
freight_value              0.031648
payment_value              0.030855
Name: review_score, dtype: float64


**4.3.4 Identify highly correlated features (correlation greater than 0.9 or less than -0.9).**

In [18]:
# Collect every (row feature, column feature) pair whose correlation is
# greater than 0.9 or less than -0.9, i.e. |corr| > 0.9. NaN cells compare
# False and are never selected.
# The positions are read from the boolean mask directly instead of relying on
# DataFrame.stack() to drop the non-matching (NaN) cells, because the newer
# stack implementation keeps NaN entries and would then report every pair.
# Pairs come out in row-major order and, the matrix being symmetric, each
# pair appears in both orientations: (a, b) and (b, a).
highly_correlated_features = [
    (corr_matrix.index[row_pos], corr_matrix.columns[col_pos])
    for row_pos, col_pos in zip(*np.nonzero(corr_matrix.abs().gt(0.9).to_numpy()))
]
print("Highly correlated features:")
print(highly_correlated_features)

Highly correlated features:
[('price', 'payment_value'), ('payment_value', 'price')]
