# Decision Trees and Random Forests 

# Task 
Objective: Use decision trees and random forests to classify each sale as highly profitable or not, based on selected features.

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Source of the retail sales CSV; kept in a named constant so the location is easy to spot and change.
DATA_URL = "https://raw.githubusercontent.com/Data-Navigators/Statistical_Concept_Excercise/main/data/Retail_sales_dataset.csv"

# Download the CSV and parse it into a DataFrame.
df = pd.read_csv(DATA_URL)

In [4]:
# Parse the 'Date' column into pandas datetimes.
df['Date'] = pd.to_datetime(df['Date'])

# Check for missing values (null count per column)
print("\nMissing values:")
print(df.isnull().sum())


# Define target variable: HighlyProfitable (1 if Total Amount > threshold, else 0).
# The comparison is strict, so rows exactly at the median are labelled 0.
# NOTE(review): 'Total Amount' looks like sale revenue rather than profit --
# confirm that it is the intended proxy for "profitable".
threshold = df['Total Amount'].median()  # or choose a custom value
df['HighlyProfitable'] = (df['Total Amount'] > threshold).astype(int)

# Rename 'Product Category' to 'Product_Category' so the generated dummy
# columns get space-free prefixes. rename() ignores missing labels by default,
# so this is harmless if the cell is executed again.
df = df.rename(columns={'Product Category': 'Product_Category'})

# One-hot encode the categorical columns, keeping every category
# (drop_first=False) as 0/1 integers.
# Only columns that are still present are encoded: after the first run the
# originals have been replaced by their dummies, and passing them to
# get_dummies again would raise KeyError. The guard makes the cell re-runnable.
categorical_cols = [col for col in ('Gender', 'Product_Category') if col in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=False, dtype=int)


Missing values:
Transaction ID      0
Date                0
Customer ID         0
Gender              0
Age                 0
Product Category    0
Quantity            0
Price per Unit      0
Total Amount        0
dtype: int64


In [6]:
def _print_evaluation(title, y_true, y_pred):
    """Print accuracy, confusion matrix and classification report for one model.

    Parameters
    ----------
    title : str
        Heading line printed before the metrics.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    """
    print(title)
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))


# Define features and target variable
# NOTE(review): the target 'HighlyProfitable' is derived from 'Total Amount'.
# If Total Amount equals Quantity * Price per Unit (presumably true for this
# dataset -- confirm), the label is a deterministic function of two of the
# features below, so a perfect score would reflect target leakage rather than
# genuine predictive power.
FEATURE_COLUMNS = [
    'Age',
    'Quantity',
    'Price per Unit',
    'Gender_Male',
    'Product_Category_Clothing',
    'Product_Category_Electronics',
    'Product_Category_Beauty',
]
X = df[FEATURE_COLUMNS]
y = df['HighlyProfitable']

# Split the dataset: 80% train / 20% test, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the decision tree model, then predict on the held-out set
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Create and fit the random forest model, then predict on the held-out set
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluate both models with the same report layout
_print_evaluation("Decision Tree Model Evaluation:", y_test, y_pred_dt)
_print_evaluation("\nRandom Forest Model Evaluation:", y_test, y_pred_rf)

# Feature importance from Random Forest, one "<feature>: <importance>" line per column
feature_importances = rf_model.feature_importances_
for name, importance in zip(X.columns, feature_importances):
    print(f"{name}: {importance:.4f}")


Decision Tree Model Evaluation:
Accuracy: 1.0000
[[ 90   0]
 [  0 110]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        90
           1       1.00      1.00      1.00       110

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Random Forest Model Evaluation:
Accuracy: 1.0000
[[ 90   0]
 [  0 110]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        90
           1       1.00      1.00      1.00       110

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Age: 0.0528
Quantity: 0.1025
Price per Unit: 0.8312
Gender_Male: 0.0051
Product_Category_Clothing: 0.0027
Product_Category_Electronics: 0.0031
Product_Category_Beauty: 0.0026
