In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load and clean the data
df = pd.read_excel('Online Retail.xlsx')
df.columns = df.columns.str.strip()
df.dropna(subset=['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country'], inplace=True)

# Create Revenue column
df['Revenue'] = df['Quantity'] * df['UnitPrice']

# Step 1: Group revenue by InvoiceNo
invoice_revenue = df.groupby('InvoiceNo')['Revenue'].sum().reset_index()

# Step 2: Create HighValue label
threshold = 500  # You can adjust this number if you want
invoice_revenue['HighValue'] = (invoice_revenue['Revenue'] > threshold).astype(int)

# Step 3: Bring in Country information
invoice_country = df[['InvoiceNo', 'Country']].drop_duplicates()
invoice_data = pd.merge(invoice_revenue, invoice_country, on='InvoiceNo')

# Step 4: One-hot encode country
invoice_data = pd.get_dummies(invoice_data, columns=['Country'], drop_first=True)

# Step 5: Define features and target
X = invoice_data.drop(['InvoiceNo', 'Revenue', 'HighValue'], axis=1)
y = invoice_data['HighValue']

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train logistic regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Step 8: Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8019378098242451

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.99      0.89      3565
           1       0.42      0.02      0.03       873

    accuracy                           0.80      4438
   macro avg       0.61      0.51      0.46      4438
weighted avg       0.73      0.80      0.72      4438

