In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer

# Load the CSV file
file_path = r'C:\Users\User\Documents\GitHub\Tech_Lab6\17072024_sales_data\Clean_Data.csv'
df = pd.read_csv(file_path, encoding='ISO-8859-1')

# Convert Ship_Date to datetime and extract features
df['Ship_Date'] = pd.to_datetime(df['Ship_Date'], errors='coerce')
df['Ship_Year'] = df['Ship_Date'].dt.year
df['Ship_Month'] = df['Ship_Date'].dt.month
df['Ship_Day'] = df['Ship_Date'].dt.day
df['Ship_Weekday'] = df['Ship_Date'].dt.weekday

# Drop the original Ship_Date column
df = df.drop(columns=['Ship_Date'])

# Updated list of features
features = ['Ship_Year', 'Ship_Month', 'Ship_Day', 'Ship_Weekday', 'Ship_Mode', 'Customer_ID', 'Segment', 'City', 'State', 'Region', 'Postal_Code', 'Product_ID', 'Category', 'Sub_Category']
target = 'Sales'

# Discretize Sales into categories (for logistic regression)
df['Sales_Category'] = pd.qcut(df[target], q=4, labels=False)

# Drop rows with missing values in the selected columns
df = df.dropna(subset=features + ['Sales_Category'])

# Define preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), ['Ship_Year', 'Ship_Month', 'Ship_Day', 'Ship_Weekday', 'Postal_Code']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Ship_Mode', 'Customer_ID', 'Segment', 'City', 'State', 'Region', 'Product_ID', 'Category', 'Sub_Category'])
    ])

# Create a pipeline with preprocessing and logistic regression
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Define features (X) and target (y)
X = df[features]
y = df['Sales_Category']  # Use the discretized Sales category

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Accuracy: 0.40
Confusion Matrix:
[[296 140  10  58]
 [184 143  44 102]
 [131 132  43 191]
 [ 58  71  58 297]]
Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.59      0.50       504
           1       0.29      0.30      0.30       473
           2       0.28      0.09      0.13       497
           3       0.46      0.61      0.52       484

    accuracy                           0.40      1958
   macro avg       0.37      0.40      0.36      1958
weighted avg       0.37      0.40      0.37      1958



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
