In [6]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Train and evaluate a logistic-regression classifier that predicts `target`
# from `features`, using the cleaned sales CSV. Results are printed; nothing
# is written to disk.

# Load the CSV file
file_path = r'C:\Users\User\Documents\GitHub\Tech_Lab6\17072024_sales_data\Clean_Data.csv'
df = pd.read_csv(file_path, encoding='ISO-8859-1')

# Define features and target
features = ['Sales', 'Region']  # Example features
target = 'Ship_Mode'  # Example target

# Ensure the selected columns exist. The list is computed once and reused
# both as the condition and in the message.
required_cols = features + [target]
missing_cols = [col for col in required_cols if col not in df.columns]

if missing_cols:
    print(f"Columns not found in the data: {missing_cols}")
else:
    # Drop rows with missing values in the selected columns
    df = df.dropna(subset=required_cols)

    # Encode the target variable if it's categorical.
    # LabelEncoder must be imported from sklearn.preprocessing; without that
    # import this branch raises NameError.
    if df[target].dtype == 'object':
        label_encoder = LabelEncoder()
        df[target] = label_encoder.fit_transform(df[target])

    # Define the preprocessing for numerical and categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            # Scale numerical features
            ('num', StandardScaler(), ['Sales']),
            # One-hot encode categorical features. handle_unknown='ignore'
            # encodes a category that was absent from the training split as
            # all zeros instead of raising at predict time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), ['Region']),
        ])

    # Create a pipeline with preprocessing and logistic regression
    # NOTE(review): the recorded run of this cell predicted a single class for
    # every test row; class_weight='balanced' on the classifier and
    # stratify=y on the split may be worth evaluating -- confirm on the data.
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression())
    ])

    # Define features (X) and target (y)
    X = df[features]
    y = df[target]

    # Split the dataset (80% train / 20% test, fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    # zero_division=0 reports 0.0 for classes that are never predicted
    # without emitting an UndefinedMetricWarning per metric.
    class_report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)




Accuracy: 0.59
Confusion Matrix:
[[   0    0    0  313]
 [   0    0    0  110]
 [   0    0    0  370]
 [   0    0    0 1165]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       313
           1       0.00      0.00      0.00       110
           2       0.00      0.00      0.00       370
           3       0.59      1.00      0.75      1165

    accuracy                           0.59      1958
   macro avg       0.15      0.25      0.19      1958
weighted avg       0.35      0.59      0.44      1958



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
