In [4]:
import pandas as pd

# Mock sales records, one list per column (six salespeople, single year).
# None marks a deliberately missing entry; pandas stores it as NaN, so
# "Units Sold" and "Revenue ($)" end up as float columns.
data = {
    "ID": list(range(1, 7)),
    "Salesperson": ["Alice", "Bob", "Charlie", "Diana", "Eva", "Frank"],
    "Region": ["West", "East", "South", "North", "East", "West"],
    "Product": [
        "Electronics",
        "Furniture",
        "Electronics",
        "Clothing",
        "Clothing",
        "Electronics",
    ],
    "Units Sold": [50, 30, None, 100, None, 80],
    "Revenue ($)": [25000, 20000, 15000, 5000, None, 40000],
    "Target ($)": [30000, 20000, 20000, 10000, 10000, 50000],
    "Year": [2021] * 6,
}

# Materialise the table and persist it without the positional index so the
# CSV contains only the eight data columns.
df = pd.DataFrame(data)
df.to_csv(path_or_buf="sales_data.csv", index=False)

print("Mock dataset saved as 'sales_data.csv'.")

Mock dataset saved as 'sales_data.csv'.


In [2]:
# Show the in-memory mock dataset as this cell's output.
df

Unnamed: 0,ID,Salesperson,Region,Product,Units Sold,Revenue ($),Target ($),Year
0,1,Alice,West,Electronics,50.0,25000.0,30000,2021
1,2,Bob,East,Furniture,30.0,20000.0,20000,2021
2,3,Charlie,South,Electronics,,15000.0,20000,2021
3,4,Diana,North,Clothing,100.0,5000.0,10000,2021
4,5,Eva,East,Clothing,,,10000,2021
5,6,Frank,West,Electronics,80.0,40000.0,50000,2021


# Step 1: Import Libraries


In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


# Step 2: Load the Dataset


In [6]:
# Read the mock sales table back from disk; from here on `data` is the
# working DataFrame that the preprocessing steps transform.
data = pd.read_csv(filepath_or_buffer="sales_data.csv")


In [7]:
# Show the freshly loaded table as this cell's output.
data

Unnamed: 0,ID,Salesperson,Region,Product,Units Sold,Revenue ($),Target ($),Year
0,1,Alice,West,Electronics,50.0,25000.0,30000,2021
1,2,Bob,East,Furniture,30.0,20000.0,20000,2021
2,3,Charlie,South,Electronics,,15000.0,20000,2021
3,4,Diana,North,Clothing,100.0,5000.0,10000,2021
4,5,Eva,East,Clothing,,,10000,2021
5,6,Frank,West,Electronics,80.0,40000.0,50000,2021


In [32]:
# Plain-text dump of the table, preceded by a heading line.
print("Original Dataset:", data, sep="\n")

Original Dataset:
   ID Salesperson Region      Product  Units Sold  Revenue ($)  Target ($)  \
0   1       Alice   West  Electronics        50.0      25000.0       30000   
1   2         Bob   East    Furniture        30.0      20000.0       20000   
2   3     Charlie  South  Electronics         NaN      15000.0       20000   
3   4       Diana  North     Clothing       100.0       5000.0       10000   
4   5         Eva   East     Clothing         NaN          NaN       10000   
5   6       Frank   West  Electronics        80.0      40000.0       50000   

   Year  
0  2021  
1  2021  
2  2021  
3  2021  
4  2021  
5  2021  


# Step 3: Handle Missing Values
#Impute missing values in "Units Sold" and "Revenue ($)" with the mean

In [8]:
# Fill the gaps in the two numeric columns with each column's mean.
# NOTE(review): the imputer is fit on the full table, before the train/test
# split in Step 7, and it also fills "Revenue ($)", which Step 7 uses as the
# target — confirm this is acceptable for the intended use.
cols_with_gaps = ['Units Sold', 'Revenue ($)']
num_imputer = SimpleImputer(strategy='mean')
data[cols_with_gaps] = num_imputer.fit_transform(data[cols_with_gaps])


# Step 4: Encode Categorical Features
# Use one-hot encoding for 'Region' and 'Product'

In [34]:
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode.
categorical_features = ['Region', 'Product']

# Dense output so the result can be wrapped in a DataFrame directly;
# drop='first' removes the first category of each feature to avoid
# perfectly collinear dummy columns.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back to the
# legacy one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Fit and transform the categorical columns


In [35]:
# Fit the encoder on the categorical columns and get the dense dummy matrix.
encoded_values = encoder.fit_transform(data[categorical_features])

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out` is its replacement. Prefer the new method and fall
# back to the legacy one only on releases that predate it.
if hasattr(encoder, "get_feature_names_out"):
    encoded_columns = encoder.get_feature_names_out(categorical_features)
else:
    encoded_columns = encoder.get_feature_names(categorical_features)

# Reuse the source frame's index so a later column-wise concat lines the
# encoded rows up with the rows they came from, even if `data` does not carry
# a default 0..n-1 index.
encoded_cats = pd.DataFrame(
    encoded_values,
    columns=encoded_columns,
    index=data.index,
)

In [36]:
# Swap the raw categorical columns for their one-hot encoded counterparts:
# remove the originals, then append the dummy columns side by side.
without_categoricals = data.drop(columns=categorical_features)
data = pd.concat([without_categoricals, encoded_cats], axis=1)

# Step 5: Feature Scaling
# Standardize numerical features

In [37]:
# Standardize the three monetary/volume columns in place.
# NOTE(review): the scaler is fit on the full table before the train/test
# split in Step 7, and "Revenue ($)" (the Step 7 target) is scaled along with
# the features — confirm this is intended.
cols_to_scale = ['Units Sold', 'Revenue ($)', 'Target ($)']
scaler = StandardScaler()
data[cols_to_scale] = scaler.fit_transform(data[cols_to_scale])


# Step 6: Drop Irrelevant Columns (e.g., "ID", "Year")


In [38]:
# Remove the row identifier and the constant year column.
# NOTE(review): the free-text "Salesperson" column is kept and therefore ends
# up in the feature matrix built in Step 7 — confirm that is wanted.
data = data.drop(['ID', 'Year'], axis="columns")


In [39]:
# Heading (preceded by a blank line) followed by the transformed table.
print("\nPreprocessed Dataset:", data, sep="\n")


Preprocessed Dataset:
  Salesperson  Units Sold  Revenue ($)  Target ($)  Region_North  \
0       Alice   -0.682288     0.378528    0.485071           0.0   
1         Bob   -1.592006    -0.094632   -0.242536           0.0   
2     Charlie    0.000000    -0.567792   -0.242536           0.0   
3       Diana    1.592006    -1.514113   -0.970143           1.0   
4         Eva    0.000000     0.000000   -0.970143           0.0   
5       Frank    0.682288     1.798009    1.940285           0.0   

   Region_South  Region_West  Product_Electronics  Product_Furniture  
0           0.0          1.0                  1.0                0.0  
1           0.0          0.0                  0.0                1.0  
2           1.0          0.0                  1.0                0.0  
3           0.0          0.0                  0.0                0.0  
4           0.0          0.0                  0.0                0.0  
5           0.0          1.0                  1.0                0.0  


# Step 7: Splitting the Dataset
#Assuming "Revenue ($)" as the target for this example

In [40]:
# Target vector: the revenue column. Feature matrix: every other column.
y = data['Revenue ($)']
X = data.drop('Revenue ($)', axis="columns")


In [41]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [42]:
# Report the dimensions of each split under a heading line.
print(
    "\nShapes After Splitting:",
    f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}",
    sep="\n",
)



Shapes After Splitting:
X_train: (4, 8), X_test: (2, 8), y_train: (4,), y_test: (2,)


# Save the preprocessed data


In [27]:
# Write each split to its own CSV (no index column), in the order
# train features, test features, train target, test target.
split_files = {
    'X_train_sales.csv': X_train,
    'X_test_sales.csv': X_test,
    'y_train_sales.csv': y_train,
    'y_test_sales.csv': y_test,
}
for csv_path, split in split_files.items():
    split.to_csv(csv_path, index=False)

print("\nPreprocessing Complete! Files saved.")


Preprocessing Complete! Files saved.


# Create a dataset that contains various types of data (numerical, categorical, etc.).
It should have some missing values. Then preprocess it.

# The data can be about house prices, including:
sale price, year built, overall quality, size of the house, size of the garage, garage capacity (e.g., how many cars it holds), house style, condition, etc.