In [5]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Celeste\AppData\Roaming\nltk_data...


In [2]:
df = pd.read_excel(r"C:\Users\Celeste\OneDrive\Documents\Target_customer_dataV1.xlsx")
print(df.head())

  Description  Quantity       Date  UnitPrice  CustomerID  Total Amount  \
0         NaN         1 2023-12-26       50.0        25.0          50.0   
1         NaN         2 2023-03-22      500.0       161.0        1000.0   
2         NaN         1 2023-10-18       25.0       191.0          25.0   
3         NaN         3 2023-09-22       30.0       218.0          90.0   
4         NaN         1 2023-03-03      500.0       220.0         500.0   

   Gender  Age Product Category  
0  Female   64           Beauty  
1    Male   64           Beauty  
2    Male   64           Beauty  
3    Male   64           Beauty  
4    Male   64           Beauty  


#### Deriving a product category from descriptions

In [3]:
# Separate rows with and without a Product Category
df['Product Category'] = df['Product Category'].fillna('')  # Fill NaN with empty string for easier processing

# Replace NaN values with an empty string for vectorizer
df['Description'] = df['Description'].fillna('No description available')

# Identify rows where 'Product Category' is missing (empty string)
missing_data = df['Product Category'] == ''

# Data with known Product Categories (used for training)
train_data = df[~missing_data]

# Data with missing Product Categories (used for prediction)
predict_data = df[missing_data]

In [8]:
from nltk.stem import WordNetLemmatizer
import re

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to clean and preprocess text
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabet characters
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])  # Lemmatize words
    return text

# Apply the cleaning function to the 'Description' column
df['Description'] = df['Description'].apply(clean_text)


In [24]:
# Prepare the features (X) and target (y) for training
X_train = train_data['Description']  # Descriptions to train on
y_train = train_data['Product Category']  # Corresponding Product Categories

# Vectorize the descriptions
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=3)
X = vectorizer.fit_transform(df['Description'])
y = df['Product Category']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Get the number of unique classes
n_classes = len(np.unique(y_train))

# Set uniform priors (each class gets an equal probability)
uniform_priors = np.full(n_classes, 1/n_classes)

# Initialize the model with uniform priors
model = MultinomialNB(class_prior=uniform_priors)

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)


In [25]:
# Check accuracy
accuracy = model.score(X_test, y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 85.52%


In [26]:
# Predict the missing Product Categories
X_predict = predict_data['Description']  # Descriptions to predict
X_predict_tfidf = vectorizer.transform(X_predict)  # Apply the same vectorizer to the predict data

# Make predictions for the missing data
predicted_categories = model.predict(X_predict_tfidf)

In [27]:
# Update the 'Product Category' column with the predicted values
df.loc[missing_data, 'Product Category'] = predicted_categories

print(df[['Description', 'Product Category']].head(20))

                 Description Product Category
0   no description available           Beauty
1   no description available           Beauty
2   no description available           Beauty
3   no description available           Beauty
4   no description available           Beauty
5   no description available           Beauty
6   no description available           Beauty
7   no description available           Beauty
8   no description available           Beauty
9   no description available           Beauty
10  no description available         Clothing
11  no description available         Clothing
12  no description available         Clothing
13  no description available         Clothing
14  no description available         Clothing
15  no description available         Clothing
16  no description available         Clothing
17  no description available         Clothing
18  no description available         Clothing
19  no description available         Clothing


In [28]:
print(predicted_categories[:10])  # Check first 10 predictions

['Clothing' 'Clothing' 'Clothing' 'Clothing' 'Clothing' 'Clothing'
 'Clothing' 'Clothing' 'Clothing' 'Clothing']


#### Creating a new column and randomly assigning a US region

In [29]:
# List of US regions
us_regions = ['Northeast', 'Midwest', 'South', 'West']

# Randomly assign a region to each row in the DataFrame
df['US Region'] = np.random.choice(us_regions, size=len(df))

# Display the DataFrame
print(df)

                        Description  Quantity       Date  UnitPrice  \
0          no description available         1 2023-12-26      50.00   
1          no description available         2 2023-03-22     500.00   
2          no description available         1 2023-10-18      25.00   
3          no description available         3 2023-09-22      30.00   
4          no description available         1 2023-03-03     500.00   
...                             ...       ...        ...        ...   
13187        hand warmer owl design         1 2023-07-19       2.10   
13188        hand warmer union jack         2 2023-08-10       2.10   
13189  plaster in tin circus parade         1 2023-09-12       1.65   
13190          vintage snake ladder         2 2023-08-05       3.75   
13191         lunch bag suki design         1 2023-05-07       1.65   

       CustomerID  Total Amount  Gender  Age Product Category  US Region  
0            25.0         50.00  Female   64           Beauty    Midwest

In [31]:
# Save to CSV
df.to_csv('C:/Users/Celeste/Downloads/Target_Customer_Data.V5.csv', index=False)

In [13]:
print(y_train.value_counts())

Product Category
Clothing       351
Electronics    342
Beauty         307
Name: count, dtype: int64
