In [1]:
import os
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [6]:
# Read the dataset
#
# The CSV location is read from the NAVISION_DATA_CSV environment variable so
# that no machine-specific absolute path has to live in the notebook. When the
# variable is not set, the file is looked up relative to the working directory.
DATA_CSV_PATH = os.environ.get('NAVISION_DATA_CSV', 'full_data_best_secret.csv')

# Load the CSV into a DataFrame. This cell previously relied on `data` being
# left over in the kernel, which fails after "Restart Kernel -> Run All".
data = pd.read_csv(DATA_CSV_PATH)

# Fail early, with the offending file named, if the expected column is absent.
if 'product_navision_detail_category' not in data.columns:
    raise KeyError(
        f"Column 'product_navision_detail_category' not found in {DATA_CSV_PATH!r}"
    )

# Get unique values from 'product_navision_detail_category'
# (a missing value, if present, is kept as one of the unique entries).
unique_categories = data['product_navision_detail_category'].unique()

# Create the data_navision dictionary with the unique categories
data_navision = {
    'product_navision_detail_category': unique_categories
}

# One row per unique category; later steps attach labels to this frame.
df = pd.DataFrame(data_navision)

# Rule-based label assignment for obvious categories
# Lower-case substrings whose presence in a category name marks it as "soft".
# Built once at definition time instead of on every call. All entries must be
# lower-case because they are compared against the lower-cased category.
_SOFT_KEYWORDS = (
    "hose", "hemd", "kleid", "jacke", "rock", "bluse", "shirt", "anzug", "schuhe",
    "socke", "mütze", "abendkleid", "boxershort", "t-shirt kurz", "pullover",
    "schal", "socken", "strickjacke", "trainingshose", "daunenmantel", "top",
    "business hemd l.a uni", "bermuda/short", "bikejacke", "cocktailkleid", "jeansjacke",
    "unisex hausschuhe", "boxershort 5p", "partyhemd", "kniestrumpf", "unterhose",
    "kopfkissen", "hundkissen", "hoody", "unterwäsche", "kniestrümpfe",
)

# "BH" is matched as a whole word (optionally plural) rather than as a bare
# substring: the two letters "bh" also occur inside unrelated words such as
# "Schreibheft", which must not be labelled soft.
_BH_PATTERN = re.compile(r"\bbhs?\b")


def label_soft_hard(category):
    """Assign a rule-based "soft"/"hard" label to a category name.

    Parameters
    ----------
    category : str or missing value
        Category name. Matching is case-insensitive.

    Returns
    -------
    str
        "unknown" if ``category`` is missing (``pd.isna``), "soft" if the
        lower-cased name contains any soft keyword or the whole word "BH",
        otherwise "hard".
    """
    if pd.isna(category):
        return "unknown"  # Handling NaN cases

    # Lower-case once; the keywords are stored in lower case.
    text = category.lower()

    # The previous implementation listed the keyword as upper-case 'BH' and
    # compared it against lower-cased text, so it could never match.
    if _BH_PATTERN.search(text) or any(keyword in text for keyword in _SOFT_KEYWORDS):
        return "soft"
    return "hard"

# Column holding the category names used throughout this step.
CATEGORY_COL = 'product_navision_detail_category'

# Apply the rule-based labeling; missing categories come back as "unknown".
df['label'] = df[CATEGORY_COL].apply(label_soft_hard)

# Build the two training classes from the rule labels and encode them as
# 1 = soft, 0 = hard. Rows labelled "unknown" match neither mask and are
# therefore left out of training.
# NOTE: despite its name, `unlabeled_df` holds the rows the rules labelled
# "hard"; the variable names are kept so later cells keep working.
labeled_df = df.loc[df['label'] == "soft", [CATEGORY_COL, 'label']].assign(label=1)
unlabeled_df = df.loc[df['label'] == "hard", [CATEGORY_COL, 'label']].assign(label=0)

# Combine both classes (soft rows first, then hard rows) for classification.
combined_df = pd.concat([labeled_df, unlabeled_df], ignore_index=True)

# Remove any rows with NaN in the category column (defensive; "unknown" rows
# were already excluded above).
combined_df = combined_df.dropna(subset=[CATEGORY_COL])

if combined_df.empty:
    raise ValueError("No rule-labelled categories available to train the model.")

# Prepare data for training
X = combined_df[CATEGORY_COL]
y = combined_df['label'].astype(int)

# Check for any NaN values in X or y before training
if X.isna().any() or y.isna().any():
    raise ValueError("X or y contains NaN values. Please check the data.")

# Create and train the Naive Bayes model once: TF-IDF features over the
# category text feeding a multinomial Naive Bayes classifier.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X, y)

# Predict a label for every non-missing category, including those the rules
# already labelled. Building a new frame with .assign avoids setting a column
# on the result of dropna and avoids an in-place rename.
df = (
    df.dropna(subset=[CATEGORY_COL])
      .assign(soft_hard_label=lambda frame: model.predict(frame[CATEGORY_COL]))
)

# Map each category name to its predicted label (1 = soft, 0 = hard).
category_labels_dict = dict(zip(df[CATEGORY_COL], df['soft_hard_label']))



In [8]:
### Optional: write the dictionary to a JSON file instead
#import json
#with open('labels_dict.json', 'w') as f:
    #json.dump(category_labels_dict, f)

In [10]:
# Persist the category -> label mapping as a Python source file that defines
# `label_dict`; UTF-8 keeps non-ASCII category names intact.
with open('label_dict.py', mode='w', encoding='utf-8') as out_file:
    out_file.write(f'label_dict = {category_labels_dict}')