In [50]:
import re
import pickle
import pandas as pd
from nltk.corpus import stopwords

In [66]:
## Preprocessing Setup 

# Combined stopword set: NLTK English + NLTK French + project-specific tokens
stop_words_eng = set(stopwords.words('english'))
stop_words_fr = set(stopwords.words('french'))
custom_stopwords = {
    "chez", "der", "plu", "haut", "peut", "non", "100",
    "produit", "lot", "tout", "cet", "cest", "sou", "san",
}
# Set union via the | operator; membership tests against this set are O(1)
stop_words = stop_words_eng | stop_words_fr | custom_stopwords

def preprocess_text_cleaning_only(text, vectorizer):
    """
    Clean a raw text string and vectorize it.

    The text is lowercased, every character that is not an ASCII letter,
    a digit, or whitespace is dropped, and tokens found in the module-level
    ``stop_words`` set are removed. The cleaned string is then handed to
    ``vectorizer.transform`` as a one-element list.

    Args:
        text: The raw string to clean.
        vectorizer: Any object exposing ``transform(list_of_str)``
            (presumably the pre-trained TF-IDF vectorizer - confirm with caller).

    Returns:
        Whatever ``vectorizer.transform`` returns for the single cleaned string.
    """
    # Normalise case first, then strip everything outside [a-zA-Z0-9\s]
    lowered = text.lower()
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)

    # Keep only the whitespace-separated tokens that are not stopwords
    kept_tokens = [token for token in stripped.split() if token not in stop_words]
    document = ' '.join(kept_tokens)

    # The vectorizer expects an iterable of documents, hence the list wrapper
    return vectorizer.transform([document])

# Load model
def load_model(file_path):
    """
    Load and return a pickled object from disk.

    A leading ``~`` in ``file_path`` is expanded to the current user's home
    directory before opening, because the built-in ``open`` does not perform
    that expansion itself.

    Args:
        file_path: Path to the pickle file (``str`` or ``os.PathLike``).
            Other values accepted by ``open`` are passed through unchanged.

    Returns:
        The object stored in the pickle file.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # Local import keeps this function self-contained without touching the
    # file-level import block.
    import os

    if isinstance(file_path, (str, os.PathLike)):
        file_path = os.path.expanduser(file_path)

    with open(file_path, 'rb') as file:
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files that come from a trusted source.
        return pickle.load(file)

# File paths for the saved (pickled) vectorizer and text model.
# NOTE(review): both paths start with "~"; they only resolve if load_model
# expands the user directory before opening the file - confirm.
text_vectorizer_path = "~/tfidf_vectorizer.pkl"
text_model_path = "~/sgd_text_model.pkl"

# Load the vectorizer and the model once at module level; predict_random_row
# reads these two globals. This runs at import time and touches the disk.
tfidf_vectorizer = load_model(text_vectorizer_path)
text_model = load_model(text_model_path)

# Random Row Prediction
def predict_random_row(df, random_state=None):
    """
    Randomly select a row, preprocess its text, and predict the product type.

    The row's ``designation`` and ``description`` values are joined with a
    space, cleaned and vectorized with the module-level ``tfidf_vectorizer``,
    and classified with the module-level ``text_model``. The selected row and
    the prediction are printed.

    Args:
        df: DataFrame with ``designation`` and ``description`` columns.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selected row can be reproduced. Defaults to ``None`` (random).

    Returns:
        The first element of ``text_model.predict`` for the selected row.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when ``df`` has no rows.
        KeyError: If either expected column is missing.
    """
    # Randomly select a single row from the DataFrame
    random_row = df.sample(n=1, random_state=random_state).iloc[0]

    # Treat missing values in either text field as empty strings; otherwise
    # a NaN would be interpolated as the literal token "nan".
    designation = random_row['designation'] if pd.notnull(random_row['designation']) else ""
    description = random_row['description'] if pd.notnull(random_row['description']) else ""
    text = f"{designation} {description}"

    # Preprocess and vectorize the text
    text_vectorized = preprocess_text_cleaning_only(text, tfidf_vectorizer)

    # predict() returns one label per input document; only one was passed
    prediction = text_model.predict(text_vectorized)[0]

    # Output the result
    print("Random Row Selected:")
    print(f"Designation: {designation}")
    print(f"Description: {description}")
    print(f"Predicted Product Type: {prediction}")

    return prediction

# Example
if __name__ == "__main__":
    # Read the test set from CSV into a DataFrame
    test_data_path = "~/X_test_update.csv"
    test_df = pd.read_csv(test_data_path)

    # Predict a random row
    predict_random_row(test_df)


Random Row Selected:
Designation: Lot De 10 Ballons - Latex - Chiffre 8 - Amscan
Description: Lot de 10 Ballons - Latex - Chiffre 8<br />Lot de 10 Ballons - Latex - Chiffre 8
Predicted Product Type: 2060
