In [1]:
import pandas as pd
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Function to preprocess data
def preprocess_data(data):
    data['Purchase Order Text'] = data['Purchase Order Text'].fillna(data['Order Description'])
    data['Purchase Order Text'] = data['Purchase Order Text'].astype(str)
    data['Order Description'] = data['Order Description'].astype(str)
    data['combined_text'] = data['Purchase Order Text'] + ' ' + data['Order Description']
    data['cleaned_combined_text'] = data['combined_text'].apply(lambda x: re.sub(r'\W', ' ', x).lower().strip())
    return data

# Function to load the CSV file with different encodings
def load_csv_file(file_path):
    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding, low_memory=False)
        except UnicodeDecodeError:
            continue
    raise ValueError("Unable to read the CSV file with the specified encodings.")

# Function to load and preprocess the datasets
def load_and_preprocess(file_path):
    print(f"Loading and preprocessing {file_path}...")
    data = load_csv_file(file_path)
    data.dropna(subset=['Purchase Order Text', 'Category', 'Sub Category'], inplace=True)
    return preprocess_data(data)

# Function to train the models
def train_models(train_file_paths):
    # Load and preprocess the datasets
    data1 = load_and_preprocess(train_file_paths[0])
    data2 = load_and_preprocess(train_file_paths[1])

    # Combine both datasets
    print("Combining both datasets...")
    combined_data = pd.concat([data1, data2])

    # Vectorize the combined cleaned text data
    print("Vectorizing the combined cleaned text data...")
    vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
    X_combined = vectorizer.fit_transform(combined_data['cleaned_combined_text'])

    # Encode the target variables
    print("Encoding the target variables...")
    label_encoder_cat = LabelEncoder()
    label_encoder_subcat = LabelEncoder()
    y_combined_category_encoded = label_encoder_cat.fit_transform(combined_data['Category'])
    y_combined_subcategory_encoded = label_encoder_subcat.fit_transform(combined_data['Sub Category'])

    # Save the label encoders
    print("Saving the label encoders...")
    joblib.dump(label_encoder_cat, 'label_encoder_cat.pkl')
    joblib.dump(label_encoder_subcat, 'label_encoder_subcat.pkl')

    # Handle rare subcategories
    subcategory_counts = Counter(y_combined_subcategory_encoded)
    rare_subcategories = [subcat for subcat, count in subcategory_counts.items() if count == 1]
    if rare_subcategories:
        mask = ~combined_data['Sub Category'].isin(label_encoder_subcat.inverse_transform(rare_subcategories))
        combined_data = combined_data[mask]
        X_combined = vectorizer.fit_transform(combined_data['cleaned_combined_text'])
        y_combined_category_encoded = label_encoder_cat.transform(combined_data['Category'])
        y_combined_subcategory_encoded = label_encoder_subcat.transform(combined_data['Sub Category'])

    # Stratified sampling to split the data
    X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
        X_combined, y_combined_category_encoded, test_size=0.2, random_state=42, stratify=y_combined_category_encoded)
    X_train_subcat, X_test_subcat, y_train_subcat, y_test_subcat = train_test_split(
        X_combined, y_combined_subcategory_encoded, test_size=0.2, random_state=42, stratify=y_combined_subcategory_encoded)

    # Initialize and train Random Forest with GridSearchCV
    print("Initializing and training Random Forest for Category with GridSearchCV...")
    rf_cat = RandomForestClassifier(random_state=42)
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    }
    grid_cat = GridSearchCV(rf_cat, param_grid, cv=3, n_jobs=-1, verbose=2)
    grid_cat.fit(X_train_cat, y_train_cat)
    print(f"Best parameters for category: {grid_cat.best_params_}")
    print(f"Test Accuracy on Combined Dataset (Category): {grid_cat.score(X_test_cat, y_test_cat)}")

    print("Initializing and training Random Forest for Sub Category with GridSearchCV...")
    rf_subcat = RandomForestClassifier(random_state=42)
    grid_subcat = GridSearchCV(rf_subcat, param_grid, cv=3, n_jobs=-1, verbose=2)
    grid_subcat.fit(X_train_subcat, y_train_subcat)
    print(f"Best parameters for subcategory: {grid_subcat.best_params_}")
    print(f"Test Accuracy on Combined Dataset (Sub Category): {grid_subcat.score(X_test_subcat, y_test_subcat)}")

    # Save the models and vectorizer
    print("Saving the models and vectorizer...")
    joblib.dump(grid_cat.best_estimator_, 'model_cat_rf.pkl')
    joblib.dump(grid_subcat.best_estimator_, 'model_subcat_rf.pkl')
    joblib.dump(vectorizer, 'vectorizer_rf.pkl')

# Paths to your training datasets
train_file_paths = [
    r'C:\Users\irt\Downloads\all.csv',
    r'C:\Users\Public\Downloads\TestingData.csv'
]

# Run the training function
train_models(train_file_paths)


Loading and preprocessing C:\Users\irt\Downloads\all.csv...
Loading and preprocessing C:\Users\Public\Downloads\TestingData.csv...
Combining both datasets...
Vectorizing the combined cleaned text data...
Encoding the target variables...
Saving the label encoders...
Initializing and training Random Forest for Category with GridSearchCV...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best parameters for category: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Test Accuracy on Combined Dataset (Category): 0.9281893706612041
Initializing and training Random Forest for Sub Category with GridSearchCV...
Fitting 3 folds for each of 12 candidates, totalling 36 fits




Best parameters for subcategory: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Test Accuracy on Combined Dataset (Sub Category): 0.965631045863207
Saving the models and vectorizer...


In [3]:
import pandas as pd
import re
import joblib
import numpy as np
from sklearn.metrics import accuracy_score

# Function to preprocess data
def preprocess_data(data):
    data['Purchase Order Text'] = data['Purchase Order Text'].fillna(data['Order Description'])
    data['Purchase Order Text'] = data['Purchase Order Text'].astype(str)
    data['Order Description'] = data['Order Description'].astype(str)
    data['combined_text'] = data['Purchase Order Text'] + ' ' + data['Order Description']
    data['cleaned_combined_text'] = data['combined_text'].apply(lambda x: re.sub(r'\W', ' ', x).lower().strip())
    return data

# Function to load the CSV file with different encodings
def load_csv_file(file_path):
    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding, low_memory=False)
        except UnicodeDecodeError:
            continue
    raise ValueError("Unable to read the CSV file with the specified encodings.")

# Function to test and predict with the trained models
def test_and_predict(input_file_path):
    # Load and preprocess new input data for prediction
    input_data = load_csv_file(input_file_path)
    input_data = preprocess_data(input_data)

    # Load the saved models and vectorizer
    vectorizer = joblib.load('vectorizer_rf.pkl')
    model_cat = joblib.load('model_cat_rf.pkl')
    model_subcat = joblib.load('model_subcat_rf.pkl')
    label_encoder_cat = joblib.load('label_encoder_cat.pkl')
    label_encoder_subcat = joblib.load('label_encoder_subcat.pkl')

    # Vectorize and predict new input data
    vectorized_input_data = vectorizer.transform(input_data['cleaned_combined_text'])
    category_predictions = model_cat.predict(vectorized_input_data)
    subcategory_predictions = model_subcat.predict(vectorized_input_data)

    # Decode predictions
    input_data['Predicted Category'] = label_encoder_cat.inverse_transform(category_predictions)
    input_data['Predicted Sub Category'] = label_encoder_subcat.inverse_transform(subcategory_predictions)

    # Handle unseen labels
    unseen_labels_subcat = set(input_data['Sub Category']) - set(label_encoder_subcat.classes_)
    if unseen_labels_subcat:
        print(f"Warning: Unseen subcategories being mapped to 'unknown': {unseen_labels_subcat}")
        input_data['Sub Category'] = input_data['Sub Category'].apply(lambda x: 'unknown' if x in unseen_labels_subcat else x)

    # Check accuracy if actual labels exist
    if 'Category' in input_data.columns and 'Sub Category' in input_data.columns:
        input_data['Category Match'] = np.where(input_data['Category'] == input_data['Predicted Category'], 'Yes', 'No')
        input_data['Subcategory Match'] = np.where(input_data['Sub Category'] == input_data['Predicted Sub Category'], 'Yes', 'No')
        filtered_data = input_data[(input_data['Sub Category'] != 'unknown') & (input_data['Predicted Sub Category'] != 'unknown')]

        if not filtered_data.empty:
            actual_category = label_encoder_cat.transform(filtered_data['Category'])
            actual_subcategory = label_encoder_subcat.transform(filtered_data['Sub Category'])
            filtered_category_predictions = label_encoder_cat.transform(filtered_data['Predicted Category'])
            filtered_subcategory_predictions = label_encoder_subcat.transform(filtered_data['Predicted Sub Category'])

            category_accuracy = accuracy_score(actual_category, filtered_category_predictions)
            subcategory_accuracy = accuracy_score(actual_subcategory, filtered_subcategory_predictions)

            print(f'Category Prediction Accuracy: {category_accuracy * 100:.2f}%')
            print(f'Subcategory Prediction Accuracy: {subcategory_accuracy * 100:.2f}%')
        else:
            print("No valid data available for accuracy calculation after filtering out 'unknown' labels.")

    # Save results to CSV
    output_file_path = r'C:\Users\Public\Downloads\forestimporvedlatestresultlong.csv'
    input_data.to_csv(output_file_path, index=False)
    print(f"Predictions saved to {output_file_path}")

    print(input_data[['Purchase Order Text', 'Order Description', 'Category', 'Predicted Category', 'Category Match', 'Sub Category', 'Predicted Sub Category', 'Subcategory Match']])

# Path to your testing dataset
input_file_path = r'C:\Users\Public\Downloads\finallynew.csv'

# Run the testing and prediction function
test_and_predict(input_file_path)


Category Prediction Accuracy: 89.36%
Subcategory Prediction Accuracy: 92.81%
Predictions saved to C:\Users\Public\Downloads\forestimporvedlatestresultlong.csv
                            Purchase Order Text  \
0                                           nan   
1           CHGS.FOR FAB'S WORK JOB IN T2 06/23   
2              CHGS.FOR FAB'S WORK JOB IN ELECT   
3           CHGS.FOR THERMAL INSU WORK JOB T1 M   
4                                           nan   
...                                         ...   
19995                    FOR PROVISION FY 23-24   
19996                                             
19997                                             
19998                                         0   
19999                   FABRICATION WORK INV.99   

                               Order Description        Category  \
0                                            nan       R&M - P&M   
1            MPP-2 PLANT GENERAL MAINTENANCE JOB  Civil Expenses   
2       MCT-1921G Fan & 