In [10]:
import pandas as pd
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Function to preprocess data
def preprocess_data(data):
    """Build the free-text feature columns used by the classifiers.

    The frame is modified in place and the same object is returned.

    Steps:
      1. Missing 'Purchase Order Text' values are filled from
         'Order Description'.
      2. Both text columns are cast to ``str``.
      3. 'combined_text' is the two columns joined by a single space.
      4. 'cleaned_combined_text' is 'combined_text' with every non-word
         character replaced by a space, then lower-cased.

    Parameters:
        data: DataFrame with 'Purchase Order Text' and 'Order Description'
            columns.

    Returns:
        The same DataFrame, with 'combined_text' and
        'cleaned_combined_text' added.
    """
    po_col = 'Purchase Order Text'
    desc_col = 'Order Description'

    def _normalise(text):
        # Replace each non-word character with a space, then lower-case.
        return re.sub(r'\W', ' ', text).lower()

    # Fill gaps from the description *before* the description is stringified.
    data[po_col] = data[po_col].fillna(data[desc_col]).astype(str)
    data[desc_col] = data[desc_col].astype(str)

    data['combined_text'] = data[po_col] + ' ' + data[desc_col]
    data['cleaned_combined_text'] = data['combined_text'].apply(_normalise)
    return data

# Training configuration.
# Rows missing any of these columns are dropped before preprocessing.
REQUIRED_COLUMNS = ['Purchase Order Text', 'Category', 'Sub Category']
# Columns the two classifiers predict; both are used for stratification.
TARGET_COLUMNS = ['Category', 'Sub Category']
# A stratified split needs at least this many rows in every class.
MIN_ROWS_PER_CLASS = 2
TEST_SIZE = 0.2
RANDOM_STATE = 42
MAX_TFIDF_FEATURES = 5000

# Load and preprocess the first dataset
# NOTE(review): the CSV paths below are absolute and machine-specific.
print("Loading and preprocessing the first dataset...")
data1 = pd.read_csv(r'C:\Users\irt\Downloads\all.csv', low_memory=False)
data1.dropna(subset=REQUIRED_COLUMNS, inplace=True)
data1 = preprocess_data(data1)

# Load and preprocess the second dataset
print("Loading and preprocessing the second dataset...")
data2 = pd.read_csv(r'C:\Users\Public\Downloads\TestingData.csv', low_memory=False)
data2.dropna(subset=REQUIRED_COLUMNS, inplace=True)
data2 = preprocess_data(data2)

# Combine both datasets. ignore_index gives every row a unique label so the
# boolean filtering below cannot be confused by duplicated indices.
print("Combining both datasets...")
combined_data = pd.concat([data1, data2], ignore_index=True)

# Keep the complete label sets: the encoders are fitted on *all* labels so
# the saved encoders still recognise labels whose rows are dropped below.
all_category_labels = combined_data['Category']
all_subcategory_labels = combined_data['Sub Category']

# Drop rows whose Category or Sub Category occurs fewer than
# MIN_ROWS_PER_CLASS times; train_test_split(stratify=...) raises a
# ValueError on such classes. Removing rows for one target can create a new
# singleton in the other target, so repeat until nothing changes.
while True:
    populated = pd.Series(True, index=combined_data.index)
    for target in TARGET_COLUMNS:
        class_sizes = combined_data[target].map(combined_data[target].value_counts())
        populated &= class_sizes >= MIN_ROWS_PER_CLASS
    if populated.all():
        break
    combined_data = combined_data[populated]

# Vectorize the combined cleaned text data (once, after filtering).
# NOTE(review): the vectorizer is fitted on every row, including rows that
# end up in the test splits below, so the reported test accuracy has seen
# the held-out text's vocabulary/IDF statistics.
print("Vectorizing the combined cleaned text data...")
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X_combined = vectorizer.fit_transform(combined_data['cleaned_combined_text'])

# 'Category' and 'Sub Category' are the columns to be predicted
y_combined_category = combined_data['Category']
y_combined_subcategory = combined_data['Sub Category']

# Encode the target variables
print("Encoding the target variables...")
label_encoder_cat = LabelEncoder()
label_encoder_subcat = LabelEncoder()
label_encoder_cat.fit(all_category_labels)
label_encoder_subcat.fit(all_subcategory_labels)
y_combined_category_encoded = label_encoder_cat.transform(y_combined_category)
y_combined_subcategory_encoded = label_encoder_subcat.transform(y_combined_subcategory)

# Save the label encoders
print("Saving the label encoders...")
joblib.dump(label_encoder_cat, 'label_encoder_cat.pkl')
joblib.dump(label_encoder_subcat, 'label_encoder_subcat.pkl')

# Stratified 80/20 train/test split, done independently for each target.
# The two splits use different stratification labels, so a given row may be
# in the training set of one model and the test set of the other.
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X_combined, y_combined_category_encoded,
    test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_combined_category_encoded)
X_train_subcat, X_test_subcat, y_train_subcat, y_test_subcat = train_test_split(
    X_combined, y_combined_subcategory_encoded,
    test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_combined_subcategory_encoded)

# Initialize and train Random Forest for Category
print("Initializing and training Random Forest for Category...")
rf_cat = RandomForestClassifier(random_state=RANDOM_STATE)
rf_cat.fit(X_train_cat, y_train_cat)
print(f"Test Accuracy on Combined Dataset (Category): {rf_cat.score(X_test_cat, y_test_cat)}")

# Initialize and train Random Forest for Sub Category
print("Initializing and training Random Forest for Sub Category...")
rf_subcat = RandomForestClassifier(random_state=RANDOM_STATE)
rf_subcat.fit(X_train_subcat, y_train_subcat)
print(f"Test Accuracy on Combined Dataset (Sub Category): {rf_subcat.score(X_test_subcat, y_test_subcat)}")

# Save the models and vectorizer
print("Saving the models and vectorizer...")
joblib.dump(rf_cat, 'model_cat_rf.pkl')
joblib.dump(rf_subcat, 'model_subcat_rf.pkl')
joblib.dump(vectorizer, 'vectorizer_rf.pkl')

print("Models and vectorizer saved successfully.")


Loading and preprocessing the first dataset...
Loading and preprocessing the second dataset...
Combining both datasets...
Vectorizing the combined cleaned text data...
Encoding the target variables...
Saving the label encoders...
Initializing and training Random Forest for Category...
Test Accuracy on Combined Dataset (Category): 0.929384317742119
Initializing and training Random Forest for Sub Category...
Test Accuracy on Combined Dataset (Sub Category): 0.9670536019119154
Saving the models and vectorizer...
Models and vectorizer saved successfully.


In [20]:
import pandas as pd
import re
import joblib
from sklearn.metrics import accuracy_score
import numpy as np

# Function to preprocess data
def preprocess_data(data):
    """Derive the text columns that are fed to the saved vectorizer.

    Works on ``data`` in place and hands the same frame back. Empty
    'Purchase Order Text' cells are filled from 'Order Description', both
    columns are converted to strings, and two columns are added:
    'combined_text' (the two texts joined by one space) and
    'cleaned_combined_text' (non-word characters turned into spaces, then
    lower-cased).

    Parameters:
        data: DataFrame holding 'Purchase Order Text' and
            'Order Description'.

    Returns:
        ``data`` itself, with the two derived columns added.
    """
    def _scrub(raw_text):
        # Non-word characters become spaces; result is lower-cased.
        spaced = re.sub(r'\W', ' ', raw_text)
        return spaced.lower()

    # The description is still un-stringified here, so only genuinely
    # missing purchase-order texts get filled.
    purchase_text = data['Purchase Order Text'].fillna(data['Order Description'])
    data['Purchase Order Text'] = purchase_text.astype(str)
    data['Order Description'] = data['Order Description'].astype(str)

    joined = data['Purchase Order Text'] + ' ' + data['Order Description']
    data['combined_text'] = joined
    data['cleaned_combined_text'] = data['combined_text'].apply(_scrub)
    return data

# Function to load the CSV file with different encodings
def load_csv_file(file_path, encodings=('utf-8', 'cp1252', 'latin1')):
    """Read a CSV file, trying several text encodings in order.

    The first encoding that decodes the whole file without a
    ``UnicodeDecodeError`` wins. Order matters: 'latin1' maps every possible
    byte to a character and therefore never fails, so it must come last;
    anything listed after it would never be tried. ('iso-8859-1' is the same
    codec as 'latin1', so it is not listed separately.)

    Parameters:
        file_path: Path of the CSV file to read.
        encodings: Encoding names to try, in order.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for the first encoding
        that succeeds.

    Raises:
        ValueError: If none of the encodings can decode the file. The last
            decode error, if any, is attached as the cause.
    """
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding, low_memory=False)
        except UnicodeDecodeError as exc:
            # Remember why this candidate failed and move on to the next one.
            last_error = exc
    raise ValueError("Unable to read the CSV file with the specified encodings.") from last_error

# Load the saved models and vectorizer
# NOTE(review): joblib.load unpickles the files, which can execute arbitrary
# code; only load artifacts produced by a trusted training run.
vectorizer = joblib.load('vectorizer_rf.pkl')
model_cat = joblib.load('model_cat_rf.pkl')
model_subcat = joblib.load('model_subcat_rf.pkl')
label_encoder_cat = joblib.load('label_encoder_cat.pkl')
label_encoder_subcat = joblib.load('label_encoder_subcat.pkl')

# Define the path to your CSV file
input_file_path = r'C:\Users\Public\Downloads\finallynew.csv'  # Update this to the actual path of your input CSV file

# Load the new input CSV file
input_data = load_csv_file(input_file_path)

# Preprocess the input data (preprocess_data modifies input_data in place and
# returns the same frame, so both names refer to one object).
preprocessed_input_data = preprocess_data(input_data)

# Vectorize the cleaned combined text data
vectorized_input_data = vectorizer.transform(preprocessed_input_data['cleaned_combined_text'])

# Make predictions for each row
category_predictions = model_cat.predict(vectorized_input_data)
subcategory_predictions = model_subcat.predict(vectorized_input_data)

# Decode the predictions back to their original labels
decoded_category_predictions = label_encoder_cat.inverse_transform(category_predictions)
decoded_subcategory_predictions = label_encoder_subcat.inverse_transform(subcategory_predictions)

# Attach the predictions to the input rows
input_data['Predicted Category'] = decoded_category_predictions
input_data['Predicted Sub Category'] = decoded_subcategory_predictions

# Everything below that touches the actual labels is guarded, so input
# without label columns still gets predictions written out.
if 'Category' in input_data.columns and 'Sub Category' in input_data.columns:
    # Handle unseen labels for subcategories by mapping them to 'unknown'
    known_subcat = input_data['Sub Category'].isin(label_encoder_subcat.classes_)
    if not known_subcat.all():
        unseen_labels_subcat = set(input_data.loc[~known_subcat, 'Sub Category'].unique())
        print(f"Warning: The following subcategories were unseen and are being mapped to 'unknown': {unseen_labels_subcat}")
        input_data['Sub Category'] = input_data['Sub Category'].where(known_subcat, 'unknown')

    # Unseen categories are left as they are in the data but excluded from
    # the accuracy figures; they can never be predicted correctly and would
    # otherwise abort the whole accuracy calculation.
    known_cat = input_data['Category'].isin(label_encoder_cat.classes_)
    if not known_cat.all():
        unseen_labels_cat = set(input_data.loc[~known_cat, 'Category'].unique())
        print(f"Warning: The following categories were unseen and are excluded from the accuracy calculation: {unseen_labels_cat}")

    # Per-row match flags
    input_data['Category Match'] = np.where(input_data['Category'] == input_data['Predicted Category'], 'Yes', 'No')
    input_data['Subcategory Match'] = np.where(input_data['Sub Category'] == input_data['Predicted Sub Category'], 'Yes', 'No')

    # Calculate accuracy on the rows whose actual labels are usable
    try:
        # Filter out rows with 'unknown' labels before calculating accuracy
        evaluable_rows = (
            (input_data['Sub Category'] != 'unknown')
            & (input_data['Predicted Sub Category'] != 'unknown')
            & known_cat
        )
        filtered_data = input_data[evaluable_rows]

        if not filtered_data.empty:
            # accuracy_score compares the labels directly; encoding both
            # sides to integers first would give the same result.
            category_accuracy = accuracy_score(filtered_data['Category'], filtered_data['Predicted Category'])
            subcategory_accuracy = accuracy_score(filtered_data['Sub Category'], filtered_data['Predicted Sub Category'])

            print(f'Category Prediction Accuracy: {category_accuracy * 100:.2f}%')
            print(f'Subcategory Prediction Accuracy: {subcategory_accuracy * 100:.2f}%')
        else:
            print("No valid data available for accuracy calculation after filtering out 'unknown' labels.")

    except ValueError as e:
        print(f"Error in accuracy calculation: {e}")

else:
    print("Columns 'Category' and 'Sub Category' not found in input_data.")

# Save the prediction results to a new CSV file
output_file_path = r'C:\Users\Public\Downloads\randomforestresult.csv'
input_data.to_csv(output_file_path, index=False)
print(f"Predictions saved to {output_file_path}")

# Display the predictions; label and match columns are shown only when present
display_columns = ['Purchase Order Text', 'Order Description', 'Category', 'Predicted Category', 'Category Match', 'Sub Category', 'Predicted Sub Category', 'Subcategory Match']
print(input_data[[column for column in display_columns if column in input_data.columns]])


Category Prediction Accuracy: 89.96%
Subcategory Prediction Accuracy: 93.61%
Predictions saved to C:\Users\Public\Downloads\randomforestresult.csv
                            Purchase Order Text  \
0                                           nan   
1           CHGS.FOR FAB'S WORK JOB IN T2 06/23   
2              CHGS.FOR FAB'S WORK JOB IN ELECT   
3           CHGS.FOR THERMAL INSU WORK JOB T1 M   
4                                           nan   
...                                         ...   
19995                    FOR PROVISION FY 23-24   
19996                                             
19997                                             
19998                                         0   
19999                   FABRICATION WORK INV.99   

                               Order Description        Category  \
0                                            nan       R&M - P&M   
1            MPP-2 PLANT GENERAL MAINTENANCE JOB  Civil Expenses   
2       MCT-1921G Fan & fan hub asse