In [11]:
pip install torch

Collecting torch
  Downloading torch-2.3.1-cp312-cp312-win_amd64.whl.metadata (26 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)
  Using cached mkl-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting intel-openmp==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Using cached intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.2 kB)
Collecting tbb==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Downloading tbb-2021.13.0-py3-none-win_amd64.whl.metadata (1.1 kB)
Downloading torch-2.3.1-cp312-cp312-win_amd64.whl (159.7 MB)
   ---------------------------------------- 0.0/159.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/159.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/159.7 MB 325.1 kB/s eta 0:08:12
   ---------------------------------------- 0.1/159.7 MB 465.5 kB/s eta 0:05:44
   ---------------------------------------- 0.1/159.7 MB 521.8 kB/s eta 0:05:06
   ---------------------------------------- 0.2/159.7 MB 701.4 kB

In [9]:
import pandas as pd
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Function to preprocess data
def preprocess_data(data):
    # Fill missing values in 'Purchase Order Text' with 'Order Description' and combine text fields
    data['Purchase Order Text'] = data['Purchase Order Text'].fillna(data['Order Description'])
    data['Purchase Order Text'] = data['Purchase Order Text'].astype(str)
    data['Order Description'] = data['Order Description'].astype(str)
    data['combined_text'] = data['Purchase Order Text'] + ' ' + data['Order Description']
    data['cleaned_combined_text'] = data['combined_text'].apply(lambda x: re.sub(r'\W', ' ', x).lower())
    return data

# Load and preprocess the first dataset
print("Loading and preprocessing the first dataset...")
data1 = pd.read_csv(r'C:\Users\irt\Downloads\all.csv', low_memory=False)
data1.dropna(subset=['Purchase Order Text', 'Category', 'Sub Category'], inplace=True)
data1 = preprocess_data(data1)

# Load and preprocess the second dataset
print("Loading and preprocessing the second dataset...")
data2 = pd.read_csv(r'C:\Users\Public\Downloads\TestingData.csv', low_memory=False)
data2.dropna(subset=['Purchase Order Text', 'Category', 'Sub Category'], inplace=True)
data2 = preprocess_data(data2)

# Combine both datasets
print("Combining both datasets...")
combined_data = pd.concat([data1, data2])

# Vectorize the combined cleaned text data
print("Vectorizing the combined cleaned text data...")
vectorizer = TfidfVectorizer(max_features=5000)
X_combined = vectorizer.fit_transform(combined_data['cleaned_combined_text'])

# Assuming 'Category' and 'Sub Category' are the columns to be predicted
y_combined_category = combined_data['Category']
y_combined_subcategory = combined_data['Sub Category']

# Encode the target variables
print("Encoding the target variables...")
label_encoder_cat = LabelEncoder()
label_encoder_subcat = LabelEncoder()
y_combined_category_encoded = label_encoder_cat.fit_transform(y_combined_category)
y_combined_subcategory_encoded = label_encoder_subcat.fit_transform(y_combined_subcategory)

# Save the label encoders
print("Saving the label encoders...")
joblib.dump(label_encoder_cat, 'label_encoder_cat.pkl')
joblib.dump(label_encoder_subcat, 'label_encoder_subcat.pkl')

# Split the data into 50% training and 50% testing
print("Splitting the data into 50% training and 50% testing...")
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(X_combined, y_combined_category_encoded, test_size=0.2, random_state=42)
X_train_subcat, X_test_subcat, y_train_subcat, y_test_subcat = train_test_split(X_combined, y_combined_subcategory_encoded, test_size=0.2, random_state=42)

# Initialize and train SVC for Category on 50% of the data
print("Initializing and training SVC for Category on 50% of the data...")
svc_cat = SVC(probability=True, random_state=42)
svc_cat.fit(X_train_cat, y_train_cat)
print(f"Test Accuracy on 50% of the Combined Dataset (Category): {svc_cat.score(X_test_cat, y_test_cat)}")

# Initialize and train SVC for Sub Category on 50% of the data
print("Initializing and training SVC for Sub Category on 50% of the data...")
svc_subcat = SVC(probability=True, random_state=42)
svc_subcat.fit(X_train_subcat, y_train_subcat)
print(f"Test Accuracy on 50% of the Combined Dataset (Sub Category): {svc_subcat.score(X_test_subcat, y_test_subcat)}")

# Save the models and vectorizer
print("Saving the models and vectorizer...")
joblib.dump(svc_cat, 'model_cat.pkl')
joblib.dump(svc_subcat, 'model_subcat.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

print("Models and vectorizer saved successfully.")


Loading and preprocessing the first dataset...
Loading and preprocessing the second dataset...
Combining both datasets...
Vectorizing the combined cleaned text data...
Encoding the target variables...
Saving the label encoders...
Splitting the data into 50% training and 50% testing...
Initializing and training SVC for Category on 50% of the data...
Test Accuracy on 50% of the Combined Dataset (Category): 0.9297826334357574
Initializing and training SVC for Sub Category on 50% of the data...
Test Accuracy on 50% of the Combined Dataset (Sub Category): 0.9667121884602253
Saving the models and vectorizer...
Models and vectorizer saved successfully.


In [9]:
import pandas as pd
import re
import joblib
from sklearn.metrics import accuracy_score

# Function to preprocess data
def preprocess_data(data):
    # Fill missing values in 'Purchase Order Text' with 'Order Description' and combine text fields
    data['Purchase Order Text'] = data['Purchase Order Text'].fillna(data['Order Description'])
    data['Purchase Order Text'] = data['Purchase Order Text'].astype(str)
    data['Order Description'] = data['Order Description'].astype(str)
    data['combined_text'] = data['Purchase Order Text'] + ' ' + data['Order Description']
    data['cleaned_combined_text'] = data['combined_text'].apply(lambda x: re.sub(r'\W', ' ', x).lower())
    return data

# Function to load the CSV file with different encodings
def load_csv_file(file_path):
    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding, low_memory=False)
        except UnicodeDecodeError:
            continue
    raise ValueError("Unable to read the CSV file with the specified encodings.")

# Load the saved models and vectorizer
vectorizer = joblib.load('vectorizer.pkl')
model_cat = joblib.load('model_cat.pkl')
model_subcat = joblib.load('model_subcat.pkl')
label_encoder_cat = joblib.load('label_encoder_cat.pkl')
label_encoder_subcat = joblib.load('label_encoder_subcat.pkl')

# Define the path to your CSV file
input_file_path = r'C:\Users\Public\Downloads\finallynew.csv'  # Update this to the actual path of your input CSV file

# Load the new input CSV file
input_data = load_csv_file(input_file_path)

# Preprocess the input data
preprocessed_input_data = preprocess_data(input_data)

# Vectorize the cleaned combined text data
vectorized_input_data = vectorizer.transform(preprocessed_input_data['cleaned_combined_text'])

# Make predictions for each row
category_predictions = model_cat.predict(vectorized_input_data)
subcategory_predictions = model_subcat.predict(vectorized_input_data)

# Decode the predictions
decoded_category_predictions = label_encoder_cat.inverse_transform(category_predictions)
decoded_subcategory_predictions = label_encoder_subcat.inverse_transform(subcategory_predictions)

# Add predictions to input data
input_data['Predicted Category'] = decoded_category_predictions
input_data['Predicted Sub Category'] = decoded_subcategory_predictions

# Add match columns
input_data['Category Match'] = input_data.apply(lambda row: 'Yes' if row['Category'] == row['Predicted Category'] else 'No', axis=1)
input_data['Sub Category Match'] = input_data.apply(lambda row: 'Yes' if row['Sub Category'] == row['Predicted Sub Category'] else 'No', axis=1)

# Ensure the true and predicted labels are strings
input_data['Category'] = input_data['Category'].astype(str)
input_data['Sub Category'] = input_data['Sub Category'].astype(str)
decoded_category_predictions = decoded_category_predictions.astype(str)
decoded_subcategory_predictions = decoded_subcategory_predictions.astype(str)

# Save the predictions to a new CSV file
output_file_path = r'C:\Users\Public\Downloads\svcresult.csv'
input_data.to_csv(output_file_path, index=False)

# Calculate accuracy if actual labels are available
if 'Category' in input_data.columns and 'Sub Category' in input_data.columns:
    true_category_labels = input_data['Category']
    true_subcategory_labels = input_data['Sub Category']

    category_accuracy = accuracy_score(true_category_labels, decoded_category_predictions)
    subcategory_accuracy = accuracy_score(true_subcategory_labels, decoded_subcategory_predictions)

    print(f'Category Prediction Accuracy: {category_accuracy}')
    print(f'Sub Category Prediction Accuracy: {subcategory_accuracy}')
else:
    print("Actual label columns not found in the input data. Accuracy cannot be calculated.")

# Display the predictions
print(input_data[['Purchase Order Text', 'Order Description', 'Predicted Category', 'Predicted Sub Category', 'Category Match', 'Sub Category Match']])


Category Prediction Accuracy: 0.8988
Sub Category Prediction Accuracy: 0.89035
                            Purchase Order Text  \
0                                           nan   
1           CHGS.FOR FAB'S WORK JOB IN T2 06/23   
2              CHGS.FOR FAB'S WORK JOB IN ELECT   
3           CHGS.FOR THERMAL INSU WORK JOB T1 M   
4                                           nan   
...                                         ...   
19995                    FOR PROVISION FY 23-24   
19996                                             
19997                                             
19998                                         0   
19999                   FABRICATION WORK INV.99   

                               Order Description Predicted Category  \
0                                            nan          R&M - P&M   
1            MPP-2 PLANT GENERAL MAINTENANCE JOB     Civil Expenses   
2       MCT-1921G Fan & fan hub assembly to be r     Civil Expenses   
3                 Scaffh