In [11]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Load dataset with low_memory=False to avoid mixed types warning
data = pd.read_csv(r'C:\Users\irt\Downloads\all.csv', low_memory=False)

# Fill missing values in 'Purchase Order Text' with 'Order Description'
data['Purchase Order Text'] = data['Purchase Order Text'].fillna(data['Order Description'])

# Drop rows with missing values in 'Purchase Order Text', 'Category', or 'Sub Category'
data.dropna(subset=['Purchase Order Text', 'Category', 'Sub Category'], inplace=True)

# Convert all entries in the 'Purchase Order Text' and 'Order Description' columns to strings
data['Purchase Order Text'] = data['Purchase Order Text'].astype(str)
data['Order Description'] = data['Order Description'].astype(str)

# Clean the text data
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lower case
    return text

# Concatenate 'Purchase Order Text' and 'Order Description'
data['combined_text'] = data['Purchase Order Text'] + ' ' + data['Order Description']
data['cleaned_combined_text'] = data['combined_text'].apply(clean_text)

# Vectorize the combined cleaned text data
vectorizer = TfidfVectorizer(max_features=5000)
X_combined = vectorizer.fit_transform(data['cleaned_combined_text'])

# Assuming 'Category' and 'Sub Category' are the columns to be predicted
y_category = data['Category']
y_subcategory = data['Sub Category']

# Split the data for category and subcategory
X_train_cat, X_test_cat, y_category_train, y_category_test = train_test_split(X_combined, y_category, test_size=0.2, random_state=42)
X_train_sub, X_test_sub, y_subcategory_train, y_subcategory_test = train_test_split(X_combined, y_subcategory, test_size=0.2, random_state=42)

# Train the model for Category
model_category = RandomForestClassifier()
model_category.fit(X_train_cat, y_category_train)

# Train the model for SubCategory
model_subcategory = RandomForestClassifier()
model_subcategory.fit(X_train_sub, y_subcategory_train)

# Evaluate the models using cross-validation
category_scores = cross_val_score(model_category, X_combined, y_category, cv=5)
subcategory_scores = cross_val_score(model_subcategory, X_combined, y_subcategory, cv=5)

print(f"Category Model Accuracy: {category_scores.mean()}")
print(f"SubCategory Model Accuracy: {subcategory_scores.mean()}")

# # Predict categories and subcategories on the same dataset
# data['predicted_category'] = model_category.predict(X_combined)
# data['predicted_subcategory'] = model_subcategory.predict(X_combined)

# # Show the results
# print(data[['Purchase Order Text', 'predicted_category', 'predicted_subcategory']])

Category Model Accuracy: 0.9074003187818963
SubCategory Model Accuracy: 0.9209532212770943


In [9]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Load the new data
new_data = pd.read_csv(r'C:\Users\Public\Downloads\TestingData.csv', low_memory=False)

# Fill missing values in 'Purchase Order Text' with 'Order Description'
new_data['Purchase Order Text'] = new_data['Purchase Order Text'].fillna(new_data['Order Description'])

# Convert all entries in the 'Purchase Order Text' and 'Order Description' columns to strings
new_data['Purchase Order Text'] = new_data['Purchase Order Text'].astype(str)
new_data['Order Description'] = new_data['Order Description'].astype(str)

# Clean the text data
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lower case
    return text

# Concatenate 'Purchase Order Text' and 'Order Description'
new_data['combined_text'] = new_data['Purchase Order Text'] + ' ' + new_data['Order Description']
new_data['cleaned_combined_text'] = new_data['combined_text'].apply(clean_text)



# Vectorize the cleaned text data using the previously fitted vectorizer
X_new_combined = vectorizer.transform(new_data['cleaned_combined_text'])

# Predict categories and subcategories using the trained models
new_data['predicted_category'] = model_category.predict(X_new_combined)
new_data['predicted_subcategory'] = model_subcategory.predict(X_new_combined)

# Show the results
print(new_data[['Purchase Order Text', 'predicted_category', 'predicted_subcategory']])

                            Purchase Order Text predicted_category  \
0      Refurbishment of GEA TRV-501 10 Ltr Bowl          R&M - P&M   
1                                                        R&M - P&M   
2                                                        R&M - P&M   
3                                                        R&M - P&M   
4                                                        R&M - P&M   
...                                         ...                ...   
56064                                         0    Stores & Spares   
56065                                         0    Stores & Spares   
56066                                         0          R&M - P&M   
56067                                         0    Stores & Spares   
56068                                         0    Stores & Spares   

      predicted_subcategory  
0                 Other R&M  
1                    Ignore  
2                    Ignore  
3                    Ignore  
4        