In [13]:
import numpy as np
import pandas as pd

In [14]:
data=pd.read_csv("extended_based_on_chemicals9.csv") #first we had use 7
data.head()
data.shape

(336, 2)

In [15]:
data= data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head()

Unnamed: 0,ingredients,category
0,"Enzymes""",cleaning
1,"Butane""",household
2,"Indapamide""",medicine
3,"Polystyrene""",stationery
4,"Odor Eliminators""",household


In [16]:
data.isnull().sum()

ingredients    0
category       0
dtype: int64

In [17]:
data.shape
data=data.dropna()

In [18]:
data.drop_duplicates()
data.shape
cleaned=data.drop_duplicates()

In [19]:
cleaned.shape

(330, 2)

In [20]:
data['category'] = data['category'].str.replace('"', '')

In [21]:
print(data['category'].value_counts())


category
personal_care    47
cosmetic         45
food             45
cleaning         41
medicine         40
stationery       40
others           40
household        38
Name: count, dtype: int64


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [23]:
X = data["ingredients"]
y = data["category"]

In [24]:
tfid=TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_features=15000,
    min_df=2,
    max_df=0.9
)
X_vec= tfid.fit_transform(X)



In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, stratify=y,random_state=42)

**Logistic reg**

In [26]:
clf = LogisticRegression(max_iter=1000, class_weight="balanced")
clf.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [27]:
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

               precision    recall  f1-score   support

     cleaning       0.00      0.00      0.00         8
     cosmetic       0.33      0.11      0.17         9
         food       0.17      0.11      0.13         9
    household       0.25      0.25      0.25         8
     medicine       0.25      1.00      0.40         8
       others       0.25      0.12      0.17         8
personal_care       0.29      0.20      0.24        10
   stationery       0.50      0.25      0.33         8

     accuracy                           0.25        68
    macro avg       0.25      0.26      0.21        68
 weighted avg       0.26      0.25      0.21        68



In [28]:
#logistic accuracy :25 #54

**random forest**

In [29]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [30]:
y_pred=rf_classifier.predict(X_test)
print(classification_report(y_test,y_pred))

               precision    recall  f1-score   support

     cleaning       0.00      0.00      0.00         8
     cosmetic       0.33      0.11      0.17         9
         food       0.17      0.11      0.13         9
    household       0.29      0.25      0.27         8
     medicine       0.25      1.00      0.40         8
       others       0.25      0.12      0.17         8
personal_care       0.29      0.20      0.24        10
   stationery       0.50      0.25      0.33         8

     accuracy                           0.25        68
    macro avg       0.26      0.26      0.21        68
 weighted avg       0.26      0.25      0.21        68



In [32]:
#random accuracy:25 #54

**Xg boost**

In [33]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [34]:
X_train,X_test,y_train,y_test=train_test_split(X_vec, y_encoded, random_state=42, test_size=0.2)

In [35]:
xgb=xgb.XGBClassifier(objective="multi:softmax",random_state=42)
xgb.fit(X_train,y_train)

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [36]:
y_pred = xgb.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.20      0.14      0.17         7
           2       1.00      0.08      0.15        12
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         9
           5       0.00      0.00      0.00         8
           6       0.07      0.80      0.12         5
           7       0.00      0.00      0.00         9

    accuracy                           0.09        68
   macro avg       0.16      0.13      0.06        68
weighted avg       0.20      0.09      0.05        68



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.08823529411764706

In [38]:
#Xgboost: 40

**Svm**

In [39]:
from sklearn.svm import SVC

In [40]:
svm = SVC(random_state=42)

In [41]:
svm.fit(X_train,y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [42]:
y_pred=svm.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.40      0.29      0.33         7
           2       0.33      0.17      0.22        12
           3       0.00      0.00      0.00         7
           4       0.26      0.89      0.40         9
           5       0.33      0.12      0.18         8
           6       0.06      0.20      0.09         5
           7       0.50      0.11      0.18         9

    accuracy                           0.22        68
   macro avg       0.24      0.22      0.18        68
weighted avg       0.24      0.22      0.18        68



In [43]:
#svm accuracy: 45

In [44]:
from scipy.stats import randint

In [45]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'bootstrap': [True, False]
}

In [46]:
rf = RandomForestClassifier()

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter settings sampled
    cv=5,  # Number of cross-validation folds
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

# Fit the random search model
random_search.fit(X_train, y_train)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'bootstrap': [True, False], 'max_depth': <scipy.stats....001879E30DA10>, 'min_samples_leaf': <scipy.stats....001879E3B0CD0>, 'min_samples_split': <scipy.stats....001879E3A96D0>, ...}"
,n_iter,100
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,n_estimators,70
,criterion,'gini'
,max_depth,3
,min_samples_split,8
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [47]:

# Print the best parameters found
print("Best parameters found: ", random_search.best_params_)

# Evaluate the model on the test set
test_accuracy = random_search.score(X_test, y_test)
print("Test set accuracy: ", test_accuracy)

Best parameters found:  {'bootstrap': False, 'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 70}
Test set accuracy:  0.07352941176470588


In [48]:
# Create a Random Forest classifier with the best parameters
best_rf = RandomForestClassifier(bootstrap= True, max_depth= 19,min_samples_leaf= 2,min_samples_split= 11,n_estimators= 94, random_state=42)

# Train the model using the training sets
best_rf.fit(X_train, y_train)

# Evaluate the model on the test set
test_accuracy = best_rf.score(X_test, y_test)
print("Test set accuracy with best parameters: ", test_accuracy)

Test set accuracy with best parameters:  0.1323529411764706


In [49]:
from joblib import dump
dump(best_rf,'Tuned_model.joblib_recent')
dump(tfid, "tfidf_vectorizer.joblib_recent")

['tfidf_vectorizer.joblib_recent']

In [50]:
import joblib
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']