# MNB, LR, SVM, NN, KNN, DT, RF with TF-IDF vectorization
<hr> 

#### <b> <i> Importing Libraries </b> </i>

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from joblib import dump, load

#### <b><i> Loading Dataset </b></i>

In [2]:
# Read CSV File
df = pd.read_csv("Chat_for_train.csv", encoding='latin-1')  
df.dropna(subset=['Message'], inplace=True)

text = df['Message'].tolist()
labels = df['Value'].tolist()

# <b><i> Model Training

## <b><i> Multinomial Naive Bayes Classifier

In [5]:


from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=10)
# Create a pipeline 
pipeline_mnb = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('mnb', MultinomialNB())  
])

# Train the pipeline
pipeline_mnb.fit(X_train, y_train)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=10)
# Create a pipeline 
pipeline_mnb1 = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('mnb', MultinomialNB(alpha = 0.5))  
])

# Train the pipeline
pipeline_mnb1.fit(X_train, y_train)

In [7]:
# Make predictions on the testing data
y_pred_nb = pipeline_mnb.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_nb)
print("Accuracy:", accuracy)

Accuracy: 0.6297936335533462


In [8]:
# Make predictions on the testing data
y_pred_nb1 = pipeline_mnb1.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_nb1)
print("Accuracy:", accuracy)

Accuracy: 0.6351407803492355


In [9]:
f1 = f1_score(y_test, y_pred_nb, average = 'weighted')
print("F1-score:", f1)

precision = precision_score(y_test, y_pred_nb, average = 'weighted')
print("Precision:", precision)

F1-score: 0.6141447358849744
Precision: 0.7095806625372284


In [10]:
f1 = f1_score(y_test, y_pred_nb1, average = 'weighted')
print("F1-score:", f1)

precision = precision_score(y_test, y_pred_nb1, average = 'weighted')
print("Precision:", precision)

F1-score: 0.6241319429015776
Precision: 0.6936277142453803


In [11]:
dump(pipeline_mnb, 'pipeline_mnb.joblib')

['pipeline_mnb.joblib']

In [12]:
dump(pipeline_mnb1, 'pipeline_mnb1.joblib')

['pipeline_mnb1.joblib']

In [13]:
def error_types(y_true, y_pred):
    TP = 0  # True Positives
    TN = 0  # True Negatives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    for true_label, pred_label in zip(y_true, y_pred):
        if true_label == 1 and pred_label == 1:
            TP += 1
        elif true_label == 0 and pred_label == 0:
            TN += 1
        elif true_label == 0 and pred_label == 1:
            FP += 1
        elif true_label == 1 and pred_label == 0:
            FN += 1

    return TP, TN, FP, FN

# Assuming y_true and y_pred are lists or arrays of binary labels (0 or 1)
TP, TN, FP, FN = error_types(y_test, y_pred_nb)

print("True Positives:", TP)
print("True Negatives:", TN)
print("False Positives:", FP)
print("False Negatives:", FN)

True Positives: 1693
True Negatives: 4393
False Positives: 260
False Negatives: 1690


In [14]:
def error_types(y_true, y_pred):
    TP = 0  # True Positives
    TN = 0  # True Negatives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    for true_label, pred_label in zip(y_true, y_pred):
        if true_label == 1 and pred_label == 1:
            TP += 1
        elif true_label == 0 and pred_label == 0:
            TN += 1
        elif true_label == 0 and pred_label == 1:
            FP += 1
        elif true_label == 1 and pred_label == 0:
            FN += 1

    return TP, TN, FP, FN

# Assuming y_true and y_pred are lists or arrays of binary labels (0 or 1)
TP, TN, FP, FN = error_types(y_test, y_pred_nb1)

print("True Positives:", TP)
print("True Negatives:", TN)
print("False Positives:", FP)
print("False Negatives:", FN)

True Positives: 1815
True Negatives: 4217
False Positives: 364
False Negatives: 1550


## <b><i> Logistic Regresion Classifier

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=10)
# Create a pipeline 
pipeline_lr = Pipeline([
    ('vect' , CountVectorizer()),
    ('tfidf', TfidfTransformer()),  
    ('lr', LogisticRegression(random_state = 10, class_weight='balanced', max_iter = 60000))  
])

# Train the pipeline
pipeline_lr.fit(X_train, y_train)

In [10]:
y_pred_lr = pipeline_lr.predict(X_test)
print("Accuracy Score = ", accuracy_score(y_test, y_pred_lr))

f1 = f1_score(y_test, y_pred_lr, average = 'weighted')
print("F1-score:", f1)


precision = precision_score(y_test, y_pred_lr, average = 'weighted')
print("Precision:", precision)

Accuracy Score =  0.7227003091319242
F1-score: 0.7209011853041709
Precision: 0.722162183923273


In [25]:
def error_types(y_true, y_pred):
    TP = 0  # True Positives
    TN = 0  # True Negatives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    for true_label, pred_label in zip(y_true, y_pred):
        if true_label == 1 and pred_label == 1:
            TP += 1
        elif true_label == 0 and pred_label == 0:
            TN += 1
        elif true_label == 0 and pred_label == 1:
            FP += 1
        elif true_label == 1 and pred_label == 0:
            FN += 1

    return TP, TN, FP, FN

# Assuming y_true and y_pred are lists or arrays of binary labels (0 or 1)
TP, TN, FP, FN = error_types(y_test, y_pred_lr)

print("True Positives:", TP)
print("True Negatives:", TN)
print("False Positives:", FP)
print("False Negatives:", FN)

True Positives: 2157
True Negatives: 3597
False Positives: 681
False Negatives: 762


In [26]:
dump(pipeline_lr, 'logistic_reg.joblib')

['logistic_reg.joblib']

## Support Vector Classifier

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=10)
# Create a pipeline 
pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('svc', SVC(kernel='rbf'))  
])

# Train the pipeline
pipeline_svm.fit(X_train, y_train)

In [28]:
# Predictions
svm_predictions_rbf = pipeline_svm.predict(X_test)

# Evaluate
svm_accuracy = accuracy_score(y_test, svm_predictions_rbf)
print("SVM Accuracy:", svm_accuracy)

SVM Accuracy: 0.7290500459520428


In [29]:
f1 = f1_score(y_test, svm_predictions_rbf, average = 'weighted')
print("F1-score:", f1)

precision = precision_score(y_test, svm_predictions_rbf, average = 'weighted')
print("Precision:", precision)

F1-score: 0.7245141110695589
Precision: 0.7333179374990964


In [30]:
def error_types(y_true, y_pred):
    TP = 0  # True Positives
    TN = 0  # True Negatives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    for true_label, pred_label in zip(y_true, y_pred):
        if true_label == 1 and pred_label == 1:
            TP += 1
        elif true_label == 0 and pred_label == 0:
            TN += 1
        elif true_label == 0 and pred_label == 1:
            FP += 1
        elif true_label == 1 and pred_label == 0:
            FN += 1

    return TP, TN, FP, FN


TP, TN, FP, FN = error_types(y_test, svm_predictions_rbf)

print("True Positives:", TP)
print("True Negatives:", TN)
print("False Positives:", FP)
print("False Negatives:", FN)

True Positives: 1951
True Negatives: 3973
False Positives: 469
False Negatives: 993


In [33]:
dump(pipeline_svm, 'pipeline_svm.joblib')

['pipeline_svm.joblib']

## Random Forest Classifier

In [91]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=10)

pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('rf', RandomForestClassifier(n_estimators=300, n_jobs = -1, random_state = 20))  
])

# Train the pipeline
pipeline_rf.fit(X_train, y_train)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier


X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=10)

pipeline_rf1 = Pipeline([
    ('vect' , CountVectorizer()),
    ('tfidf', TfidfTransformer()), 
    ('rf', RandomForestClassifier(n_estimators=300, n_jobs = -1, random_state = 20))  
])

# Train the pipeline
pipeline_rf1.fit(X_train, y_train)

In [92]:
# Predictions
rf_predictions = pipeline_rf.predict(X_test)

# Evaluate
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)


Random Forest Accuracy: 0.6926226084050464


In [13]:
# Predictions
rf_predictions1 = pipeline_rf1.predict(X_test)

# Evaluate
rf_accuracy = accuracy_score(y_test, rf_predictions1)
print("Random Forest Accuracy:", rf_accuracy)


Random Forest Accuracy: 0.6926226084050464


In [36]:
   
f1 = f1_score(y_test, rf_predictions, average = 'weighted')
print("F1-score:", f1)

precision = precision_score(y_test, rf_predictions, average = 'weighted')
print("Precision:", precision)

F1-score: 0.6867337764136807
Precision: 0.6962545924645503


In [37]:
def error_types(y_true, y_pred):
    TP = 0  # True Positives
    TN = 0  # True Negatives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    for true_label, pred_label in zip(y_true, y_pred):
        if true_label == 1 and pred_label == 1:
            TP += 1
        elif true_label == 0 and pred_label == 0:
            TN += 1
        elif true_label == 0 and pred_label == 1:
            FP += 1
        elif true_label == 1 and pred_label == 0:
            FN += 1

    return TP, TN, FP, FN

# Assuming y_true and y_pred are lists or arrays of binary labels (0 or 1)
TP, TN, FP, FN = error_types(y_test, rf_predictions)

print("True Positives:", TP)
print("True Negatives:", TN)
print("False Positives:", FP)
print("False Negatives:", FN)

True Positives: 1773
True Negatives: 3883
False Positives: 536
False Negatives: 1107


In [38]:
dump(pipeline_rf, 'pipeline_rf_692121.joblib')

['pipeline_rf_692121.joblib']

## KNN Classifier

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=10)

pipeline_knn = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('knn', KNeighborsClassifier(n_neighbors=10))  
])

# Train the pipeline
pipeline_knn.fit(X_train, y_train)

In [40]:
# Predictions
knn_predictions = pipeline_knn.predict(X_test)

# Evaluate
knn_accuracy = accuracy_score(y_test, knn_predictions)
print("K-Nearest Neighbors Accuracy:", knn_accuracy)

K-Nearest Neighbors Accuracy: 0.4324504971175537


In [41]:
f1 = f1_score(y_test, knn_predictions, average = 'weighted')
print("F1-score:", f1)

precision = precision_score(y_test, knn_predictions, average = 'weighted')
print("Precision:", precision)

F1-score: 0.36098946047411057
Precision: 0.6635339882453195


In [42]:
def error_types(y_true, y_pred):
    TP = 0  # True Positives
    TN = 0  # True Negatives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    for true_label, pred_label in zip(y_true, y_pred):
        if true_label == 1 and pred_label == 1:
            TP += 1
        elif true_label == 0 and pred_label == 0:
            TN += 1
        elif true_label == 0 and pred_label == 1:
            FP += 1
        elif true_label == 1 and pred_label == 0:
            FN += 1

    return TP, TN, FP, FN

# Assuming y_true and y_pred are lists or arrays of binary labels (0 or 1)
TP, TN, FP, FN = error_types(y_test, knn_predictions)

print("True Positives:", TP)
print("True Negatives:", TN)
print("False Positives:", FP)
print("False Negatives:", FN)

True Positives: 1300
True Negatives: 324
False Positives: 105
False Negatives: 71


In [43]:
dump(pipeline_knn, 'pipeline_knn.joblib')

['pipeline_knn.joblib']

## Decision Tree Classifier

In [44]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=10)

pipeline_dt = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('dt', DecisionTreeClassifier())  
])

# Train the pipeline
pipeline_dt.fit(X_train, y_train)

In [45]:
# Predictions
dt_predictions = pipeline_dt.predict(X_test)

# Evaluate
dt_accuracy = accuracy_score(y_test, dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)


Decision Tree Accuracy: 0.6442476397359846


In [46]:

f1 = f1_score(y_test, dt_predictions, average = 'weighted')
print("F1-score:", f1)

precision = precision_score(y_test, dt_predictions, average = 'weighted')
print("Precision:", precision)

F1-score: 0.6412274118606193
Precision: 0.6426870888377597


In [47]:
def error_types(y_true, y_pred):
    TP = 0  # True Positives
    TN = 0  # True Negatives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    for true_label, pred_label in zip(y_true, y_pred):
        if true_label == 1 and pred_label == 1:
            TP += 1
        elif true_label == 0 and pred_label == 0:
            TN += 1
        elif true_label == 0 and pred_label == 1:
            FP += 1
        elif true_label == 1 and pred_label == 0:
            FN += 1

    return TP, TN, FP, FN

# Assuming y_true and y_pred are lists or arrays of binary labels (0 or 1)
TP, TN, FP, FN = error_types(y_test, dt_predictions)

print("True Positives:", TP)
print("True Negatives:", TN)
print("False Positives:", FP)
print("False Negatives:", FN)

True Positives: 1805
True Negatives: 3218
False Positives: 846
False Negatives: 1005


In [48]:
dump(pipeline_dt, 'pipeline_dt.joblib')

['pipeline_dt.joblib']

## <b><i> NN MLPClassifier

In [49]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=10)
# Create a pipeline with preprocessing and MLPClassifier
pipeline_mlp = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('mlp', MLPClassifier(hidden_layer_sizes=(100,50), max_iter=1000))  
])

# Train the pipeline
pipeline_mlp.fit(X_train, y_train)

In [50]:
# Predictions
mlp_predictions = pipeline_mlp.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, mlp_predictions)
print("Accuracy:", accuracy)

Accuracy: 0.6910351742000167


In [51]:
from sklearn.metrics import f1_score

f1 = f1_score(y_test, mlp_predictions, average = 'weighted')
print("F1-score:", f1)

from sklearn.metrics import precision_score

precision = precision_score(y_test, mlp_predictions, average = 'weighted')
print("Precision:", precision)

F1-score: 0.6904852439394815
Precision: 0.6904151688148684


In [52]:
def error_types(y_true, y_pred):
    TP = 0  # True Positives
    TN = 0  # True Negatives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    for true_label, pred_label in zip(y_true, y_pred):
        if true_label == 1 and pred_label == 1:
            TP += 1
        elif true_label == 0 and pred_label == 0:
            TN += 1
        elif true_label == 0 and pred_label == 1:
            FP += 1
        elif true_label == 1 and pred_label == 0:
            FN += 1

    return TP, TN, FP, FN


TP, TN, FP, FN = error_types(y_test, mlp_predictions)

print("True Positives:", TP)
print("True Negatives:", TN)
print("False Positives:", FP)
print("False Negatives:", FN)

True Positives: 2182
True Negatives: 3301
False Positives: 978
False Negatives: 776


In [53]:
dump(pipeline_mlp, 'pipeline_mlp.joblib')

['pipeline_mlp.joblib']

## <b><i> Gradient Boosting Classifier (n_estimators=100, learning_rate=0.2, max_depth=10)

In [44]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=10)
# Create a pipeline 
pipeline_gbc = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('gbc', GradientBoostingClassifier(n_estimators=500, learning_rate=0.2, max_depth=5))  
])

# Train the pipeline
pipeline_gbc.fit(X_train, y_train)

In [45]:
y_pred_gbc = pipeline_gbc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_gbc)
print("Accuracy:", accuracy)  

Accuracy: 0.7129250563956888


In [6]:
f1 = f1_score(y_test, y_pred_gbc, average = 'weighted')
print("F1-score:", f1)

precision = precision_score(y_test, y_pred_gbc, average = 'weighted')
print("Precision:", precision)

F1-score: 0.6921023490278615
Precision: 0.7162438712388646


In [41]:
from joblib import dump

dump(pipeline_gbc, 'pipeline_gbc.joblib')

['pipeline_gbc.joblib']

## <b><i> Ada Boost Classifier (n_estimators=100, learning_rate=0.2, random_state = 0)

In [23]:
from sklearn.ensemble import AdaBoostClassifier

X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=10)
pipeline_abc = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('abc', AdaBoostClassifier(n_estimators=100, learning_rate=0.2, random_state = 10, algorithm = 'SAMME'))  
])

# Train the pipeline
pipeline_abc.fit(X_train, y_train)

In [35]:
from sklearn.ensemble import AdaBoostClassifier

pipeline_abc = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('abc', AdaBoostClassifier(n_estimators=1000, learning_rate=0.5, random_state = 0))  
])

# Train the pipeline
pipeline_abc.fit(X_train, y_train)



In [46]:
y_pred_abc = pipeline_abc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_abc)
print("Accuracy:", accuracy) 

Accuracy: 0.6919542150555602


In [47]:
f1 = f1_score(y_test, y_pred_abc, average = 'weighted')
print("F1-score:", f1)

precision = precision_score(y_test, y_pred_abc, average = 'weighted')
print("Precision:", precision)

F1-score: 0.6852997867352885
Precision: 0.708629276535734


In [48]:
from joblib import dump

dump(pipeline_abc, 'pipeline_abc.joblib')

['pipeline_abc.joblib']