In [3]:
from sklearn.preprocessing import StandardScaler
from src.components.preprocessor import TextCleaner, TextVectorizer

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the dataset from the parquet artifact (path is relative to the notebook's
# working directory). Later cells read its `text` and `classes` columns.
df = pd.read_parquet("../artifacts/data.parquet")

In [6]:
from sklearn.pipeline import Pipeline

In [7]:
# Rebind `df` to an independent copy of the loaded frame
# (deep=True is pandas' default, spelled out here for clarity).
df = df.copy(deep=True)

In [8]:
from sklearn.preprocessing import MinMaxScaler

In [9]:
# Feature pipeline: the project's TextCleaner and TextVectorizer steps,
# followed by scikit-learn's StandardScaler on the vectorizer output.
pipeline = Pipeline(
    steps=[
        ("cleaner", TextCleaner()),
        ("vectorizer", TextVectorizer()),
        ("scaler", StandardScaler()),
    ]
)

In [10]:
# Fit the pipeline on the `text` column and transform it in one pass.
# NOTE(review): this fits on *all* rows, and the train/test split is only made
# afterwards on the transformed matrix, so the fitted steps (at least the
# StandardScaler statistics) have seen the held-out rows. Consider splitting
# first and calling fit_transform on the training texts only.
X_transformed = pipeline.fit_transform(df["text"])

In [11]:
# Sanity check: (n_rows, n_features) of the transformed feature matrix.
X_transformed.shape

(3373, 300)

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Stratified 80/20 hold-out split of the transformed features, using the
# `classes` column as both the target and the stratification key.
# Seeded so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    df["classes"].values,
    test_size=0.2,
    stratify=df["classes"].values,
    random_state=42,
)

In [15]:
# Random forest with default hyperparameters. The seed makes the fitted forest
# (and therefore the scores below) reproducible after Restart & Run All;
# 42 matches the seed already used for the data split.
rf = RandomForestClassifier(random_state=42)

In [16]:
# Train the random forest on the training split only.
rf.fit(X_train, y_train)

In [17]:
# Mean accuracy of the forest over the *entire* dataset.
# NOTE(review): X_transformed contains both the rows rf was trained on and the
# held-out rows, so this is neither a train nor a test score. The KNN and
# Naive Bayes cells score on (X_train, y_train) instead; use that here too if
# a like-for-like train accuracy is wanted.
rf.score(X_transformed, df.classes.values)

0.9973317521494219

In [18]:
# Mean accuracy of the forest on the held-out split.
rf.score(X_test, y_test)

0.9866666666666667

In [51]:
# Scratch check: append the labels to the training features as one extra
# trailing column, then read the last entry of row 0 (the first training label,
# cast to the dtype of the combined array).
np.column_stack((X_train, np.array(y_train)))[0, -1]

1.0

In [52]:
from sklearn.neighbors import KNeighborsClassifier

In [53]:
# k-nearest-neighbours baseline with scikit-learn's default hyperparameters.
knn = KNeighborsClassifier()

In [54]:
# Fit KNN on the training split.
knn.fit(X_train, y_train)

In [55]:
# Mean accuracy on the training split (compare with the test score for over-fitting).
knn.score(X_train, y_train)

0.9655300222386953

In [56]:
# Mean accuracy on the held-out split.
knn.score(X_test, y_test)

0.9466666666666667

In [57]:
from sklearn.naive_bayes import GaussianNB

In [58]:
# Gaussian Naive Bayes baseline with scikit-learn's default hyperparameters.
naive = GaussianNB()

In [59]:
# Fit Naive Bayes on the training split.
naive.fit(X_train, y_train)

In [60]:
# Mean accuracy on the training split.
naive.score(X_train, y_train)

0.9243884358784284

In [61]:
# Mean accuracy on the held-out split.
naive.score(X_test, y_test)

0.9318518518518518

In [27]:
# Stratified split of the raw DataFrame (rather than the transformed matrix).
# No test_size is passed, so scikit-learn's default (0.25) applies here,
# which differs from the 0.2 used for the array-level split.
train, test = train_test_split(
    df,
    stratify=df["classes"].values,
    random_state=42,
)

In [29]:
# Text column of the DataFrame-level training split.
# NOTE(review): `sample` is not used by any later cell visible in this notebook.
sample = train['text']

In [65]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

In [63]:
# Random-forest predictions on the rows it was trained on.
y_train_pred = rf.predict(X_train)

In [64]:
# Training accuracy of the forest; compare against the held-out score before
# reading anything into it.
accuracy_score(y_train, y_train_pred)

1.0

In [68]:
# Candidate classifiers to compare, keyed by display name (the keys become the
# keys of the evaluation report). The random forest is seeded so its metrics
# are reproducible across runs; 42 matches the seed used for the data split.
models = {
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "K Nearest Neigbors": KNeighborsClassifier()
}

In [78]:
# Name of the third registered model (dicts iterate over keys in insertion order).
list(models)[2]

'K Nearest Neigbors'

In [80]:
def _weighted_scores(y_true, y_pred):
    """Return accuracy plus weighted-average precision, recall and F1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Precision"``, ``"Recall"`` and ``"f1 score"``.
    """
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average='weighted'),
        "Recall": recall_score(y_true, y_pred, average='weighted'),
        "f1 score": f1_score(y_true, y_pred, average='weighted')
    }


# Fit every candidate on the training split and collect its metrics.
# `report` maps model name -> held-out (test) metrics.
# `train_report` maps model name -> training-split metrics, so the gap between
# the two (over-fitting) can be inspected.
report = {}
train_report = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Training Model

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_model_score = _weighted_scores(y_train, y_train_pred)
    test_model_score = _weighted_scores(y_test, y_test_pred)

    train_report[model_name] = train_model_score
    report[model_name] = test_model_score

In [84]:
# Names of the models that have an entry in the evaluation report.
list(report)

['Random Forest Classifier', 'Naive Bayes', 'K Nearest Neigbors']

In [85]:
from joblib import load

In [86]:
# Load a previously persisted model from the artifacts directory.
# NOTE(review): this rebinds `model`, which the evaluation loop also uses as
# its loop variable, so after this cell `model` is the loaded artifact rather
# than the last classifier fitted in the loop.
# joblib.load unpickles the file and can execute arbitrary code; only load
# artifacts produced by a trusted source.
model = load("../artifacts/model.pkl")