In [3]:
from sklearn.preprocessing import StandardScaler
from src.components.preprocessor import TextCleaner, TextVectorizer

In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_parquet("../artifacts/data.parquet")

In [6]:
from sklearn.pipeline import Pipeline

In [7]:
df = df.copy()

In [8]:
from sklearn.preprocessing import MinMaxScaler

In [9]:
pipeline = Pipeline([
    ('cleaner', TextCleaner()),
    ('vectorizer', TextVectorizer()),
    ('scaler', StandardScaler())
])

In [10]:
X_transformed = pipeline.fit_transform(df['text'])

In [11]:
X_transformed.shape

(3373, 300)

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, df.classes.values, stratify=df.classes.values, random_state=42, test_size=0.2)

In [15]:
rf = RandomForestClassifier()

In [16]:
rf.fit(X_train, y_train)

In [17]:
rf.score(X_transformed, df.classes.values)

0.9973317521494219

In [18]:
rf.score(X_test, y_test)

0.9866666666666667

In [51]:
np.c_[X_train, np.array(y_train)][0, -1]

1.0

In [52]:
from sklearn.neighbors import KNeighborsClassifier

In [53]:
knn = KNeighborsClassifier()

In [54]:
knn.fit(X_train, y_train)

In [55]:
knn.score(X_train, y_train)

0.9655300222386953

In [56]:
knn.score(X_test, y_test)

0.9466666666666667

In [57]:
from sklearn.naive_bayes import GaussianNB

In [58]:
naive = GaussianNB()

In [59]:
naive.fit(X_train, y_train)

In [60]:
naive.score(X_train, y_train)

0.9243884358784284

In [61]:
naive.score(X_test, y_test)

0.9318518518518518

In [27]:
train, test = train_test_split(df, random_state=42, stratify=df.classes.values)

In [29]:
sample = train['text']

In [65]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

In [63]:
y_train_pred = rf.predict(X_train)

In [64]:
accuracy_score(y_train, y_train_pred)

1.0

In [68]:
models = {
    "Random Forest Classifier": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "K Nearest Neigbors": KNeighborsClassifier()
}

In [78]:
list(models.keys())[2]

'K Nearest Neigbors'

In [80]:
report = {}
for i in range(len(list(models))):
    model = list(models.values())[i]

    model.fit(X_train, y_train)  # Training Model

    y_train_pred = model.predict(X_train)

    y_test_pred = model.predict(X_test)

    train_model_score = {
        "Accuracy": accuracy_score(y_train, y_train_pred),
        "Precision": precision_score(y_train, y_train_pred, average='weighted'),
        "Recall": recall_score(y_train, y_train_pred, average='weighted'),
        "f1 score": f1_score(y_train, y_train_pred, average='weighted')
    }
    test_model_score = {
        "Accuracy": accuracy_score(y_test, y_test_pred),
        "Precision": precision_score(y_test, y_test_pred, average='weighted'),
        "Recall": recall_score(y_test, y_test_pred, average='weighted'),
        "f1 score": f1_score(y_test, y_test_pred, average='weighted')
    }

    report[list(models.keys())[i]] = test_model_score

In [84]:
list(report.keys())

['Random Forest Classifier', 'Naive Bayes', 'K Nearest Neigbors']

In [2]:
from joblib import load

In [7]:
model = load("../artifacts/model.pkl")
preprocessor = load("../artifacts/preprocessor.pkl")

In [8]:
def predict(text):
    data_scaled = preprocessor.transform(text)
    return model.predict(data_scaled)

In [11]:
from src.pipeline.predict_pipeline import CustomData

In [16]:
text = "Hweyhwehjhwekjnwkjnckjwnnwlekdlwkemdkjbaejhbajn.chbajcnqhfijqe .jvchiejclmPdi   /kljdf"

obj = CustomData(text)

In [18]:
input = obj.get_data_as_frame()

In [22]:
preprocessor.transform(text)

array([[ 0.72768617,  0.8712145 ,  0.26079056, ...,  0.28591523,
         0.26912665,  0.44834787],
       [-0.08486611,  1.3357975 ,  0.6271399 , ...,  0.00416833,
         0.172692  ,  0.11682823],
       [ 2.276399  ,  1.6059527 ,  2.7206998 , ...,  1.1819034 ,
         1.2653033 , -0.1728774 ],
       ...,
       [ 1.01327   ,  0.975237  ,  1.4248619 , ...,  0.01165509,
         0.4015214 , -0.02147684],
       [ 0.97345257,  0.63209116,  1.4523792 , ...,  0.3910995 ,
         0.18269914,  0.79036784],
       [ 0.5328245 ,  1.2161474 ,  1.1112598 , ...,  0.82107604,
         0.6818662 ,  0.76144683]], dtype=float32)

In [2]:
from src.components.preprocessor import TextCleaner, TextVectorizer
from sklearn.preprocessing import MinMaxScaler

In [3]:
import pandas as pd
import numpy as np
from joblib import load

In [4]:
train = pd.read_pickle("../artifacts/train.pkl")

In [5]:
X = train.text

In [6]:
y = train.classes

In [7]:
cleaner = TextCleaner()

In [8]:
cleaned_text = cleaner.fit_transform(X)

In [9]:
cleaned_text

["where's my money why is it not back in my account firstly i am female not a man stop referring me to sir or he it is rude and disrespectful secondly i'm not the only person here who is getting ripped off other people are too thirdly your chat support is unavailable fourthly middleman again isn't real someone logs in pretends to be a safe secure transactor or uses the account to respond to posts fifthly they close and move all of my threads because they don't want to lose anymore opportunities of taking other people's money they already took 206 from me i told them that i am a vulnerable person and i was taken advantage of and they don't even care they just keep calling me a man and making up excuses for not giving me back my money sixth middleman supposedly told me my money would be back in my bank account within a day or so it's been 3 days and i still don't have my money back my account balance is literally 0 i even have screenshots of my bank account showing i have 0 balance one o

In [10]:
preprocessor = load("../artifacts/preprocessor.pkl")
model = load("../artifacts/model.pkl")

In [11]:
X_scaled = preprocessor.transform(X)

In [12]:
X_scaled.shape

(2698, 300)

In [13]:
model.score(X_scaled, y)

1.0

In [14]:
test = pd.read_pickle("../artifacts/test.pkl")

In [15]:
X_test = test.text
y_test = test.classes

In [16]:
X_test_scaled = preprocessor.transform(X_test)

In [17]:
model.score(X_test_scaled, y_test)

0.9851851851851852

In [18]:
model.predict(X_test_scaled)

array([0., 1., 1., 2., 0., 0., 0., 2., 1., 0., 2., 1., 0., 0., 2., 0., 1.,
       1., 0., 1., 1., 2., 2., 1., 2., 1., 1., 1., 0., 2., 0., 1., 0., 2.,
       0., 0., 1., 1., 0., 1., 0., 0., 2., 1., 0., 0., 2., 2., 0., 0., 2.,
       1., 0., 1., 0., 1., 2., 2., 0., 1., 1., 0., 1., 1., 2., 1., 0., 2.,
       1., 2., 1., 2., 1., 1., 2., 0., 1., 1., 1., 1., 1., 2., 2., 1., 1.,
       1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 2., 2., 2., 1., 2.,
       2., 1., 0., 2., 0., 2., 0., 2., 2., 1., 1., 0., 0., 0., 1., 1., 0.,
       2., 1., 0., 2., 0., 1., 0., 2., 2., 1., 1., 2., 0., 1., 1., 2., 1.,
       1., 2., 2., 1., 1., 2., 0., 1., 2., 1., 1., 2., 1., 0., 1., 0., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 2., 0., 2., 1., 2., 1.,
       1., 2., 2., 0., 1., 2., 0., 1., 0., 2., 1., 1., 2., 0., 1., 0., 1.,
       1., 2., 1., 1., 1., 0., 1., 1., 1., 2., 0., 1., 0., 1., 1., 1., 2.,
       2., 1., 2., 1., 2., 1., 0., 1., 1., 0., 1., 1., 1., 1., 2., 2., 1.,
       1., 0., 1., 0., 1.

In [19]:
X_test[1345]

'Dispute over incomplete service delivery. "User: The service was not completed as promised. Service: Our records indicate the service was fully delivered. User: I did not receive all the components of the service. Service: We have documentation proving the service completion."'

In [26]:
model.predict_proba([X_test_scaled[2]])[0]

array([0.07, 0.93, 0.  ])

In [29]:
model.predict([X_test_scaled[2]])[0].item()

1.0

In [40]:
def predict(text):
    pred = model.predict(text)[0]
    prob = model.predict_proba(text)[0]
    return pred, prob

In [41]:
result, proba = predict([X_test_scaled[2]])

In [42]:
result

1.0

In [43]:
proba[0]

0.07

In [25]:
model.classes_

array([0., 1., 2.])