In [2]:
import numpy as np
import pandas as pd

In [3]:
data = pd.read_csv("/kaggle/input/preprocessed-textual-data/preprocessed_text_data.csv")

In [4]:
df = data

In [5]:
df.head()

Unnamed: 0,text,label,Original Text,word_count
0,oh gosh,Anxiety,oh my gosh,3
1,trouble sleep confuse mind restless heart tune,Anxiety,"trouble sleeping, confused mind, restless hear...",10
2,wrong back dear forward doubt stay restless re...,Anxiety,"All wrong, back off dear, forward doubt. Stay ...",14
3,ive shift focus something else im still worry,Anxiety,I've shifted my focus to something else but I'...,11
4,im restless restless month boy mean,Anxiety,"I'm restless and restless, it's been a month n...",14


## Baseline model

**RandomForestClassifier**

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [20]:
def evaluate_vectorizers(texts, labels):
    vectorizers = {
        "TFIDF-bigram": TfidfVectorizer(ngram_range=(1, 2)),
        "TFIDF-trigram" : TfidfVectorizer(ngram_range=(1,3))
    }
    best_accuracy = 0
    best_vectorizer_name = None
    best_vectorizer = None

    print("Checking different vectorizations with Random Forest:\n")
    for name, vectorizer in vectorizers.items():
        X = vectorizer.fit_transform(texts)
        X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        print(f"{name}: Accuracy = {acc:.4f}")

        if acc > best_accuracy:
            best_accuracy = acc
            best_vectorizer_name = name
            best_vectorizer = vectorizer

    print(f"\nBest Vectorizer: {best_vectorizer_name} with Accuracy = {best_accuracy:.4f}")
    return best_vectorizer_name, best_vectorizer


In [15]:
best_vec_name, best_vec = evaluate_vectorizers(df['text'], df['label'])

Checking different vectorizations with Random Forest:

TFIDF-bigram: Accuracy = 0.6255

Best Vectorizer: TFIDF-bigram with Accuracy = 0.6255


#### Baseline model performs well with bigram. Therefore Tfidf-bi_gram is choosen for further modelling.

In [14]:
df.isnull().sum()

text             0
label            0
Original Text    0
word_count       0
dtype: int64

In [11]:
df.dropna(inplace  = True)

In [12]:
df.reset_index(inplace = True
              )

In [13]:
df.drop("index",axis = 1,inplace =True)

In [17]:
import pickle

In [21]:
## Save the best vectorizer

with open("vectorizer.pkl","wb")as file:
    pickle.dump(best_vec,file)

In [22]:
X = df["text"]

In [25]:
X = best_vec.transform(df['text'])

In [29]:
with open("vectorized_data.pkl","wb")as file:
    pickle.dump(X,file)