## `Bag Of Words`

##### `Preprocessing`

In [47]:

import pandas as pd

In [48]:
train_df=pd.read_csv('train_data.csv')
test_df=pd.read_csv('test_data.csv')


In [49]:
### Check Nulls after split 
print('Num of Nulls after splitting for Train Data :', train_df.isna().sum())

print('Num of Nulls after splitting for Test Data :', test_df.isna().sum())


Num of Nulls after splitting for Train Data : headline    3
category    0
dtype: int64
Num of Nulls after splitting for Test Data : headline    0
category    0
dtype: int64


In [50]:
train_df.dropna(inplace=True)

In [51]:
train_df.isna().sum()


headline    0
category    0
dtype: int64

* `With Stemming`

In [43]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Initialize the stemmer
stemmer = PorterStemmer()

stop_words = set(stopwords.words('english'))

def clean_text_for_bow(text):

    text = text.lower()

    tokens = word_tokenize(text)

    tokens = [word for word in tokens if word not in stop_words]
    
    tokens = [stemmer.stem(word) for word in tokens]
    return tokens

# Apply stem function on df[headline]
train_df['headline'] = train_df['headline'].apply(clean_text_for_bow)

test_df['headline'] = test_df['headline'].apply(clean_text_for_bow)


In [44]:
train_df['headline'].shape  , test_df['headline'].shape

((66314,), (16580,))

In [45]:
train_df['headline']

0                       [fearless, tweet, finalist, photo]
1                                           [hunger, hurt]
2        [perfect, tweet, bachelor, paradis, season, fi...
3                               [nuh, linga, get, jamaica]
4        [airplan, boneyard, look, even, cooler, instag...
                               ...                        
66312    [osteochondr, ankl, surgeri, shinsoo, choo, to...
66313                              [america, damag, brand]
66314                     [grief, loss, tip, help, affect]
66315    [beyonc, taylor, swift, celebr, send, prayer, ...
66316                    [hillari, presid, eleph, roombil]
Name: headline, Length: 66314, dtype: object

* `With Lemmatization`

In [52]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

def clean_text_for_bow(text):

    text = text.lower()

    tokens = word_tokenize(text)
    
    tokens = [word for word in tokens if word not in stop_words]

    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

# Apply lemmatizer function on df[headline]
train_df['headline'] = train_df['headline'].apply(clean_text_for_bow)

test_df['headline'] = test_df['headline'].apply(clean_text_for_bow)


In [53]:
train_df['headline']

0                       [fearless, tweet, finalist, photo]
1                                           [hunger, hurt]
2        [perfect, tweet, bachelor, paradise, season, f...
3                               [nuh, linga, get, jamaica]
4        [airplane, boneyards, look, even, cooler, inst...
                               ...                        
66312    [osteochondral, ankle, surgery, shinsoo, choos...
66313                            [america, damaged, brand]
66314                   [grief, loss, tip, help, affected]
66315    [beyonce, taylor, swift, celebrity, send, pray...
66316             [hillary, president, elephant, roombill]
Name: headline, Length: 66314, dtype: object

* `BOW Using CountVectorizer`

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(analyzer=lambda x: x,max_features=1000)  # pass the tokenized list directly

## Apply Transformer
X_train_bow = bow_vectorizer.fit_transform(train_df['headline'])
X_test_bow = bow_vectorizer.transform(test_df['headline'])

In [55]:
X_train_bow.shape , X_test_bow.shape


((66314, 1000), (16580, 1000))

* `BOW using TfidfVectorizer `

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(analyzer=lambda x: x, max_features=1000) 
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['headline'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['headline'])


In [58]:

train_tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
test_tfidf_df = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [59]:
train_tfidf_df

Unnamed: 0,abuse,according,accused,act,action,activist,actor,actress,actually,ad,...,yearold,yes,yet,yoga,york,youll,young,youre,youth,youve
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## `Implement Models` 

In [60]:
from sklearn.svm import SVC , LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
import joblib

##### `Grid Search`

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score

## For SVM Model
svm_model = SVC()
param_grid_svm = {'C': [0.1, 1], 'max_iter': [1000], 'kernel': ['linear', 'poly']}
grid_search_svm = GridSearchCV(svm_model, param_grid_svm, cv=3)

grid_search_svm.fit(X_train_tfidf, train_df['category'])

print(f"Best SVM Params: {grid_search_svm.best_params_}")

best_svm = grid_search_svm.best_estimator_

* `With Stemming`

###### `1- Using Tfidf ` 

In [17]:

def create_pipeline(model):
    return Pipeline([
        ('classifier', model)
    ])

# Models
models = {
    'SVM': SVC(kernel='linear',C=1),
    'Naive Bayes': MultinomialNB(alpha=.5),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(80,40), max_iter=1000,activation='relu',solver='adam',early_stopping=True)
}

output_dir = "BOW_Stem_Tfidf_Models"

for model_name, model in models.items():
    print(f"\nTraining {model_name} with Stemming...")
    
    # Create the pipeline
    model_pipeline = create_pipeline(model)
    
    # Train the model
    model_pipeline.fit(X_train_tfidf, train_df['category'])
    
    # Make predictions on the train set
    y_pred_train = model_pipeline.predict(X_train_tfidf)
    
    # Make predictions on the test set
    y_pred_test = model_pipeline.predict(X_test_tfidf)
    
    # Train Accuracy 
    print(f"Train Accuracy for {model_name}:")
    print(accuracy_score(train_df['category'], y_pred_train))
    
    # Test Accuracy
    print(f"Test Accuracy for {model_name}:")
    print(accuracy_score(test_df['category'], y_pred_test))

    # Percision Score
    print(f"Percision for {model_name}:")
    print(precision_score(test_df['category'], y_pred_test,average='weighted'))
    
    # Recall Score 
    print(f"Recall for {model_name}:")
    print(recall_score(test_df['category'], y_pred_test,average='weighted'))
    
    # Save Model
    model_file_path = f"{output_dir}{model_name}_bow_stem_model.pkl"
    joblib.dump(model_pipeline, model_file_path)
    


Training SVM with Stemming...
Train Accuracy for SVM:
0.8405314111650631
Test Accuracy for SVM:
0.7174909529553679
Percision for SVM:
0.7234980040657505
Recall for SVM:
0.7174909529553679

Training Naive Bayes with Stemming...
Train Accuracy for Naive Bayes:
0.7924118587326959
Test Accuracy for Naive Bayes:
0.6978890229191798
Percision for Naive Bayes:
0.7051746740694199
Recall for Naive Bayes:
0.6978890229191798

Training Neural Network with Stemming...
Train Accuracy for Neural Network:
0.83075971891305
Test Accuracy for Neural Network:
0.719059107358263
Percision for Neural Network:
0.7212129570489031
Recall for Neural Network:
0.719059107358263


###### `2- Using Countvectorizer`

In [19]:

def create_pipeline(model):
    return Pipeline([
        ('classifier', model)
    ])

# Models
models = {
    'SVM': SVC(kernel='linear',C=1.5),
    'Naive Bayes': MultinomialNB(alpha=.5),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(70,35), max_iter=1000,activation='relu',solver='adam',early_stopping=True)
}

output_dir = "BOW_Stem_Countvectorizer_Models"

for model_name, model in models.items():
    print(f"\nTraining {model_name} with Stemming...")
    
    
    model_pipeline = create_pipeline(model)
    
    # Train the model
    model_pipeline.fit(X_train_bow, train_df['category'])
    
    # Make predictions on the train set
    y_pred_train = model_pipeline.predict(X_train_bow)
    
    # Make predictions on the test set
    y_pred_test = model_pipeline.predict(X_test_bow)
    
    # Train Accuracy 
    print(f"Train Accuracy for {model_name}:")
    print(accuracy_score(train_df['category'], y_pred_train))
    
    # Test Accuracy
    print(f"Test Accuracy for {model_name}:")
    print(accuracy_score(test_df['category'], y_pred_test))

    # Percision Score
    print(f"Percision for {model_name}:")
    print(precision_score(test_df['category'], y_pred_test,average='weighted'))
    
    # Recall Score 
    print(f"Recall for {model_name}:")
    print(recall_score(test_df['category'], y_pred_test,average='weighted'))
    
    # Save Model
    model_file_path = f"{output_dir}{model_name}_stem_model.pkl"
    joblib.dump(model_pipeline, model_file_path)
    


Training SVM with Stemming...
Train Accuracy for SVM:
0.9306933679162771
Test Accuracy for SVM:
0.6869722557297949
Percision for SVM:
0.6918293227894592
Recall for SVM:
0.6869722557297949

Training Naive Bayes with Stemming...
Train Accuracy for Naive Bayes:
0.7985040866182104
Test Accuracy for Naive Bayes:
0.7118214716525935
Percision for Naive Bayes:
0.7055087794895764
Recall for Naive Bayes:
0.7118214716525935

Training Neural Network with Stemming...
Train Accuracy for Neural Network:
0.8425068612962572
Test Accuracy for Neural Network:
0.7197828709288299
Percision for Neural Network:
0.7192039806615848
Recall for Neural Network:
0.7197828709288299


* `With Lemmatization & linear SVC`

###### `1- Using Countvectorizer`

In [50]:

def create_pipeline(model):
    return Pipeline([
        ('classifier', model)
    ])

# Models
models = {
    'Linear SVM': LinearSVC(C=.05),
    'Naive Bayes': MultinomialNB(alpha=.3),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(80,40), max_iter=1000,activation='relu',solver='adam',early_stopping=True)
}

output_dir = "BOW_Lemmatize_Countvectorizer_Models"


for model_name, model in models.items():
    print(f"\nTraining {model_name} with Lemmatization...")
    
    
    model_pipeline = create_pipeline(model)
    
    # Train the model
    model_pipeline.fit(X_train_bow, train_df['category'])
    
    # Make predictions on the train set
    y_pred_train = model_pipeline.predict(X_train_bow)
    
    # Make predictions on the test set
    y_pred_test = model_pipeline.predict(X_test_bow)
    
    # Train Accuracy 
    print(f"Train Accuracy for {model_name}:")
    print(accuracy_score(train_df['category'], y_pred_train))
    
    # Test Accuracy
    print(f"Test Accuracy for {model_name}:")
    print(accuracy_score(test_df['category'], y_pred_test))

    # Percision Score
    print(f"Percision for {model_name}:")
    print(precision_score(test_df['category'], y_pred_test,average='weighted'))
    
    # Recall Score 
    print(f"Recall for {model_name}:")
    print(recall_score(test_df['category'], y_pred_test,average='weighted'))
    
    # Save Model
    model_file_path = f"{output_dir}{model_name}_lemmatize_model.pkl"
    joblib.dump(model_pipeline, model_file_path)
    


Training Linear SVM with Lemmatization...
Train Accuracy for Linear SVM:
0.8356455650390566
Test Accuracy for Linear SVM:
0.7265983112183353
Percision for Linear SVM:
0.7237217220855323
Recall for Linear SVM:
0.7265983112183353

Training Naive Bayes with Lemmatization...
Train Accuracy for Naive Bayes:
0.8143981662997255
Test Accuracy for Naive Bayes:
0.7074185765983112
Percision for Naive Bayes:
0.7006290362471244
Recall for Naive Bayes:
0.7074185765983112

Training Neural Network with Lemmatization...
Train Accuracy for Neural Network:
0.8666043369424254
Test Accuracy for Neural Network:
0.7215922798552473
Percision for Neural Network:
0.7212738554769985
Recall for Neural Network:
0.7215922798552473


###### `2- Using tfidf`

In [67]:

def create_pipeline(model):
    return Pipeline([
        ('classifier', model)
    ])

# Models
models = {
    'Linear SVM': LinearSVC( C=.1,random_state=45),
    'Naive Bayes': MultinomialNB(alpha=.45),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(64,32), max_iter=1000,activation='relu',solver='adam',early_stopping=True)
}

output_dir = "BOW_Lemmatize_Tfidf_Models"

for model_name, model in models.items():
    print(f"\nTraining {model_name} with BoW features...")
    

    model_pipeline = create_pipeline(model)
    
    # Train the model
    model_pipeline.fit(X_train_tfidf, train_df['category'])
    
    # Make predictions on the train set
    y_pred_train = model_pipeline.predict(X_train_tfidf)
    
    # Make predictions on the test set
    y_pred_test = model_pipeline.predict(X_test_tfidf)
    
    # Train Accuracy 
    print(f"Train Accuracy for {model_name}:")
    print(accuracy_score(train_df['category'], y_pred_train))
    
    # Test Accuracy
    print(f"Test Accuracy for {model_name}:")
    print(accuracy_score(test_df['category'], y_pred_test))

    # Percision Score
    print(f"Percision for {model_name}:")
    print(precision_score(test_df['category'], y_pred_test,average='weighted'))
    
    # Recall Score 
    print(f"Recall for {model_name}:")
    print(recall_score(test_df['category'], y_pred_test,average='weighted'))
    
    # Save Model
    model_file_path = f"{output_dir}{model_name}_bow_model.pkl"
    joblib.dump(model_pipeline, model_file_path)
    


Training Linear SVM with BoW features...
Train Accuracy for Linear SVM:
0.8110504569170914
Test Accuracy for Linear SVM:
0.7237032569360675
Percision for Linear SVM:
0.7194125537869339
Recall for Linear SVM:
0.7237032569360675

Training Naive Bayes with BoW features...
Train Accuracy for Naive Bayes:
0.8054407817353801
Test Accuracy for Naive Bayes:
0.6966224366706876
Percision for Naive Bayes:
0.7009499847600421
Recall for Naive Bayes:
0.6966224366706876

Training Neural Network with BoW features...
Train Accuracy for Neural Network:
0.8302168471212715
Test Accuracy for Neural Network:
0.7155609167671894
Percision for Neural Network:
0.7184208530537506
Recall for Neural Network:
0.7155609167671894


#### `HMM`

In [12]:

label_mapping = {
    'BUSINESS': 0,
    'ENTERTAINMENT': 1,
    'FOOD & DRINK': 2,
    'OTHER': 3,
    'POLITICS': 4,
    'QUEER VOICES': 5,
    'SPORTS': 6,
    'STYLE & BEAUTY': 7,
    'TRAVEL': 8,
    'WELLNESS': 9
}


class_name = {value: key for key, value in label_mapping.items()}


In [13]:
# Convert sparse matrix to dense
X_train_dense = X_train_bow.toarray()
X_test_dense = X_test_bow.toarray()


In [14]:
print(X_train_dense.shape)

(66314, 1000)


In [15]:
from hmmlearn.hmm import GaussianHMM

hmm_model = GaussianHMM(n_components=5, covariance_type="diag", n_iter=100, random_state=42)

hmm_model.fit(X_train_dense)  


In [21]:
from sklearn.metrics import accuracy_score
# Predict on training and test data
train_pred = hmm_model.predict(X_train_dense)
test_pred = hmm_model.predict(X_test_dense)

# Evaluate accuracy
train_accuracy = accuracy_score(train_df['category'], train_pred)
test_accuracy = accuracy_score(test_df['category'], test_pred)


print("\nHidden Markov Model:")
print(f"Training Accuracy: {train_accuracy : .2f}")
print(f"Test Accuracy: {test_accuracy: .2f}")


Hidden Markov Model:
Training Accuracy:  0.12
Test Accuracy:  0.12


In [19]:
# Map states to class names
predicted_classes = [class_name[state] for state in test_pred]

print("Predicted Classes:", predicted_classes)

Predicted Classes: ['OTHER', 'OTHER', 'ENTERTAINMENT', 'OTHER', 'BUSINESS', 'BUSINESS', 'ENTERTAINMENT', 'OTHER', 'BUSINESS', 'BUSINESS', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'BUSINESS', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'BUSINESS', 'FOOD & DRINK', 'OTHER', 'BUSINESS', 'POLITICS', 'FOOD & DRINK', 'POLITICS', 'OTHER', 'ENTERTAINMENT', 'OTHER', 'BUSINESS', 'POLITICS', 'OTHER', 'ENTERTAINMENT', 'POLITICS', 'OTHER', 'FOOD & DRINK', 'OTHER', 'OTHER', 'OTHER', 'FOOD & DRINK', 'ENTERTAINMENT', 'FOOD & DRINK', 'OTHER', 'OTHER', 'FOOD & DRINK', 'FOOD & DRINK', 'OTHER', 'FOOD & DRINK', 'OTHER', 'FOOD & DRINK', 'OTHER', 'ENTERTAINMENT', 'OTHER', 'BUSINESS', 'OTHER', 'FOOD & DRINK', 'BUSINESS', 'OTHER', 'OTHER', 'BUSINESS', 'OTHER', 'ENTERTAINMENT', 'ENTERTAINMENT', 'BUSINESS', 'ENTERTAINMENT', 'ENTERTAINMENT', 'FOOD & DRINK', 'ENTERTAINMENT', 'OTHER', 'OTHER', 'OTHER', 'ENTERTAINMENT', 'OTHER', 'OTHER', 'FOOD & DRINK', 'OTHER', 'OTHER', 'OTHER', 'FOOD