## `POS + NER`

In [1]:
import pandas as pd

In [2]:
train_df=pd.read_csv('train_data.csv')
test_df=pd.read_csv('test_data.csv')


In [3]:
### Check Nulls after split 
print('Num of Nulls after splitting for Train Data :', train_df.isna().sum())

print('Num of Nulls after splitting for Test Data :', test_df.isna().sum())


Num of Nulls after splitting for Train Data : headline    3
category    0
dtype: int64
Num of Nulls after splitting for Test Data : headline    0
category    0
dtype: int64


In [4]:
train_df.dropna(inplace=True)

In [5]:
train_df.isna().sum()

headline    0
category    0
dtype: int64

* `Combine POS and NER`

In [15]:
import spacy

nlp = spacy.load("en_core_web_sm")

def pos_and_ner(headline):
    doc = nlp(headline)  
    
    pos_tags = [token.pos_ for token in doc]  # Extract POS tag

    ner_labels = [ent.label_ for ent in doc.ents]  # Extract entity label
    
    return ' '.join(pos_tags + ner_labels)  # Combine POS and NER labels

In [16]:
train_ner_pos= train_df['headline'].apply(pos_and_ner)

test_ner_pos= test_df['headline'].apply(pos_and_ner)


In [18]:
train_ner_pos.value_counts()

headline
PROPN PROPN PROPN                                                                  131
PROPN PROPN PROPN PROPN                                                            119
PROPN PROPN PROPN PROPN ORG                                                        102
PROPN PROPN                                                                         94
PROPN PROPN PROPN PROPN PROPN ORG                                                   91
                                                                                  ... 
PRON PRON AUX DET PROPN PROPN PROPN                                                  1
INTJ PROPN PROPN PROPN PROPN CCONJ PROPN PROPN ADV PROPN PROPN NOUN                  1
PROPN PROPN PROPN AUX PRON PRON PROPN PROPN PROPN PROPN PROPN PROPN AUX ADV ADV      1
PROPN CCONJ PROPN PROPN ADP SCONJ PRON AUX VERB DET VERB PERSON                      1
PROPN PROPN VERB NOUN ADP PROPN NOUN NORP                                            1
Name: count, Length: 57279, dtype:

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
X_train_ner_vectorizer = count_vectorizer.fit_transform(train_ner_pos)
X_test_ner_vectorizer = count_vectorizer.transform(test_ner_pos)

## `Implement Models`

In [21]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
import joblib

In [22]:

from sklearn.metrics import classification_report


def create_pipeline(model):
    return Pipeline([
        ('classifier', model)
    ])

# Models
models = {
    'SVM': SVC(kernel='linear',C=10),
    'Naive Bayes': MultinomialNB(alpha=0.5),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(50,25), activation='relu', solver='adam', max_iter=500, alpha=0.001,early_stopping=True)
}

output_dir = "NER_POS_Models/"

for model_name, model in models.items():
    print(f"\nTraining {model_name} with NER_POS features...")
    
    
    model_pipeline = create_pipeline(model)
    
    # Train the model
    model_pipeline.fit(X_train_ner_vectorizer, train_df['category'])

    # Make predictions on the train set
    y_pred_train = model_pipeline.predict(X_train_ner_vectorizer)
    
    # Make predictions on the test set
    y_pred_test = model_pipeline.predict(X_test_ner_vectorizer)

    # Train Accuracy 
    print(f"Train Accuracy for {model_name}:")
    print(accuracy_score(train_df['category'], y_pred_train))
    
    # Test Accuracy
    print(f"Test Accuracy for {model_name}:")
    print(accuracy_score(test_df['category'], y_pred_test))

    # Percision Score
    print(f"Percision for {model_name}:")
    print(precision_score(test_df['category'], y_pred_test,average='weighted'))
    
    # Recall Score 
    print(f"Recall for {model_name}:")
    print(recall_score(test_df['category'], y_pred_test,average='weighted'))

    # Classification Report 
    print(f" Classification Report for {model_name}:")
    print(classification_report(test_df['category'], y_pred_test))
    
    # Save Model
    model_file_path = f"{output_dir}{model_name}_ner_pos_model.pkl"
    joblib.dump(model_pipeline, model_file_path)
    


Training SVM with NER_POS features...
Train Accuracy for SVM:
0.2587688874144223
Test Accuracy for SVM:
0.25820265379975876
Percision for SVM:
0.24039569839081576
Recall for SVM:
0.25820265379975876
 Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.31      0.00      0.01      1195
           1       0.29      0.38      0.33      2000
           2       0.23      0.08      0.12      1266
           3       0.16      0.16      0.16      2000
           4       0.29      0.32      0.30      2000
           5       0.00      0.00      0.00      1267
           6       0.28      0.04      0.07      1015
           7       0.29      0.33      0.31      1863
           8       0.27      0.36      0.31      1974
           9       0.25      0.55      0.34      2000

    accuracy                           0.26     16580
   macro avg       0.24      0.22      0.19     16580
weighted avg       0.24      0.26      0.22     16580


Training