## `NER`

#### `Preprocessing`

In [1]:
import pandas as pd

In [2]:
train_df=pd.read_csv('train_data.csv')
test_df=pd.read_csv('test_data.csv')


In [3]:
### Check Nulls after split 
print('Num of Nulls after splitting for Train Data :', train_df.isna().sum())

print('Num of Nulls after splitting for Test Data :', test_df.isna().sum())


Num of Nulls after splitting for Train Data : headline    3
category    0
dtype: int64
Num of Nulls after splitting for Test Data : headline    0
category    0
dtype: int64


In [4]:
train_df.dropna(inplace=True)

In [5]:
train_df.isna().sum()

headline    0
category    0
dtype: int64

* `NER Features`

In [41]:
import spacy
nlp = spacy.load("en_core_web_sm")

def ner_features(headline):
    doc = nlp(headline)  
    ner_labels = ' '.join(ent.label_ for ent in doc.ents)
    return ner_labels

In [None]:
train_ner= train_df['headline'].apply(ner_features)

test_ner= test_df['headline'].apply(ner_features)


In [46]:

from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
X_train_ner_vectorizer = count_vectorizer.fit_transform(train_ner)
X_test_ner_vectorizer = count_vectorizer.transform(test_ner)

In [47]:
X_train_ner_vectorizer.shape

(66314, 18)

* `NER Words Only`

In [6]:
import spacy

nlp = spacy.load("en_core_web_sm")

def ner_words(headline):
    doc = nlp(headline)  
    ner_entities = [ent.text for ent in doc.ents]  # Extract the entity words
    return ' '.join(ner_entities)  


In [7]:
train_ner= train_df['headline'].apply(ner_words)

test_ner= test_df['headline'].apply(ner_words)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
X_train_ner_vectorizer2 = count_vectorizer.fit_transform(train_ner)
X_test_ner_vectorizer2 = count_vectorizer.transform(test_ner)

## `Implement Models`

In [11]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
import joblib

* `NER Features`

In [49]:

from sklearn.metrics import classification_report


def create_pipeline(model):
    return Pipeline([
        ('classifier', model)
    ])

# Models
models = {
    'SVM': SVC(kernel='linear',C=2),
    'Naive Bayes': MultinomialNB(alpha=0.5),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(50,25), activation='relu', solver='adam', max_iter=300, alpha=0.001)
}

output_dir = "NER_New_Models/"


for model_name, model in models.items():
    print(f"\nTraining {model_name} with NER features...")
    
    
    model_pipeline = create_pipeline(model)
    
    # Train the model
    model_pipeline.fit(X_train_ner_vectorizer, train_df['category'])

    # Make predictions on the train set
    y_pred_train = model_pipeline.predict(X_train_ner_vectorizer)
    
    # Make predictions on the test set
    y_pred_test = model_pipeline.predict(X_test_ner_vectorizer)

    # Train Accuracy 
    print(f"Train Accuracy for {model_name}:")
    print(accuracy_score(train_df['category'], y_pred_train))
    
    # Test Accuracy
    print(f"Test Accuracy for {model_name}:")
    print(accuracy_score(test_df['category'], y_pred_test))

    # Percision Score
    print(f"Percision for {model_name}:")
    print(precision_score(test_df['category'], y_pred_test,average='weighted'))
    
    # Recall Score 
    print(f"Recall for {model_name}:")
    print(recall_score(test_df['category'], y_pred_test,average='weighted'))

    # Classification Report 
    print(f" Classification Report for {model_name}:")
    print(classification_report(test_df['category'], y_pred_test))
    
    # Save Model
    model_file_path = f"{output_dir}{model_name}_ner_new_model.pkl"
    joblib.dump(model_pipeline, model_file_path)
    


Training SVM with NER features...
Train Accuracy for SVM:
0.21215731218143982
Test Accuracy for SVM:
0.21242460796139928
Percision for SVM:
0.16531025593891369
Recall for SVM:
0.21242460796139928
 Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1195
           1       0.24      0.43      0.30      2000
           2       0.00      0.00      0.00      1266
           3       0.13      0.04      0.06      2000
           4       0.27      0.21      0.24      2000
           5       0.00      0.00      0.00      1267
           6       0.21      0.04      0.07      1015
           7       0.24      0.01      0.02      1863
           8       0.19      0.41      0.26      1974
           9       0.20      0.65      0.31      2000

    accuracy                           0.21     16580
   macro avg       0.15      0.18      0.13     16580
weighted avg       0.17      0.21      0.15     16580


Training Na

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Train Accuracy for Neural Network:
0.2161836113037971
Test Accuracy for Neural Network:
0.21568154402895054
Percision for Neural Network:
0.21508140582562635
Recall for Neural Network:
0.21568154402895054
 Classification Report for Neural Network:
              precision    recall  f1-score   support

           0       0.27      0.00      0.00      1195
           1       0.24      0.43      0.31      2000
           2       0.00      0.00      0.00      1266
           3       0.18      0.01      0.01      2000
           4       0.26      0.23      0.24      2000
           5       0.41      0.01      0.01      1267
           6       0.21      0.04      0.07      1015
           7       0.18      0.03      0.06      1863
           8       0.20      0.42      0.27      1974
           9       0.20      0.65      0.31      2000

    accuracy                           0.22     16580
   macro avg       0.22      0.18      0.13     16580
weighted avg       0.22      0.22      0.15     

* `NER Words Only`

In [12]:

from sklearn.metrics import classification_report


def create_pipeline(model):
    return Pipeline([
        ('classifier', model)
    ])

# Models
models = {
    'SVM': SVC(kernel='linear',C=5),
    'Naive Bayes': MultinomialNB(alpha=0.5),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(50,25), activation='relu', solver='adam', max_iter=300, alpha=0.001)
}

output_dir = "NER_New_Models/"


for model_name, model in models.items():
    print(f"\nTraining {model_name} with NER features...")
    
    
    model_pipeline = create_pipeline(model)
    
    # Train the model
    model_pipeline.fit(X_train_ner_vectorizer2, train_df['category'])

    # Make predictions on the train set
    y_pred_train = model_pipeline.predict(X_train_ner_vectorizer2)
    
    # Make predictions on the test set
    y_pred_test = model_pipeline.predict(X_test_ner_vectorizer2)

    # Train Accuracy 
    print(f"Train Accuracy for {model_name}:")
    print(accuracy_score(train_df['category'], y_pred_train))
    
    # Test Accuracy
    print(f"Test Accuracy for {model_name}:")
    print(accuracy_score(test_df['category'], y_pred_test))

    # Percision Score
    print(f"Percision for {model_name}:")
    print(precision_score(test_df['category'], y_pred_test,average='weighted'))
    
    # Recall Score 
    print(f"Recall for {model_name}:")
    print(recall_score(test_df['category'], y_pred_test,average='weighted'))

    # Classification Report 
    print(f" Classification Report for {model_name}:")
    print(classification_report(test_df['category'], y_pred_test))
    
    # Save Model
    model_file_path = f"{output_dir}{model_name}_ner_new_model.pkl"
    joblib.dump(model_pipeline, model_file_path)
    


Training SVM with NER features...
Train Accuracy for SVM:
0.6473293723798896
Test Accuracy for SVM:
0.41079613992762365
Percision for SVM:
0.5122209709613632
Recall for SVM:
0.41079613992762365
 Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.45      0.23      0.30      1195
           1       0.56      0.43      0.49      2000
           2       0.59      0.28      0.38      1266
           3       0.24      0.15      0.19      2000
           4       0.64      0.56      0.60      2000
           5       0.56      0.28      0.38      1267
           6       0.64      0.40      0.49      1015
           7       0.69      0.44      0.54      1863
           8       0.63      0.35      0.45      1974
           9       0.22      0.81      0.35      2000

    accuracy                           0.41     16580
   macro avg       0.52      0.39      0.42     16580
weighted avg       0.51      0.41      0.42     16580


Training Naiv