## `Part Of Speech`

##### `Preprocessing`

In [17]:
import pandas as pd


In [18]:
train_df=pd.read_csv('train_data.csv')
test_df=pd.read_csv('test_data.csv')


In [19]:
### Check Nulls after split 
print('Num of Nulls after splitting for Train Data :', train_df.isna().sum())

print('Num of Nulls after splitting for Test Data :', test_df.isna().sum())


Num of Nulls after splitting for Train Data : headline    3
category    0
dtype: int64
Num of Nulls after splitting for Test Data : headline    0
category    0
dtype: int64


In [20]:
train_df.dropna(inplace=True)

In [21]:
train_df['headline']

0                  Our Most Fearless Tweet Finalist PHOTOS
1                                             Hunger Hurts
2        Perfect Tweets About Bachelor In Paradise Seas...
3                            Nuh Linga Get Down to Jamaica
4        Airplane Boneyards Look Even Cooler In Instagr...
                               ...                        
66312    Osteochondral Ankle Surgery Is This What ShinS...
66313                         Americas Most Damaged Brands
66314    Grief and Loss Tips on How We Can Help Those A...
66315    Beyonce Taylor Swift And Other Celebrities Sen...
66316       Hillary President The Elephant in the RoomBill
Name: headline, Length: 66314, dtype: object

### `POS`

* `Spacy`

In [22]:
import spacy

nlp = spacy.load("en_core_web_sm")

def clean_and_extract_pos(text):
    
    text = text.lower()  
    doc = nlp(text)
    # Extract POS tags
    pos_tags = [token.pos_ for token in doc]
    
    return ' '.join(pos_tags)


In [23]:
train_df['headline']=train_df['headline'].apply(clean_and_extract_pos)
test_df['headline']=test_df['headline'].apply(clean_and_extract_pos)


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(analyzer=lambda x: x,max_features=1000)  # We pass the tokenized list directly

X_train_pos = bow_vectorizer.fit_transform(train_df['headline'])
X_test_pos = bow_vectorizer.transform(test_df['headline'])

## `Implement Models` 

In [25]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score , classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
import joblib

* `Spacy`

In [11]:

def create_pipeline(model):
    return Pipeline([
        ('classifier', model)
    ])

# Models with updated parameters
models = {
    'SVM': SVC(kernel='linear', C=10, class_weight='balanced'),  
    'Naive Bayes': MultinomialNB(alpha=0.01, fit_prior=True),    
    'Neural Network': MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', alpha=0.0001, max_iter=1000)  
}

output_dir = "POS_Spacy_Models/"


for model_name, model in models.items():
    print(f"\nTraining {model_name} with POS Spacy features...")
    
    
    model_pipeline = create_pipeline(model)
    
    # Train the model
    model_pipeline.fit(X_train_pos, train_df['category'])

    # Make predictions on the train set
    y_pred_train = model_pipeline.predict(X_train_pos)
    
    # Make predictions on the test set
    y_pred_test = model_pipeline.predict(X_test_pos)

    # Train Accuracy 
    print(f"Train Accuracy for {model_name}:")
    print(accuracy_score(train_df['category'], y_pred_train))
    
    # Test Accuracy
    print(f"Test Accuracy for {model_name}:")
    print(accuracy_score(test_df['category'], y_pred_test))

    # Precision Score
    print(f"Precision for {model_name}:")
    print(precision_score(test_df['category'], y_pred_test, average='weighted'))
    
    # Recall Score 
    print(f"Recall for {model_name}:")
    print(recall_score(test_df['category'], y_pred_test, average='weighted'))

    # Classification Report 
    print(f"Classification Report for {model_name}:")
    print(classification_report(test_df['category'], y_pred_test))
    
    # Save Model
    model_file_path = f"{output_dir}{model_name}_pos_spacy_model.pkl"
    joblib.dump(model_pipeline, model_file_path)



Training SVM with POS Spacy features...
Train Accuracy for SVM:
0.22284887052507765
Test Accuracy for SVM:
0.22165259348612787
Precision for SVM:
0.22849996454029273
Recall for SVM:
0.22165259348612787
Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.12      0.06      0.08      1195
           1       0.28      0.16      0.20      2000
           2       0.17      0.38      0.23      1266
           3       0.19      0.12      0.14      2000
           4       0.28      0.15      0.20      2000
           5       0.17      0.15      0.16      1267
           6       0.12      0.20      0.15      1015
           7       0.29      0.32      0.30      1863
           8       0.26      0.21      0.23      1974
           9       0.27      0.42      0.33      2000

    accuracy                           0.22     16580
   macro avg       0.21      0.22      0.20     16580
weighted avg       0.23      0.22      0.21     16580


Traini

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Train Accuracy for Neural Network:
0.3227071206683355
Test Accuracy for Neural Network:
0.2425211097708082
Precision for Neural Network:
0.2281703730705509
Recall for Neural Network:
0.2425211097708082
Classification Report for Neural Network:
              precision    recall  f1-score   support

           0       0.17      0.05      0.08      1195
           1       0.23      0.36      0.28      2000
           2       0.25      0.16      0.19      1266
           3       0.16      0.15      0.15      2000
           4       0.22      0.28      0.25      2000
           5       0.18      0.09      0.12      1267
           6       0.12      0.02      0.03      1015
           7       0.30      0.32      0.31      1863
           8       0.25      0.36      0.30      1974
           9       0.31      0.38      0.34      2000

    accuracy                           0.24     16580
   macro avg       0.22      0.22      0.20     16580
weighted avg       0.23      0.24      0.22     1658