In [1]:
import pickle
import pandas as pd
from sklearn import metrics
import numpy as np
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize
from textblob import Word
import gensim
import nltk

In [2]:
# Fetch the NLTK resources used further down: the English stopword list
# (read via stopwords.words), WordNet (presumably what Word.lemmatize relies
# on - confirm), and the Punkt data used by sent_tokenize.
# The last call's return value is what the cell displays.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ranee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ranee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ranee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the evaluation dataset; later cells read its 'text', 'title' and
# 'label' columns.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook cannot be re-run elsewhere without editing it - consider a
# configurable data directory.
Test_Data = pd.read_csv("D:\\Uni Related\\NLP\\Project\\news.csv")

In [4]:
# Restore the fitted label encoder and the two text vectorizers.
# The three objects are read back-to-back from one file handle, so the order
# of these loads must match the order in which they were dumped
# (presumably by the training notebook - verify there).
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# artifacts from a trusted source.
with open('selected features.pkl', 'rb') as f:
    encoder = pickle.load(f)
    tfidf_vect = pickle.load(f)
    count_vectorizer = pickle.load(f)

In [5]:
# Restore the fitted scaler; it is applied to the Word2Vec features before
# they are passed to the NaiveBayesWord2Vec classifier.
with open('scaling.pkl', 'rb')as f:
    scaler = pickle.load(f)

In [6]:
# Restore the nine classifiers: three model families for each of the three
# feature representations (TF-IDF, Word2Vec, CountVectorizer).
# They are read sequentially from a single file, so this order is a positional
# contract with whatever code wrote the file - reordering these lines would
# silently bind the wrong model to a name.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# artifacts from a trusted source.
with open('selected models.pkl', 'rb') as f:
    PassiveAggressive_TFIDF = pickle.load(f)
    LogisticRegression_TFIDF = pickle.load(f)
    NaiveBayesTFIDF = pickle.load(f)
    PassiveAggressiveWord2Vec = pickle.load(f)
    LogisticRegressionWord2Vec = pickle.load(f)
    NaiveBayesWord2Vec = pickle.load(f)
    PassiveAggressiveCountVectorizer = pickle.load(f)
    LogisticRegressionCountVectorizer = pickle.load(f)
    NaiveBayesCountVectorizer = pickle.load(f)

In [7]:
def tokenize_only(text):
    """Split ``text`` into sentences.

    Thin wrapper around NLTK's ``sent_tokenize``; returns whatever that call
    returns for the given string.
    """
    return sent_tokenize(text)

In [8]:
def train_model(classifier, feature_vector_train, Y_actual, model_name, vector_name):
    """Evaluate an already-fitted classifier and report its accuracy.

    Despite the name, no fitting happens here: the function only calls
    ``classifier.predict`` on the supplied features and compares the result
    with the true labels.

    Parameters
    ----------
    classifier : object exposing ``predict(features)``.
    feature_vector_train : features handed to ``classifier.predict``.
    Y_actual : true labels aligned with ``feature_vector_train``.
    model_name : str, label used in the printed report.
    vector_name : str, name of the feature representation used in the report.

    Returns
    -------
    The accuracy computed by ``metrics.accuracy_score``. Previously nothing
    was returned, so variables assigned from this call were always ``None``.
    """
    predictions = classifier.predict(feature_vector_train)
    # accuracy_score is documented as (y_true, y_pred); accuracy is symmetric,
    # so the value is unchanged, but the call now matches the documented order.
    test_accuracy = metrics.accuracy_score(Y_actual, predictions)
    print(f"Accuracy of {model_name} using {vector_name} : {test_accuracy}\n")
    return test_accuracy

In [9]:
def sentence_to_vector(sentence, model):
    """Average the embeddings of the items of ``sentence`` known to ``model``.

    ``sentence`` is iterated item by item; every item found in ``model.wv``
    contributes its vector. The element-wise mean of those vectors is
    returned, or a zero vector of length ``model.vector_size`` when no item
    is in the vocabulary (including an empty ``sentence``).
    """
    known_vectors = []
    for token in sentence:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    # Nothing in vocabulary: fall back to an all-zero embedding.
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [10]:
# Sentence-split every article body. This runs on the raw text, before the
# lower-casing / stopword removal applied in the next cell.
# NOTE(review): newsTextList is not referenced again in the visible cells.
newsTextList = Test_Data['text'].apply(tokenize_only)

# English stopword list used by the preprocessing cell's `not in stop` filter.
stop = stopwords.words('english')

In [11]:
# Normalise the article body and the headline with one shared routine instead
# of two copy-pasted three-pass pipelines. The resulting strings are the same
# as before: tokens are split on whitespace, lower-cased, filtered against the
# stopword list, lemmatized, and re-joined with single spaces.
_STOP_SET = set(stop)  # same membership answers as the list, without a linear scan per token


def _normalize_text(raw):
    """Lower-case, remove stopwords from, and lemmatize a whitespace-separated string."""
    lowered = (token.lower() for token in raw.split())
    kept = (token for token in lowered if token not in _STOP_SET)
    return " ".join(Word(token).lemmatize() for token in kept)


# NOTE: these assignments overwrite the raw columns, so re-running this cell
# without reloading Test_Data re-processes already-processed text.
Test_Data['text'] = Test_Data['text'].apply(_normalize_text)
Test_Data['title'] = Test_Data['title'].apply(_normalize_text)

# Model input is the cleaned article body; the target is the 'label' column.
X = Test_Data['text']
Y = Test_Data["label"]

In [12]:
# Build the label vector and the three feature representations using the
# encoder / vectorizers restored from the pickle files.
y_encoded = encoder.transform(Y)
x_after_tfidf = tfidf_vect.transform(X)
# NOTE(review): a brand-new Word2Vec model is fitted here on the evaluation
# text rather than loaded from an artifact, so these embeddings presumably do
# not share a vector space with the features the pickled Word2Vec classifiers
# were trained on - confirm against the training notebook.
# NOTE(review): X holds whitespace-joined strings, and iterating a str yields
# single characters, so both Word2Vec(X) and sentence_to_vector(sentence, ...)
# below work on characters, not words. gensim documents `sentences` as an
# iterable of token lists - verify whether that was intended.
# The recorded Word2Vec accuracies in this notebook (about 0.47-0.50) are
# consistent with these features carrying no usable signal.
# No seed is passed, so the fitted vectors may differ between runs.
Word2Vec_model = Word2Vec(X)
x_vector = [sentence_to_vector(sentence, Word2Vec_model) for sentence in X]
X_count_vectorizer = count_vectorizer.transform(X)

In [13]:
# Score the three TF-IDF classifiers on the TF-IDF features.
accuracy_PassiveAggressiveClassifier_TFIDF = train_model(
    classifier=PassiveAggressive_TFIDF,
    feature_vector_train=x_after_tfidf,
    Y_actual=y_encoded,
    model_name='PassiveAggressiveClassifier',
    vector_name='TF-IDF',
)
accuracy_LogisticRegression_TFIDF = train_model(
    classifier=LogisticRegression_TFIDF,
    feature_vector_train=x_after_tfidf,
    Y_actual=y_encoded,
    model_name='LogisticRegression',
    vector_name='TF-IDF',
)
accuracy_MultinomialNB_TFIDF = train_model(
    classifier=NaiveBayesTFIDF,
    feature_vector_train=x_after_tfidf,
    Y_actual=y_encoded,
    model_name='MultinomialNB',
    vector_name='TF-IDF',
)

# Score two of the Word2Vec classifiers on the unscaled averaged embeddings.
accuracy_PassiveAggressiveClassifier_Word2Vec = train_model(
    classifier=PassiveAggressiveWord2Vec,
    feature_vector_train=x_vector,
    Y_actual=y_encoded,
    model_name='PassiveAggressiveClassifier',
    vector_name='Word2Vec',
)
accuracy_LogisticRegression_Word2Vec = train_model(
    classifier=LogisticRegressionWord2Vec,
    feature_vector_train=x_vector,
    Y_actual=y_encoded,
    model_name='LogisticRegression',
    vector_name='Word2Vec',
)


Accuracy of PassiveAggressiveClassifier using TF-IDF : 0.9854775059194949

Accuracy of LogisticRegression using TF-IDF : 0.9480662983425414

Accuracy of MultinomialNB using TF-IDF : 0.935438042620363

Accuracy of PassiveAggressiveClassifier using Word2Vec : 0.4702446724546172

Accuracy of LogisticRegression using Word2Vec : 0.4994475138121547



In [14]:
# The Word2Vec naive-Bayes classifier is scored on the embeddings after they
# pass through the restored scaler.
x_woerdtovec_scaled = scaler.transform(x_vector)

accuracy_MultinomialNB_Word2Vec = train_model(
    classifier=NaiveBayesWord2Vec,
    feature_vector_train=x_woerdtovec_scaled,
    Y_actual=y_encoded,
    model_name='MultinomialNB',
    vector_name='Word2Vec',
)

# Score the three CountVectorizer classifiers on the bag-of-words counts.
accuracy_PassiveAggressiveClassifier_CountVectorizer = train_model(
    classifier=PassiveAggressiveCountVectorizer,
    feature_vector_train=X_count_vectorizer,
    Y_actual=y_encoded,
    model_name='PassiveAggressiveClassifier',
    vector_name='CountVectorizer',
)
accuracy_LogisticRegression_CountVectorizer = train_model(
    classifier=LogisticRegressionCountVectorizer,
    feature_vector_train=X_count_vectorizer,
    Y_actual=y_encoded,
    model_name='LogisticRegression',
    vector_name='CountVectorizer',
)
accuracy_MultinomialNB_CountVectorizer = train_model(
    classifier=NaiveBayesCountVectorizer,
    feature_vector_train=X_count_vectorizer,
    Y_actual=y_encoded,
    model_name='MultinomialNB',
    vector_name='CountVectorizer',
)


Accuracy of MultinomialNB using Word2Vec : 0.4994475138121547

Accuracy of PassiveAggressiveClassifier using CountVectorizer : 0.9747434885556433

Accuracy of LogisticRegression using CountVectorizer : 0.9797947908445146

Accuracy of MultinomialNB using CountVectorizer : 0.9412786108918706

