## Load the Modules

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import gensim.downloader as api

## Splitting the data into 2 parts - train and test data

In [2]:
# Load the cleaned dataset
data = pd.read_csv('cleaned_news_data.csv')

# Combine title_clean and text_clean as the input for the model
data['combined_text'] = data['title_clean'] + ' ' + data['text_clean']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(data['combined_text'], data['label'], test_size=0.2, random_state=42)

### Create a CountVectorizer object, common english words (e.g., "a", "an", "the") will be removed from the text The vectorizer will use the top 10,000 most frequent words in the text to create the feature vectors.

### The 'vectorizer' object is then used to fit and transform the training data (X_train) and transform the test data (X_test) into BoW feature vectors.

In [3]:
# Bag of Words
vectorizer = CountVectorizer(stop_words='english', max_features=10000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)



### Create a TfidfVectorizer object with the same stop_words and max_features parameters as before. The difference between CountVectorizer and TfidfVectorizer is that the latter calculates the Term Frequency-Inverse Document Frequency (TF-IDF) of each word, which is a measure that reflects the importance of a word in the document and the entire corpus.

### The 'vectorizer_tfidf' object is then used to fit and transform the training data (X_train) and transform the test data (X_test) into TF-IDF feature vectors. These feature vectors are stored in the variables 'X_train_tfidf' and 'X_test_tfidf'.

In [4]:
# TfidfVectorizer
vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_features=10000)
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)



### loads the pre-trained GloVe model with 100-dimensional word vectors

### The get_average_glove_vector() function is defined, which takes a text string and the GloVe model as input arguments. Inside the function, the text is split into words. For each word, if it exists in the GloVe model, the corresponding word vector is extracted. If there are no valid word vectors in the text, the function returns a zero vector of the same size as the GloVe model's vector size. Otherwise, the function computes the average of all the word vectors in the text and returns this average vector as the final representation for the input text.

### The function get_average_glove_vector() is applied to each text in the training data (X_train) and test data (X_test) using list comprehensions. The resulting arrays of average GloVe vectors are then converted to NumPy arrays and stored in the variables 'X_train_glove' and 'X_test_glove'.

In [5]:
# Pre-trained word embeddings (GloVe)
glove_model = api.load('glove-wiki-gigaword-100')

def get_average_glove_vector(text, model):
    words = text.split()
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

X_train_glove = np.array([get_average_glove_vector(text, glove_model) for text in X_train])
X_test_glove = np.array([get_average_glove_vector(text, glove_model) for text in X_test])



### Two dictionaries: models and inputs. To store different machine learning models and different input feature types for an NLP classification task. These dictionaries will later be used for running multiple experiments, allowing for easy comparison of model performance using different input features.

In [6]:
# Define models and input types
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Multinomial Naive Bayes': MultinomialNB(),
    'SVM': SVC(random_state=42)
}

inputs = {
    'Bag of Words': (X_train_bow, X_test_bow),
    'TF-IDF': (X_train_tfidf, X_test_tfidf),
    'GloVe': (X_train_glove, X_test_glove)
}



### Iterate over different input types (Bag of Words, TF-IDF, and GloVe) and machine learning models (Logistic Regression, Random Forest, Multinomial Naive Bayes, and SVM) to train, optimize, and evaluate their performance. The objective is to compare the performance of different models using different input feature types for a given text classification task.

In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Store evaluation metrics in a DataFrame
performance_df = pd.DataFrame(columns=['Input Type', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

for input_name, (X_train_input, X_test_input) in inputs.items():
    for model_name, model in models.items():
        #GloVe generates continuous word embeddings, while MNB works with discrete features. 
        #In the case of text classification, MNB typically uses counts of words or tokens 
        #(e.g., Term Frequency or TF-IDF representation). 
        #Mixing these two representations – continuous word embeddings and discrete feature counts – 
        #can lead to poor performance or even incompatibility when using MNB.
        if model_name == 'Multinomial Naive Bayes' and input_name in ['GloVe']:
            continue

        model.fit(X_train_input, y_train)
        y_pred = model.predict(X_test_input)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        performance_df = performance_df.append({
            'Input Type': input_name,
            'Model': model_name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        }, ignore_index=True)

print(performance_df)

  performance_df = performance_df.append({
  performance_df = performance_df.append({
  performance_df = performance_df.append({
  performance_df = performance_df.append({
  performance_df = performance_df.append({
  performance_df = performance_df.append({
  performance_df = performance_df.append({
  performance_df = performance_df.append({
  performance_df = performance_df.append({
  performance_df = performance_df.append({


      Input Type                    Model  Accuracy  Precision    Recall  \
0   Bag of Words      Logistic Regression  0.994353   0.995044  0.993168   
1   Bag of Words            Random Forest  0.996612   0.996233  0.996702   
2   Bag of Words  Multinomial Naive Bayes  0.943867   0.941773  0.941107   
3   Bag of Words                      SVM  0.992772   0.993858  0.991048   
4         TF-IDF      Logistic Regression  0.985995   0.981538  0.989399   
5         TF-IDF            Random Forest  0.996612   0.996700  0.996231   
6         TF-IDF  Multinomial Naive Bayes  0.928507   0.929387  0.920848   
7         TF-IDF                      SVM  0.993336   0.991776  0.994346   
8          GloVe      Logistic Regression  0.934041   0.929191  0.933569   
9          GloVe            Random Forest  0.946917   0.951879  0.936631   
10         GloVe                      SVM  0.939349   0.935211  0.938516   

    F1 Score  
0   0.994105  
1   0.996467  
2   0.941440  
3   0.992451  
4   0.985453

  performance_df = performance_df.append({


#### We can analyze the performance of each model using different input types by comparing their accuracy, precision, recall, and F1 score. The best models are those with the highest scores across these metrics.

#### Here are the top models for each input type:

#### Bag of Words:
Random Forest with an accuracy of 0.996612, precision of 0.996233, recall of 0.996702, and F1 score of 0.996467.

#### TF-IDF:
Random Forest with an accuracy of 0.996612, precision of 0.996700, recall of 0.996231, and F1 score of 0.996466.

#### GloVe:
Random Forest with an accuracy of 0.946917, precision of 0.951879, recall of 0.936631, and F1 score of 0.944194.

#### Overall, the Random Forest model performs the best across all input types, achieving the highest scores in accuracy, precision, recall, and F1 score. In particular, the Random Forest model with Bag of Words and TF-IDF input types have very similar and impressive performance, with accuracy and F1 scores above 0.996.

#### Thus, we can consider using the Random Forest model with either the Bag of Words or TF-IDF input types.

## Optimisation, based on evaluation of the performance above.

### Reason for choice in parameter:
Logistic Regression:
'C': This parameter controls the inverse of the regularization strength. Smaller values result in stronger regularization, which can help avoid overfitting. We have chosen a range of values that cover both small (stronger regularization) and large (weaker regularization) values, since our model's performance is already quite high, we've included larger values like 100, to explore less regularized models.
'penalty': This parameter determines the type of regularization applied to the model (L1, L2). Different penalties can lead to different feature selection behavior in the model, which might affect the model's performance.
'solver': This parameter defines the optimization algorithm used for training the model. Different solvers can perform differently depending on the problem and data size.

Random Forest:
'n_estimators': This parameter controls the number of trees in the forest. Increasing the number of trees can lead to better performance but also requires more computational resources. Since our model is already performing well, we've focused on a range of higher values.
'max_depth': This parameter defines the maximum depth of each tree. Limiting the depth can help prevent overfitting. We have included a range of values from no limit (None) to moderately deep trees (40).
'min_samples_split': This parameter determines the minimum number of samples required to split an internal node. Higher values help prevent overfitting but can lead to underfitting if too high.
'min_samples_leaf': This parameter controls the minimum number of samples required to be at a leaf node. Increasing this value can help prevent overfitting by creating less complex trees.

Multinomial Naive Bayes:
'alpha': This parameter is a smoothing parameter (Laplace or Lidstone smoothing) applied to the model to handle unseen features in the test data. A range of values is provided to help find the best balance between overfitting and underfitting.

SVM:
'C': This parameter is the regularization parameter, similar to the one in logistic regression. It determines the balance between achieving a low training error and a low testing error (overfitting). We've chosen a range of values to explore different levels of regularization.
'kernel': This parameter defines the kernel function used by the SVM. Different kernel functions can lead to different decision boundaries and affect the model's performance.
'gamma': This parameter is the kernel coefficient for the 'rbf', 'linear' kernels. It controls the shape of the decision boundary. Including 'scale' and 'auto' in the search allows for different scaling strategies, which can impact the model's performance.

In [9]:
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grids for each model
param_grids = {
    'Logistic Regression': {
        'model': models['Logistic Regression'],
        'params': {
                    'C': [1, 10, 100],
                    'penalty': ['l1', 'l2'],
                    'solver': ['liblinear', 'saga']
                  }  # Define the appropriate parameter grid
    },
    'Random Forest': {
        'model': models['Random Forest'],
        'params': {
                    'n_estimators': [100, 200],
                    'max_depth': [None, 40],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2]
                  }  # Define the appropriate parameter grid
    },
    'Multinomial Naive Bayes': {
        'model': models['Multinomial Naive Bayes'],
        'params': {
                    'alpha': [0.1, 1, 5]
                  }  # Define the appropriate parameter grid
    },
    'SVM': {
        'model': models['SVM'],
        'params': {
                    'C': [1, 10, 100],
                    'kernel': ['linear', 'rbf'],
                    'gamma': ['scale', 'auto']
                  }  # Define the appropriate parameter grid
    }
}

optimized_models = {}

for input_name, (X_train_input, X_test_input) in inputs.items():
    optimized_models[input_name] = {}
    print(f'===== {input_name} =====')
    for name, model_grid in param_grids.items():
        if name == 'Multinomial Naive Bayes' and input_name == 'GloVe':
            # Skip Multinomial Naive Bayes for non-negative input types
            continue
        grid_search = RandomizedSearchCV(estimator=model_grid['model'], param_distributions=model_grid['params'], cv=3, scoring='accuracy', n_jobs=-1, n_iter=5)
        grid_search.fit(X_train_input, y_train)
        best_model = grid_search.best_estimator_
        y_pred_best_model = best_model.predict(X_test_input)

        optimized_models[input_name][name] = best_model

        accuracy_best_model = accuracy_score(y_test, y_pred_best_model)
        precision_best_model = precision_score(y_test, y_pred_best_model)
        recall_best_model = recall_score(y_test, y_pred_best_model)
        f1_best_model = f1_score(y_test, y_pred_best_model)

        print(f'Optimized {name}:')
        print(f'Accuracy: {accuracy_best_model:.2f}')
        print(f'Precision: {precision_best_model:.2f}')
        print(f'Recall: {recall_best_model:.2f}')
        print(f'F1 Score: {f1_best_model:.2f}')
        print(f'Best Parameters: {grid_search.best_params_}')
        print()

===== Bag of Words =====
Optimized Logistic Regression:
Accuracy: 0.99
Precision: 0.99
Recall: 0.99
F1 Score: 0.99
Best Parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 10}

Optimized Random Forest:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}





Optimized Multinomial Naive Bayes:
Accuracy: 0.94
Precision: 0.95
Recall: 0.94
F1 Score: 0.94
Best Parameters: {'alpha': 0.1}

Optimized SVM:
Accuracy: 0.99
Precision: 1.00
Recall: 0.99
F1 Score: 0.99
Best Parameters: {'kernel': 'rbf', 'gamma': 'scale', 'C': 10}

===== TF-IDF =====




Optimized Logistic Regression:
Accuracy: 0.99
Precision: 0.99
Recall: 0.99
F1 Score: 0.99
Best Parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 100}

Optimized Random Forest:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 40}





Optimized Multinomial Naive Bayes:
Accuracy: 0.93
Precision: 0.94
Recall: 0.92
F1 Score: 0.93
Best Parameters: {'alpha': 0.1}

Optimized SVM:
Accuracy: 0.99
Precision: 0.99
Recall: 0.99
F1 Score: 0.99
Best Parameters: {'kernel': 'linear', 'gamma': 'auto', 'C': 10}

===== GloVe =====
Optimized Logistic Regression:
Accuracy: 0.94
Precision: 0.93
Recall: 0.94
F1 Score: 0.93
Best Parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 10}

Optimized Random Forest:
Accuracy: 0.95
Precision: 0.95
Recall: 0.93
F1 Score: 0.94
Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}

Optimized SVM:
Accuracy: 0.94
Precision: 0.93
Recall: 0.94
F1 Score: 0.94
Best Parameters: {'kernel': 'rbf', 'gamma': 'auto', 'C': 100}



#### Note: To ignore the errors and warnings, RandomizedSearchCV can still handle these cases and still return the best results.

## The best model is the Random Forest with Bag of Words input:
## Save best model

In [16]:
import joblib
best_model = optimized_models['Bag of Words']['Random Forest']

#Save model to file
joblib.dump(best_model, 'best_model_rf_bow.pkl')

# Bag of Words
vectorizer = CountVectorizer(stop_words='english', max_features=10000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Save the vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

# load the modeland make predictions. Tacking the problem - detection of fake news.

In [1]:
import joblib
# Load the saved model
loaded_model = joblib.load('best_model_rf_bow.pkl')

# Load the preprocessor
vectorizer = joblib.load('vectorizer.pkl')

# Making sure the model works 

In [26]:
# Use the loaded model to make predictions
X_test_transformed = vectorizer.transform(X_test)
y_pred_loaded_model = loaded_model.predict(X_test_transformed)
# Calculate the performance metrics
accuracy_loaded_model = accuracy_score(y_test, y_pred_loaded_model)
precision_loaded_model = precision_score(y_test, y_pred_loaded_model)
recall_loaded_model = recall_score(y_test, y_pred_loaded_model)
f1_loaded_model = f1_score(y_test, y_pred_loaded_model)

print(f'Accuracy: {accuracy_loaded_model:.2f}')
print(f'Precision: {precision_loaded_model:.2f}')
print(f'Recall: {recall_loaded_model:.2f}')
print(f'F1 Score: {f1_loaded_model:.2f}')

Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00


# Predicting news found online, 

https://www.reuters.com/article/us-usa-fiscal-idUSKBN1EP0LK - copy paste sample from start to end. 

https://web.archive.org/web/20161115024211/http://wtoe5news.com/us-election/pope-francis-shocks-world-endorses-donald-trump-for-president-releases-statement/ - copy paste sample from start to end. 

In [2]:
def predict_fake_news(sample_text):
    # Preprocess the sample text
    preprocessed_sample = vectorizer.transform([sample_text])

    # Make a prediction
    prediction = loaded_model.predict(preprocessed_sample)
    print (prediction)
    if prediction == 1:
        return "Real"
    else:
        return "Fake"

# Prompt the user to enter the sample news text repeatedly
while True:
    sample_text = input("Enter your sample news text (type 0 to exit): ")

    if sample_text == "0":
        break

    result = predict_fake_news(sample_text)
    
    print(f"The given news is {result}.")

Enter your sample news text (type 0 to exit): WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018.  In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January.  When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress.  President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discret

### As U.S. budget fight looms, Republicans flip their fiscal script
True story. Thomson Reuters is dedicated to upholding the Trust Principles and to preserving its independence, integrity, and freedom from bias in the gathering and dissemination of information and news.

### In 2016, a story circulated that Pope Francis made an unprecedented and shocking endorsement of Donald Trump for president.
This story is completely false.
The original story can be traced back to a satire website, but it took off from there and became viral.
There were also other versions of this fake story claiming Pope Francis instead endorsed Hillary Clinton and Bernie Sanders for president.