1. Import Libraries

* Import necessary libraries for data manipulation, preprocessing, and modeling.

In [30]:
import pandas as pd
import numpy as np
import string
import re
import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fetch the NLTK corpora needed for stopword removal and WordNet lemmatization.
# Re-running is harmless: packages already present are reported as up-to-date.
for _nltk_package in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_package)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/guyparsadanov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/guyparsadanov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/guyparsadanov/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


2. Load the Dataset

* Load the dataset into a DataFrame. Ensure the data has two columns: label (0 for fake, 1 for true) and text (news content).

In [31]:
# Where the training data lives and what to call its two (header-less) columns
file_path = '/Users/guyparsadanov/Downloads/Iron-Hack-Work/W4/project-3-nlp/training_data_lowercase copy.csv'  # Replace with your file path
column_names = ['label', 'text']

# Read the tab-separated file and force the label column to integers in one chain
df = (
    pd.read_csv(file_path, sep='\t', header=None, names=column_names)
    .astype({'label': int})
)

# Preview the first rows as a sanity check
print(df.head())

   label                                               text
0      0  donald trump sends out embarrassing new year‚s...
1      0  drunk bragging trump staffer started russian c...
2      0  sheriff david clarke becomes an internet joke ...
3      0  trump is so obsessed he even has obama‚s name ...
4      0  pope francis just called out donald trump duri...


3. Text Preprocessing

* This step cleans the text by removing punctuation, numbers, and stopwords. It also applies lemmatization to reduce words to their base form.

In [33]:
# English stopwords held in a set so the per-token membership test is a hash lookup
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance reused by every preprocess_text call
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    """Clean one document for vectorization.

    Steps, in order:
      1. Replace every ASCII punctuation character with a space.
      2. Delete runs of digits (nothing is inserted in their place).
      3. Split on whitespace and drop tokens found in the module-level
         ``stop_words`` set (an exact, case-sensitive membership test).
      4. Lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces; an empty
        string if no token survives.
    """
    # re.escape is required here: interpolated raw, string.punctuation contains
    # the sequence "\]" which a character class reads as an escaped "]", so a
    # literal backslash was never part of the class and was left in the text.
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    # Digits are removed outright, so e.g. "covid19" becomes "covid".
    text = re.sub(r'\d+', '', text)

    # split() with no argument also collapses repeated whitespace left behind
    # by the substitutions above.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    return ' '.join(tokens)

# Clean every document and store the result alongside the raw text
df['clean_text'] = df['text'].map(preprocess_text)

# Show raw vs. cleaned text side by side for the first rows
preview_columns = ['text', 'clean_text']
print(df[preview_columns].head())

                                                text  \
0  donald trump sends out embarrassing new year‚s...   
1  drunk bragging trump staffer started russian c...   
2  sheriff david clarke becomes an internet joke ...   
3  trump is so obsessed he even has obama‚s name ...   
4  pope francis just called out donald trump duri...   

                                          clean_text  
0  donald trump sends embarrassing new year‚s eve...  
1  drunk bragging trump staffer started russian c...  
2  sheriff david clarke becomes internet joke thr...  
3  trump obsessed even obama‚s name coded website...  
4  pope francis called donald trump christmas speech  


4. Split the Data into Training and Testing Sets

* Divide the data into training and testing sets (80% train, 20% test).

In [34]:
# Features are the cleaned documents; targets are the 0/1 labels
X, y = df['clean_text'], df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

5. Vectorize Text Using BoW and TF-IDF

* Transform the text data into numerical representations.

In [35]:
# Bag of Words: vocabulary is learned from the training split only,
# then the same vocabulary is applied to the test split
bow_vectorizer = CountVectorizer()
X_train_bow, X_test_bow = (
    bow_vectorizer.fit_transform(X_train),
    bow_vectorizer.transform(X_test),
)

# TF-IDF: same fit-on-train / transform-on-test pattern
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

6. Train and Evaluate Classifiers

* Train Logistic Regression and Multinomial Naive Bayes models and evaluate their performance.

In [36]:
# Function to train and evaluate a model
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training split and print its test-set metrics.

    The model is fitted in place. Accuracy, the classification report and the
    confusion matrix for the test split are printed under a heading built from
    ``model_name``, followed by a 60-dash separator. Nothing is returned.
    """
    # Fit on the training data, then score on the held-out data
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(f"Results for {model_name}:\n")
    print(f"Accuracy: {accuracy_score(y_test, predictions):.4f}")

    # Remaining sections share one shape: a heading line, then the metric output
    sections = (
        ("\nClassification Report:", classification_report),
        ("\nConfusion Matrix:", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predictions))

    print("-" * 60)

# One fresh estimator per (algorithm, feature representation) pair
log_reg_bow = LogisticRegression()
log_reg_tfidf = LogisticRegression()
nb_bow = MultinomialNB()
nb_tfidf = MultinomialNB()

# (model, train features, test features, display name), evaluated in this order
experiments = [
    (log_reg_bow, X_train_bow, X_test_bow, "Logistic Regression (BoW)"),
    (log_reg_tfidf, X_train_tfidf, X_test_tfidf, "Logistic Regression (TF-IDF)"),
    (nb_bow, X_train_bow, X_test_bow, "Multinomial Naive Bayes (BoW)"),
    (nb_tfidf, X_train_tfidf, X_test_tfidf, "Multinomial Naive Bayes (TF-IDF)"),
]

for estimator, train_features, test_features, display_name in experiments:
    train_and_evaluate(estimator, train_features, test_features, y_train, y_test, display_name)

Results for Logistic Regression (BoW):

Accuracy: 0.9387

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      3529
           1       0.93      0.95      0.94      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831


Confusion Matrix:
[[3289  240]
 [ 179 3123]]
------------------------------------------------------------
Results for Logistic Regression (TF-IDF):

Accuracy: 0.9353

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      3529
           1       0.93      0.94      0.93      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831


Confusion Matrix:
[[3282  247]
 [ 195 3107]]
----------------------------