In [2]:
import os
def read_text_and_title_files(text_folder_path, title_folder_path, label):
    texts = []
    titles = []
    labels = []
    
    text_files = os.listdir(text_folder_path)
    title_files = os.listdir(title_folder_path)
    
    for text_file, title_file in zip(text_files, title_files):
        with open(os.path.join(text_folder_path, text_file), 'r', encoding='utf-8', errors='ignore') as text, \
             open(os.path.join(title_folder_path, title_file), 'r', encoding='utf-8', errors='ignore') as title:
            text_content = text.read()
            title_content = title.read()
            
            texts.append(text_content)
            titles.append(title_content)
            labels.append(label)
    
    return texts, titles, labels

real_texts, real_titles, real_labels = read_text_and_title_files("/Users/abhashshrestha/Downloads/Public Data/Buzzfeed Political News Dataset/Real", "/Users/abhashshrestha/Downloads/Public Data/Buzzfeed Political News Dataset/Real_titles", label=0)
fake_texts, fake_titles, fake_labels = read_text_and_title_files("/Users/abhashshrestha/Downloads/Public Data/Buzzfeed Political News Dataset/Fake", "/Users/abhashshrestha/Downloads/Public Data/Buzzfeed Political News Dataset/Fake_titles", label=1)


In [3]:

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Preprocess all_texts (news content) and all_titles
preprocessed_real_texts = [preprocess_text(text) for text in real_texts]
preprocessed_fake_texts = [preprocess_text(text) for text in fake_texts]

preprocessed_real_titles = [preprocess_text(title) for title in real_titles]
preprocessed_fake_titles = [preprocess_text(title) for title in fake_titles]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abhashshrestha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abhashshrestha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


# Combine Real and Fake data
all_texts = real_texts + fake_texts
all_titles = real_titles + fake_titles
all_labels = real_labels + fake_labels

In [11]:
preprocessed_all_texts = [preprocess_text(text) for text in all_texts]
preprocessed_all_titles = [preprocess_text(title) for title in all_titles]
print(preprocessed_all_titles[:2])

['pantsuit power flashmob video hillary clinton two women dancers police', 'new child rape case filed donald trump ignored']


In [12]:
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
texts_vectorized = vectorizer.fit_transform(preprocessed_all_texts)
titles_vectorized = vectorizer.fit_transform(preprocessed_all_titles)
print(texts_vectorized.shape)
print(titles_vectorized.shape)


(101, 5000)
(101, 456)


In [13]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import scipy.sparse
combined_features = scipy.sparse.hstack((texts_vectorized, titles_vectorized))

print(combined_features.shape)


(101, 5456)


In [16]:
X_train, X_test, y_train, y_test = train_test_split(combined_features, all_labels, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression()
model_2 = RandomForestClassifier()
model.fit(X_train, y_train)
model_2.fit(X_train, y_train)

# Predict on the test set
predictions = model.predict(X_test)
predictions_2 = model_2.predict(X_test)


# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
accuracy_2 = accuracy_score(y_test, predictions_2)

print(f"Accuracy for Logistic Regression: {accuracy}")
print(f"Accuracy for Random Forest: {accuracy_2}")


Accuracy for Logistic Regression: 0.38095238095238093
Accuracy for Random Forest: 0.7142857142857143


In [19]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters grid
param_grid = {
    'C': [0.1, 1.0, 10.0],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters and accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_

print(f"Best Parameters for Logistic: {best_params}")
print(f"Best Accuracy for Logistic: {best_accuracy}")




Best Parameters: {'C': 10.0, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Accuracy: 0.8




In [22]:
param_grid_2 = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    # Add more hyperparameters as needed
}

grid_search_2 = GridSearchCV(RandomForestClassifier(), param_grid_2, cv=5, scoring='accuracy')

grid_search_2.fit(X_train, y_train)

# Get the best parameters and accuracy

best_params_2 = grid_search_2.best_params_
best_accuracy_2 = grid_search_2.best_score_

print(f"Best Parameters for RF: {best_params_2}")
print(f"Best Accuracy for RF: {best_accuracy_2}")

Best Parameters for RF: {'max_depth': None, 'n_estimators': 100}
Best Accuracy for RF: 0.7875
