In [6]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from tqdm import tqdm

# Reads the csv-file into a pandas dataframe
test = pd.read_csv('news_cleaned_2018_02_13.csv', chunksize=100000)

# Initialize nltk's PorterStemmer
stemmer = PorterStemmer()

# Cleans the data
def clean_content(inp):
    for i in tqdm(range(len(inp))):
        # Converting all content to lower case letters.
        inp = inp.applymap(lambda x:x.lower() if type(x) == str else x)

        # Uses regular expressions to remove and or substitute unwanted substrings with dummy substrings.
        # Removes all newlines and tabs
        inp.at[i,'content'] = re.sub(r"[\n\t]*", "", inp.at[i,'content'])
        # Removes all whitespace that is instantly after a whitespace
        inp.at[i,'content'] = re.sub(r"[\s]{2,}", "", inp.at[i,'content'])
        # Sub of dates
        inp.at[i,'content'] = re.sub(r"(([a-zA-Z]*)(\s+)(\d{2,})(,{1})(\s+)(\d{2,4}))","uniquedate", inp.at[i,'content'], flags=re.MULTILINE)
        # Sub of emails
        inp.at[i,'content'] = re.sub(r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)", "uniqueemail", inp.at[i,'content'], flags=re.MULTILINE)
        # Sub of url's
        inp.at[i,'content'] = re.sub(r"(?:https?:\/\/)?(?:www\.)?([^@\s]+\.[a-zA-Z]{2,4})[^\s]*","uniqueurl", inp.at[i,'content'], flags=re.MULTILINE)
        #Removal of numbers - (\s)\$?(?:[\d,.-])+
        inp.at[i,'content'] = re.sub(r"\b(\d+)\b","uniquenum", inp.at[i,'content'], flags=re.MULTILINE)
    return inp


data = next(test)

# Clean the data
data = clean_content(data)

# Save the cleaned data to a new CSV file
data.to_csv('news_cleaned.csv', index=False)



  0%|          | 4/100000 [00:12<87:50:28,  3.16s/it]


KeyboardInterrupt: 

In [7]:
import pandas as pd
import nltk 

# read in the data and sample 10% of the rows
data = pd.read_csv('news_cleaned.csv').sample(frac=0.1)

# Tokenize the text
data['tokens'] = data['content'].apply(nltk.word_tokenize)

# Remove stopwords and compute the size of the vocabulary
stop_words = set(stopwords.words('english'))
data['tokens'] = data['tokens'].apply(lambda x: [word for word in x if word.lower() not in stop_words])
vocab_size = len(set([word for row in data['tokens'] for word in row]))
reduction_rate_stopwords = (1 - (vocab_size / len(set([word for row in data['content'] for word in row.split()])))) * 100

# Remove word variations with stemming and compute the size of the vocabulary
ps = PorterStemmer()
data['tokens'] = data['tokens'].apply(lambda x: [ps.stem(word) for word in x])
vocab_size_stem = len(set([word for row in data['tokens'] for word in row]))
reduction_rate_stemming = (1 - (vocab_size_stem / vocab_size)) * 100

print(f"Vocabulary size before removing stopwords: {len(set([word for row in data['content'] for word in row.split()]))}")
print(f"Vocabulary size after removing stopwords: {vocab_size}")
print(f"Reduction rate after removing stopwords: {reduction_rate_stopwords:.2f}%")
print(f"Vocabulary size after stemming: {vocab_size_stem}")
print(f"Reduction rate after stemming: {reduction_rate_stemming:.2f}%")



Vocabulary size before removing stopwords: 862462
Vocabulary size after removing stopwords: 862165
Reduction rate after removing stopwords: 0.03%
Vocabulary size after stemming: 794997
Reduction rate after stemming: 7.79%


In [8]:
import numpy as np

# Define the split ratios
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# Split the data into training, validation, and test sets
num_rows = len(data)
indices = np.arange(num_rows)
np.random.shuffle(indices)
train_idx = indices[:int(train_ratio*num_rows)]
val_idx = indices[int(train_ratio*num_rows):int((train_ratio+val_ratio)*num_rows)]
test_idx = indices[int((train_ratio+val_ratio)*num_rows):]

train_data = data.iloc[train_idx]
val_data = data.iloc[val_idx]
test_data = data.iloc[test_idx]


In [20]:
import random

def random_baseline(data):
    data['predicted'] = [random.choice(['reliable', 'fake']) for _ in range(len(data))]
    return data

def calculate_accuracy(data, true_labels):
    correct_predictions = data[data['predicted'] == true_labels]
    accuracy = len(correct_predictions) / len(data)
    return accuracy

# Example usage:
random_data = random_baseline(test_data)
accuracy = calculate_accuracy(random_data, test_data['type'])
print("Random Baseline Accuracy:", accuracy)


Random Baseline Accuracy: 0.243


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['predicted'] = [random.choice(['reliable', 'fake']) for _ in range(len(data))]


In [25]:
from sklearn.metrics import classification_report

# Generate random predictions
random_data = random_baseline(test_data)

y_true = test_data['type'].values
y_pred = random_data['predicted'].astype(str).values
report = classification_report(y_true, y_pred)
print("Classification report for random baseline:\n", report)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['predicted'] = [random.choice(['reliable', 'fake']) for _ in range(len(data))]


TypeError: '<' not supported between instances of 'float' and 'str'

In [39]:
import pandas as pd
import random

# Load data
data = pd.read_csv("news_cleaned.csv")

# Define train, validation, and test sets
train_size = int(len(data) * 0.6)
val_size = int(len(data) * 0.2)
test_size = len(data) - train_size - val_size

train_data = data[:train_size]
val_data = data[train_size:train_size + val_size]
test_data = data[train_size + val_size:]

# Define Most Common Label model
def MCL(train_data, val_data):
    # Get most common label from training set
    mcl = train_data['type'].value_counts().idxmax()

    # Make predictions for training and validation sets
    train_data['predicted'] = mcl
    val_data['predicted'] = mcl

    # Calculate accuracy
    train_accuracy = (train_data['type'] == train_data['predicted']).mean()
    val_accuracy = (val_data['type'] == val_data['predicted']).mean()

    return train_accuracy, val_accuracy

# Run MCL model on data
train_accuracy, val_accuracy = MCL(train_data, val_data)

# Print results
print("Most Common Label model:")
print(f"Training accuracy: {train_accuracy:.3f}")
print(f"Validation accuracy: {val_accuracy:.3f}")



Most Common Label model:
Training accuracy: 0.492
Validation accuracy: 0.551


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['predicted'] = mcl
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['predicted'] = mcl


In [57]:
import random
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

def RandomForest(training_data, validation_data):
    # vectorize the training data
    vectorizer = CountVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
    X_train_vect = vectorizer.fit_transform(training_data['content'].values.astype('U'))
    y_train = training_data['content'].values
    
    # vectorize the validation data
    X_val_vect = vectorizer.transform(validation_data['content'].values.astype('U'))
    y_val = validation_data['content'].values

    # initialize and fit the random forest classifier
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train_vect.toarray(), y_train)
    
    # make predictions and calculate accuracy
    y_pred = rf.predict(X_val_vect.toarray())
    accuracy = (y_pred == y_val).mean()

    return {'model': rf, 'vectorizer': vectorizer, 'accuracy': accuracy}



# split the data into non-null rows
training_data = train_data.dropna(subset=['content'])
validation_data = val_data.dropna(subset=['content'])

# check that we have at least one non-null row in each set
if training_data.shape[0] == 0:
    raise ValueError("Training data is empty!")
if validation_data.shape[0] == 0:
    print("Warning: Validation data is empty. Setting predictions to empty list.")
    data = []
else:
    # run the model
    data = RandomForest(training_data, validation_data)
    
print(data)



MemoryError: 