In [1]:
import pandas as pd
import numpy as np

from cleantext import clean
import re
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import matplotlib.pyplot as plt 
import seaborn as sns

import tensorflow as tf
from keras.preprocessing.text import text_to_word_sequence

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import ComplementNB, MultinomialNB



from wordcloud import WordCloud

import joblib 




2023-08-18 19:27:43.760406: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
dfs = pd.read_csv("news_cleaned_2018_02_13.csv", chunksize=3000,usecols=["type", "content"],skiprows=lambda x: x in range(1, 100000),lineterminator='\n')

In [3]:
def clean_file(df:pd.DataFrame) -> (pd.DataFrame):
    """
        Takes a TextFileReader with dataframes, and cleans the content. 
        
        Returns the new TextFileReader with cleaned content, and a concatenated string of all the contents. 
    """
    
    i = 0
    for text in df["content"]:
        if type(text) == str:
            date_reg = re.compile("\d{0,4}-\d{0,2}-\d{0,2}[^,]+") # Cleaning the Dates of the text
            date_subbed = date_reg.sub("<DAT>", text)
            date_reg2 = re.compile("/^(?:\d{4})-(?:\d{2})-(?:\d{2})T(?:\d{2}):(?:\d{2}):(?:\d{2}(?:\.\d*)?)(?:(?:-(?:\d{2}):(?:\d{2})|Z)?)$/")
            date_subbed2 = date_reg2.sub("<DAT>", date_subbed)
            cleaned_news_file = clean(date_subbed2,no_line_breaks=True, # Cleaning the rest of the text.
                    no_urls=True,                  
                    no_emails=True,                   
                    no_numbers=True,
                    no_punct=False,
                    replace_with_number = "<NUM>")
            df.loc[i,"content"] = cleaned_news_file # returning the cleaned text to the right position in our dataframe
               
        else:
            pass 
        i += 1
    return df

In [4]:
def tokenize(text):
    """
    Takes a string of text and tokenizes it. 
    
    Returns a list of the tokenized text. 
    
    """
    token_list = text_to_word_sequence(text)
    words = [word for word in token_list if word.isalpha()]
    return words 
    


In [5]:
def removing_stopwords(token_list:list) -> list:
    """
    Takes a list of words and removes the stopwords 
    
    Returns a list of words
    """
    stop_words = set(stopwords.words('english'))
    words = [w for w in token_list if not w in stop_words]
    return words

In [6]:
def stemming(token_list:list) -> list:
    """
    Takes a list of words and stems the words
    
    Returns a list of stemmed words. 
    """
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in token_list]
    return stemmed 

In [7]:
a = ['rumor', 'hate', 'unreliable', 'conspiracy', 'clickbait', 'satire',
       'fake', 'reliable', 'bias', 'political', 'junksci']

In [8]:
def drop_rows(df):
    df = df[df["type"].isin(a)]
    df = df.drop_duplicates("content")
    
    return df

In [9]:
def insert_labels(df):
    df_labels = df["type"].isin(["political", "reliable" , "clickbait"])
    df.insert(loc = len(df.columns) , column = "label" ,value = df_labels)

In [10]:
def final_clean(df):
    df["content"] = df["content"].apply(tokenize)
    df["content"] = df["content"].apply(removing_stopwords)
    df["content"] = df["content"].apply(stemming)

    df["content"] = df["content"].apply(" ".join)

### Since we are using chunks for learning we have to use the hashing vectorizer. If we didn't our vocabulary would increase pr chunk, and our therefor our feature space would increase, which means our training wouldn't work. 

In [11]:
vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18, ngram_range=(1,3), alternate_sign=False,)

In [12]:
SGD_cls = SGDClassifier(loss="hinge",random_state = 0)
MNnb = MultinomialNB(fit_prior=False)
Cnb = ComplementNB(fit_prior=False)
MNnb_fp = MultinomialNB()
Cnb_fp = ComplementNB()

In [13]:
df2 = pd.read_csv("news_cleaned_2018_02_13.csv", nrows=5000, skiprows=lambda x: x in range(1, 10000),usecols=["type", "content"])

In [14]:
df2 = drop_rows(df2)
df2 = clean_file(df2)
insert_labels(df2)
final_clean(df2)

classes = df2["label"].unique()

In [15]:
X_test = vectorizer.fit_transform(df2["content"])
y_test = df2["label"]

In [16]:
total_rows_analyzed = 0

### Main loop 

In [None]:
for i, df in enumerate(dfs):
    if i > 720:
        break 
        
    df = clean_file(df)
    df = drop_rows(df)
    insert_labels(df)
    final_clean(df)
    
    
    if len(df["content"]) > 100:
        X_train = vectorizer.fit_transform(df["content"])
        SGD_cls.partial_fit(X_train, df["label"], classes=classes)
        
        MNnb.partial_fit(X_train, df["label"], classes=classes)
        
        Cnb.partial_fit(X_train, df["label"], classes=classes)
        
        MNnb_fp.partial_fit(X_train, df["label"], classes=classes)
        
        Cnb_fp.partial_fit(X_train, df["label"], classes=classes)
        
        total_rows_analyzed += len(df["content"])
    if i%10 == 0:     
        sgd_int = SGD_cls.predict(X_test)
        
        print(f"iteration:{i} SGD Accuracy: {accuracy_score(y_test, sgd_int)} f1_score: {f1_score(y_test, sgd_int)}")
        
        proba_MNnb = MNnb.predict_proba(X_test)
        MNnb_int = proba_MNnb[:,1] > 0.5
        print(f"iteration:{i} MMnb Accuracy: {accuracy_score(y_test, MNnb_int)} f1_score: {f1_score(y_test, MNnb_int)}")
        
        proba_Cnb = Cnb.predict_proba(X_test)
        Cnb_int = proba_Cnb[:,1] > 0.5
        print(f"iteration:{i} Cnb Accuracy: {accuracy_score(y_test, Cnb_int)} f1_score: {f1_score(y_test, Cnb_int)}")
        
        proba_Cnb_fp = Cnb_fp.predict_proba(X_test)
        Cnb_fp_int = proba_Cnb_fp[:,1] > 0.5
        print(f"iteration:{i} Cnb_fp Accuracy: {accuracy_score(y_test, Cnb_fp_int)} f1_score: {f1_score(y_test, Cnb_fp_int)}")
        
        proba_MNnb_fp = MNnb_fp.predict_proba(X_test)
        MNnb_fp_int = proba_MNnb_fp[:,1] > 0.5
        print(f"iteration:{i} MNnb_fp Accuracy: {accuracy_score(y_test, MNnb_fp_int)} f1_score: {f1_score(y_test, MNnb_fp_int)}")
    else:
        pass 
    
    
    
    
    
    
    
    
    
    

iteration:0 SGD Accuracy: 0.3104707792207792 f1_score: 0.45246535610699323
iteration:0 MMnb Accuracy: 0.296875 f1_score: 0.4578313253012048
iteration:0 Cnb Accuracy: 0.296875 f1_score: 0.4578313253012048
iteration:0 Cnb_fp Accuracy: 0.296875 f1_score: 0.4578313253012048
iteration:0 MNnb_fp Accuracy: 0.296875 f1_score: 0.4578313253012048
iteration:10 SGD Accuracy: 0.296875 f1_score: 0.4578313253012048
iteration:10 MMnb Accuracy: 0.29910714285714285 f1_score: 0.4584509250548761
iteration:10 Cnb Accuracy: 0.29910714285714285 f1_score: 0.4584509250548761
iteration:10 Cnb_fp Accuracy: 0.29910714285714285 f1_score: 0.4584509250548761
iteration:10 MNnb_fp Accuracy: 0.296875 f1_score: 0.4578313253012048
iteration:20 SGD Accuracy: 0.3096590909090909 f1_score: 0.4532304725168756
iteration:20 MMnb Accuracy: 0.36363636363636365 f1_score: 0.45706371191135736
iteration:20 Cnb Accuracy: 0.36363636363636365 f1_score: 0.45706371191135736
iteration:20 Cnb_fp Accuracy: 0.36363636363636365 f1_score: 0.457

### Dumping the models to disk 

In [None]:
joblib.dumpmp(SGD_cls, "SGD Classifier")
joblib.dumpmp(MNnb, "MNnb Classifier")
joblib.dumpmp(Cnb, "Cnb Classifier")
joblib.dumpmp(MNnb_fp, "Nnb_fp Classifier")
joblib.dumpmp(SGD_cls, "Cnb_fp Classifier")

In [None]:
total_rows_analyzed