***IMPORTANT: COLAB AUTH CODE HERE***

In [None]:
# Mount Google Drive at /content/drive (re-mounting if it is already mounted).
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


# ***processing components***

In [None]:
!pip install kaggle
import json

# auth API using kaggle.json
with open('drive/MyDrive/kaggle.json') as json_file:
    uploaded = json.load(json_file)

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
!mkdir -p ~/.kaggle/ && cp drive/MyDrive/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

User uploaded file "username" with length 6 bytes
User uploaded file "key" with length 32 bytes


In [None]:
# Download the fake-and-real-news dataset archive (a .zip) with the kaggle CLI.

!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

Downloading fake-and-real-news-dataset.zip to /content
 66% 27.0M/41.0M [00:00<00:00, 33.0MB/s]
100% 41.0M/41.0M [00:00<00:00, 69.8MB/s]


In [None]:
# List the extracted dataset files. The "datasets" directory is only created by the
# zip-extraction cell, so this fails if it is run before that cell.
!ls datasets

ls: cannot access 'datasets': No such file or directory


In [None]:
import zipfile

# Unpack the downloaded archive into the local "datasets" directory.
with zipfile.ZipFile("fake-and-real-news-dataset.zip", mode="r") as archive:
    archive.extractall(path="datasets")

In [None]:
# Build and install the fastText Python bindings from a fresh clone of the upstream
# repository: clone, enter the checkout, pip-install it, then return to the parent directory.
!git clone https://github.com/facebookresearch/fastText.git
%cd fastText
!sudo pip install .
%cd ..

import fasttext

# Downloading the pre-trained English vectors is left disabled; the notebook's
# "loading vectors (not used)" section does not use them.
# fasttext.util.download_model('en', if_exists='ignore')

Cloning into 'fastText'...
remote: Enumerating objects: 3854, done.[K
remote: Total 3854 (delta 0), reused 0 (delta 0), pack-reused 3854[K
Receiving objects: 100% (3854/3854), 8.23 MiB | 32.15 MiB/s, done.
Resolving deltas: 100% (2416/2416), done.
/content/fastText
Processing /content/fastText
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3090553 sha256=f69ae59414fabe29a449a229431e6f89e68fdeea2ad415db07be3ab95acf3d83
  Stored in directory: /tmp/pip-ephem-wheel-cache-vub3_1pi/wheels/a1/9f/52/696ce6c5c46325e840c76614ee5051458c0df10306987e7443
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2
/content


# ***training main***

## loading vectors (not used)

In [None]:
# ft = fasttext.load_model('cc.en.300.bin')

In [None]:
# ft.get_nearest_neighbors('arrow')

## text preprocessing functions
*modified from Charles Malafosse's code*


In [None]:
# taken from FastText sentiment analysis for text by Charles Malafosse
import re
import itertools
 
def load_dict_contractions():
    """Return a freshly built dict mapping contractions / informal forms to their expansions.

    Keys are spelled exactly as listed here (including the capitalised ``I'd``,
    ``I'll``, ``I'm`` and ``I've``). A new dict is created on every call, so
    callers may mutate the result freely.
    """
    expansions = (
        ("aren't", "are not"),
        ("can't", "cannot"),
        ("couldn't", "could not"),
        ("could've", "could have"),
        ("didn't", "did not"),
        ("doesn't", "does not"),
        ("don't", "do not"),
        ("everyone's", "everyone is"),
        ("gimme", "give me"),
        ("gonna", "going to"),
        ("gon't", "go not"),
        ("gotta", "got to"),
        ("hadn't", "had not"),
        ("hasn't", "has not"),
        ("haven't", "have not"),
        ("he'd", "he would"),
        ("he'll", "he will"),
        ("he's", "he is"),
        ("he've", "he have"),
        ("how'd", "how would"),
        ("how'll", "how will"),
        ("how're", "how are"),
        ("how's", "how is"),
        ("I'd", "I would"),
        ("I'll", "I will"),
        ("I'm", "I am"),
        ("isn't", "is not"),
        ("it'd", "it would"),
        ("it'll", "it will"),
        ("it's", "it is"),
        ("I've", "I have"),
        ("kinda", "kind of"),
        ("let's", "let us"),
        ("might've", "might have"),
        ("mustn't", "must not"),
        ("must've", "must have"),
        ("she'd", "she would"),
        ("she'll", "she will"),
        ("she's", "she is"),
        ("shouldn't", "should not"),
        ("should've", "should have"),
        ("somebody's", "somebody is"),
        ("someone's", "someone is"),
        ("something's", "something is"),
        ("that'd", "that would"),
        ("that'll", "that will"),
        ("that're", "that are"),
        ("that's", "that is"),
        ("there'd", "there would"),
        ("there'll", "there will"),
        ("there're", "there are"),
        ("there's", "there is"),
        ("they'd", "they would"),
        ("they'll", "they will"),
        ("they're", "they are"),
        ("they've", "they have"),
        ("those're", "those are"),
        ("wanna", "want to"),
        ("wasn't", "was not"),
        ("we'd", "we would"),
        ("we'll", "we will"),
        ("we're", "we are"),
        ("weren't", "were not"),
        ("we've", "we have"),
        ("what'd", "what did"),
        ("what'll", "what will"),
        ("what're", "what are"),
        ("what's", "what is"),
        ("what've", "what have"),
        ("when's", "when is"),
        ("where'd", "where did"),
        ("where're", "where are"),
        ("where's", "where is"),
        ("where've", "where have"),
        ("which's", "which is"),
        ("who'd", "who would"),
        ("who'll", "who will"),
        ("who're", "who are"),
        ("who's", "who is"),
        ("who've", "who have"),
        ("why'd", "why did"),
        ("why're", "why are"),
        ("why's", "why is"),
        ("won't", "will not"),
        ("wouldn't", "would not"),
        ("would've", "would have"),
        ("y'all", "you all"),
        ("you'd", "you would"),
        ("you'll", "you will"),
        ("you're", "you are"),
        ("you've", "you have"),
    )
    return dict(expansions)
 
def strip_accents(text):
    """Return ``text`` reduced to ASCII, keeping the base letter of accented characters.

    The text is canonically decomposed (NFD) first, so an accented letter such as
    ``é`` becomes ``e`` plus a combining mark; only the mark is then discarded by
    the ASCII encode. Without the decomposition the whole letter was deleted
    ("café" -> "caf"). Characters with no ASCII base (curly quotes, symbols, ...)
    are still dropped.

    Args:
        text: the string to clean.

    Returns:
        The ASCII-only string, or ``text`` unchanged if it contains ``ø`` or ``Ø``.
    """
    # Local import keeps this block self-contained; unicodedata is standard library.
    import unicodedata

    if 'ø' in text or 'Ø' in text:
        # Do nothing when finding ø: it has no canonical decomposition, so it would
        # simply be deleted. The text is returned completely untouched in that case.
        return text

    decomposed = unicodedata.normalize('NFD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')
 
def text_cleaning_for_analysis(text):
    """Normalise one raw text into lower-case, single-spaced plain text.

    Steps, in order:
      1. replace stray ``'\\x92'`` characters with an apostrophe;
      2. remove anything shaped like ``scheme://...``;
      3. delete the punctuation characters ``. , ! ? : ; - = ( )``;
      4. lower-case and turn ``’`` into ``'``;
      5. expand contractions from ``load_dict_contractions`` (matched
         case-insensitively, ignoring quote marks glued to the word);
      6. delete the boilerplate strings ``reuters``, ``washington`` and
         ``21st century wire says``;
      7. squeeze runs of 3+ identical non-digit characters down to two;
      8. strip accents via ``strip_accents`` and collapse whitespace.

    Stop words are deliberately NOT removed.

    Args:
        text: raw text (str).

    Returns:
        The cleaned text as a str with single spaces and no leading/trailing space.
    """
    # Special case not handled previously (presumably a mis-decoded cp1252 apostrophe).
    text = text.replace('\x92', "'")

    # Removal of address. Raw strings avoid the invalid-escape warnings the
    # previous non-raw patterns triggered; the patterns themselves are equivalent.
    text = ' '.join(re.sub(r"(\w+://\S+)", " ", text).split())

    # Removal of punctuation: characters are deleted, not replaced by a space.
    text = re.sub(r"[.,!?:;\-=()]", "", text)

    # Lower case, and normalise the typographic apostrophe so contractions can match.
    text = text.lower().replace("’", "'")

    # CONTRACTIONS source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
    # The text is already lower-case, so the table is lower-cased too; otherwise
    # the capitalised keys ("I'm", "I've", ...) could never match.
    contractions = {
        key.lower(): value.lower()
        for key, value in load_dict_contractions().items()
    }
    edge_quotes = '"“”‘'

    def expand(word):
        # Look the word up without quote marks attached to either end, so that
        # e.g. “we're is still recognised; the quote marks themselves are kept.
        core = word.strip(edge_quotes)
        if core not in contractions:
            return word
        lead = len(word) - len(word.lstrip(edge_quotes))
        return word[:lead] + contractions[core] + word[lead + len(core):]

    text = " ".join(expand(word) for word in text.split())

    # removing overused words
    text = re.sub('reuters|washington|21st century wire says', '', text)

    # Standardizing words: "coooool" -> "cool". Digit runs are left intact so
    # that numbers are not corrupted ("1000" used to become "100").
    text = ''.join(
        ''.join(run) if char.isdigit() else ''.join(run)[:2]
        for char, run in itertools.groupby(text)
    )

    # Strip accents, then collapse every whitespace run to a single space.
    text = strip_accents(text)
    text = ' '.join(text.split())

    # DO NOT REMOVE STOP WORDS FOR SENTIMENT ANALYSIS - OR AT LEAST NOT NEGATIVE ONES

    return text

In [None]:
def transform_instance(label, text):
  """Build one training row: a ``__label__`` token followed by the cleaned tokens.

  The label name is the lower-cased text after the last ``/`` in ``label``, cut at
  its first ``.``. ``text`` is cleaned with ``text_cleaning_for_analysis`` and
  tokenised with ``fasttext.tokenize``.
  """
  label_name = label.lower().rsplit('/', 1)[-1].partition('.')[0]
  tokens = fasttext.tokenize(text_cleaning_for_analysis(text))
  return ["__label__" + label_name, *tokens]

In [None]:
import csv
 
def preprocess(input_files, output_file, keep=1):
    """Convert CSV files into one text file of space-separated, labelled rows.

    For every row of every input CSV, the second column (index 1) is taken as
    the text and passed with the file path to ``transform_instance``; the
    resulting tokens are written to ``output_file`` joined by single spaces,
    one row per line. Rows whose second column is the literal header ``text``
    or is empty are skipped, as are rows with fewer than two columns.

    Args:
        input_files: iterable of CSV file paths (strings).
        output_file: path of the text file to (over)write.
        keep: currently unused; kept so existing callers keep working.

    Prints the running count of CSV rows read every 6000 rows.
    """
    rows_read = 0
    with open(output_file, 'w') as outfile:
        for input_file in input_files:
            with open(input_file, 'r', newline='') as csvinfile:  # ,encoding='latin1'
                csv_reader = csv.reader(csvinfile, delimiter=',', quotechar='"')
                for row in csv_reader:
                    rows_read += 1
                    if rows_read % 6000 == 0:
                        print(rows_read)

                    # Blank lines come back as [] and malformed rows may lack a
                    # text column; indexing row[1] on those used to raise IndexError.
                    if len(row) < 2 or row[1] in ('text', ''):
                        continue

                    # Written as plain text rather than through csv.writer: the
                    # csv writer wraps any token containing '"' in quotes and
                    # doubles the quote characters, which corrupts the output.
                    tokens = transform_instance(input_file, row[1])
                    outfile.write(' '.join(tokens) + '\n')

## dataset upsampling and allocating function
*upsampling from Charles Malafosse's code*

In [None]:
def upsamplingAllocating(input_file, output_file, ratio_upsampling=1, validation_count=1000):
    """Split labelled rows into a validation file and a class-balanced training file.

    Each row of ``input_file`` is expected to start with its label as the first
    whitespace-delimited token. The first ``validation_count // number_of_labels``
    rows of every label go to ``<output_file>.validation``. The remaining rows go
    to ``<output_file>.train``, where every non-majority label is upsampled by
    repeating its rows from the start until it has as many rows as the majority
    label has left after the validation split.

    Args:
        input_file: path of the labelled text file to read.
        output_file: base path (str or Path); ".validation" and ".train" are
            appended to its existing suffix.
        ratio_upsampling: currently unused; kept so existing callers keep working.
        validation_count: total number of validation rows, split evenly per label.

    Prints progress counters, the per-label training sizes and the total number
    of training rows written.
    """
    # Local import so the function also accepts plain string paths.
    from pathlib import Path

    output_file = Path(output_file)

    # GET LABEL LIST AND GET DATA PER LABEL
    dict_data_by_label = {}
    rows_read = 0
    with open(input_file, 'r', newline='') as csvinfile:
        csv_reader = csv.reader(csvinfile, delimiter=',', quotechar='"')
        for row in csv_reader:
            rows_read += 1
            if rows_read % 10000 == 0:
                print("read" + str(rows_read))
            # Blank / whitespace-only lines carry no label; indexing them used to crash.
            if not row or not row[0].split():
                continue
            dict_data_by_label.setdefault(row[0].split()[0], []).append(row[0])

    # FIND MAJORITY CLASS (first label wins on ties)
    majority_class = max(
        dict_data_by_label,
        key=lambda label: len(dict_data_by_label[label]),
        default="",
    )

    # allocate validation
    label_count = len(dict_data_by_label)
    validation_per_label = validation_count // label_count if label_count else 0
    val_data = []
    for item in dict_data_by_label:
        val_data.extend(dict_data_by_label[item][:validation_per_label])
        dict_data_by_label[item] = dict_data_by_label[item][validation_per_label:]
        print(f"{item}: {len(dict_data_by_label[item])}")

    # Majority size is measured AFTER the split instead of subtracting a
    # hard-coded 500, which only held for validation_count=1000 with two labels.
    count_majority_class = len(dict_data_by_label.get(majority_class, []))

    # UPSAMPLE MINORITY CLASS (and split data)
    data_upsampled = []
    for item, rows in dict_data_by_label.items():
        data_upsampled.extend(rows)
        # An empty class cannot be upsampled; skipping it avoids an endless loop.
        if item == majority_class or not rows:
            continue
        items_to_add = count_majority_class - len(rows)
        items_added = 0
        while items_added < items_to_add:
            batch = rows[:items_to_add - items_added]
            data_upsampled.extend(batch)
            items_added += len(batch)

    # write validation
    validation_path = output_file.with_suffix(output_file.suffix + ".validation")
    with open(validation_path, 'w') as txtoutfile:
        for row in val_data:
            txtoutfile.write(row + '\n')

    # WRITE TRAIN
    print(len(data_upsampled))
    rows_written = 0
    train_path = output_file.with_suffix(output_file.suffix + ".train")
    with open(train_path, 'w') as txtoutfile:
        for row in data_upsampled:
            txtoutfile.write(row + '\n')
            rows_written += 1
            if rows_written % 10000 == 0:
                print("train" + str(rows_written))
    print(rows_written)

## preprocessing datasets

In [None]:
import glob
from pathlib import Path

# Turn every CSV in the extracted dataset folder into a single labelled text file.
data_folder = Path("datasets")
datasets_glob = str(data_folder / "*.csv")

csv_paths = glob.glob(datasets_glob)
preprocess(csv_paths, data_folder / "news.data")

6000
12000
18000
24000
30000
36000
42000


In [None]:
# Write datasets/news.validation and the upsampled datasets/news.train from news.data.
upsamplingAllocating(
    input_file=data_folder / "news.data",
    output_file=data_folder / "news",
)

read10000
read20000
read30000
read40000
__label__true: 20917
__label__true the leader of a group of us house republican conservatives said on monday he expects to see text of a revamped bill to repeal and replace obamacare within 24 hours we're waiting to see what the legislative text actually outlines but we remain open minded and willing to look at the details of the plan representative mark meadows chairman of the house of representatives freedom caucus told reporters after a meeting of the group which helped kill a white housebacked plan last month we're hopeful that we will get the legislative text within the next 24 hours
__label__fake: 22981
__label__fake a louisiana deputy city marshal is in hot water after body cam footage revealed him shooting at a suspect who had his hands up in the air this is sadly becoming too common a story in the news as awareness of problems with officers who have hairtrigger fingers grows what makes this story sadly and horrifically unique though is t

## training model

In [None]:
import datetime
import fasttext
import os
 
def train(training_data_path, validation_data_path, autotune_duration=6000, autotune_model_size="2M"):
    """Train a supervised fastText classifier with autotuning and report its scores.

    Args:
        training_data_path: path (str) of the fastText-format training file.
        validation_data_path: path (str) of the held-out file; it drives the
            hyper-parameter autotuning and is also scored after training.
        autotune_duration: autotune time budget passed as ``autotuneDuration``
            (default 6000, the previously hard-coded value).
        autotune_model_size: size constraint passed as ``autotuneModelSize``
            (default "2M", the previously hard-coded value).

    Returns:
        The trained fastText model.

    Raises:
        Exception: any error raised during training or evaluation is logged and
            re-raised, so the caller never receives ``None`` in place of a model.
    """
    print('Training start')
    try:
        print(str(datetime.datetime.now()) + ' START=>')

        # Train the model. Autotune must validate on held-out data; validating
        # on the training file selects whichever settings memorise it best.
        model = fasttext.train_supervised(
            input=training_data_path,
            autotuneValidationFile=validation_data_path,
            autotuneDuration=autotune_duration,
            autotuneModelSize=autotune_model_size,
        )

        # CHECK PERFORMANCE
        print(str(datetime.datetime.now()) + 'Training complete.')

        result = model.test(training_data_path)
        validation = model.test(validation_data_path)

        # DISPLAY ACCURACY OF TRAINED MODEL
        # NOTE(review): per the fastText docs model.test returns
        # (sample count, precision, recall), so [1:] is precision/recall - confirm.
        text_line = "accuracy:" + str(result[1:]) + ", validation:" + str(validation[1:]) + '\n'
        print(text_line)

        # NOTE(review): there is no explicit quantize() call here; presumably the
        # autotuneModelSize constraint already yields a quantized model - confirm.
        print("Model is quantized!")
        return model

    except Exception as e:
        print('Exception during training: ' + str(e))
        # Re-raise: returning None only postponed the failure to the first use of the model.
        raise

In [None]:
# Train on datasets/news.train and score against datasets/news.validation.
model = train(
    training_data_path=str(data_folder / "news.train"),
    validation_data_path=str(data_folder / "news.validation"),
)

Training start
2021-01-05 02:56:47.397607 START=>
2021-01-05 05:13:52.172013Training complete.
accuracy:(0.9871850659240242, 0.9871850659240242), validation:(0.999, 0.999)

Model is quantized!!


In [None]:
# Sanity check: show the training file's path and its fourth line.
print(data_folder / "news.train")
with open(data_folder / "news.train", 'r') as train_file:
  lines = train_file.read().split("\n")
print(lines[3])

datasets/news.train
__label__true us budget chief mick mulvaney on tuesday told staff at the consumer financial protection bureau to disregard instructions from leandra english the deputy director according to a memo consistent with my email from yesterday please disregard any email sent by or instructions you receive from ms english when she is purporting to act as the acting director mulvaney wrote in an email to staff tuesday morning mulvaney and english the agency's deputy director are in a legal fight over who should control the agency following the friday resignation of director richard cordray


In [None]:
# Persist the trained model to the mounted Drive, creating the folder if needed.
models_folder = Path("drive/MyDrive/fake news models/")
models_folder.mkdir(parents=True, exist_ok=True)

model_path = models_folder / "news-classifier-09.ftz"
model.save_model(str(model_path))