In [2]:
import re
import json
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import collections

In [200]:
# Article used as a parsing sample.
url='https://sebenarnya.my/anggota-pdrm-terlibat-dalam-rusuhan-kuil-sri-maha-mariamman-subang-jaya/'
# A timeout keeps the cell from hanging forever on an unresponsive server.
page = requests.get(url, timeout=30)
# Fail here on 4xx/5xx instead of parsing an error page further down.
page.raise_for_status()

In [201]:
# Parse the fetched page and keep only the article body container.
soup = BeautifulSoup(page.text, 'html.parser')
x = soup.find('div', {'class': 'td-post-content'})
# One entry per non-blank line of the body text, with surrounding whitespace removed.
cleanLine = [stripped for stripped in map(str.strip, x.text.split('\n')) if stripped]
cleanLine

['PALSU:',
 'Tular di media sosial satu klip video dan gambar yang mendakwa kononnya seorang anggota PDRM terlibat dalam rusuhan kuil Sri Maha Mariamman, Subang Jaya.',
 'Sumber gambar: Facebook',
 'SEBENARNYA:',
 'Ketua Polis Negara, Tan Sri Dato Sri Mohamad Fuzi Harun menafikan penglibatan anggotanya dalam rusuhan Kuil Sri Maha Mariaman, Seafield, pada pagi Isnin di USJ 25, Putra Heights, Subang Jaya.',
 'Beliau menjelaskan video dan gambar individu yang tular di media sosial berpakaian uniform polis yang dikaitkan dengan rusuhan kuil baru-baru ini adalah tidak benar.',
 '“Itu salah, dia bukan polis, kami dah buat semakan dan hasil, individu terbabit bukan anggota polis, dia hanyalah polis sukarelawan simpanan atau PVR yang telah lama dibuang perkhidmatan.”',
 'SUMBER:',
 'Facebook Rasmi PDRM',
 'MAINKAN PERANAN ANDA!',
 'Sekiranya anda mempunyai maklumat mengenai berita tidak ditentusah yang melibatkan kepentingan awam mahupun negara,',
 'Salurkan Kepada Kami',
 'TIDAK PASTI JANGAN 

In [58]:
# Sample articles kept for exercising the parser. They are stored as string
# constants because a bare URL is not valid Python in a code cell.

# with audio
URL_WITH_AUDIO = 'https://sebenarnya.my/dakwaan-nota-suara-anggota-atm-pukul-individu-yang-keluar-rumah-ketika-pkp-adalah-palsu/'

# with caption
URL_WITH_CAPTION = 'https://sebenarnya.my/kilang-assb-di-taman-perindustrian-malaysia-china-kuantan-mckip-terbakar/'

786

In [184]:
class Processing:
    '''Convert one raw article record into a flat dict ready for storage.

    All work happens in ``__init__``: the article HTML is split into labelled
    text sections, a label/confidence pair is derived from keyword matches,
    and audio/image figures are collected. The result is kept in
    ``json_file``.
    '''

    # A section heading is a line of capital letters ending in a colon,
    # e.g. 'SEBENARNYA:'. The pattern is applied with re.match, so it is
    # anchored at the start of the line.
    _SECTION_KEY = re.compile("[A-Z]*:$")
    # The last heading introduces the source list when, lower-cased, it
    # starts with 'sumber'.
    _SOURCE_KEY = re.compile('^sumber:?')

    def __init__(self, db_dict:dict, category:str='COVID-19'):
        '''take input of dict retrieved from mongoDB

        Args:
            db_dict: raw record; the keys 'date', 'url', 'title' and
                'content_html' are read.
            category: value stored under 'category' in the output record.
                Defaults to the value that used to be hard-coded.
        '''
        self.raw = db_dict
        self.date = db_dict['date']
        self.url =  db_dict['url']
        self.title = self.raw['title']
        self.category = category
        self.content_text, self.content_lines, self.fact_src, self.content_html = self.parse_content()
        self.label_map = self.get_label_map()
        self.label, self.confidence = self.get_label_n_confidence()
        self.audios, self.images = self.get_figures()
        self.json_file = self.to_json()

    def get_label_map(self):
        '''Return the mapping of label (as a string) to its keyword regexes.

        The regexes are searched in the lower-cased title and content lines.
        '''
        # NOTE(review): the '.' inside '(:|.|$)' is an unescaped wildcard, so
        # that group matches any character or the end of the string. It is
        # left unchanged to keep existing labels stable; confirm whether a
        # literal full stop was intended.
        label_map = {
            '1' : ['tidak benar(:|.|$)', 'palsu(:|.|$)'],
            '2' : ['^waspada'],
            '3' : ['penjelasan(:|.|$)', '^makluman'],
        }
        return label_map

    @classmethod
    def _split_sections(cls, lines):
        '''Group content lines under the heading that precedes them.

        Args:
            lines: non-empty, stripped text lines in document order.

        Returns:
            (text_dict, source_lines). ``text_dict`` maps each heading (colon
            included) to its lines joined with newlines. Lines before the
            first heading go under 'free_text'. If the last heading is a
            "sumber" heading, its lines are returned as ``source_lines``
            instead of being stored in ``text_dict``.
        '''
        # Positions come from enumerate rather than lines.index(), so a
        # repeated heading cannot resolve to its first occurrence.
        headings = [(idx, line) for idx, line in enumerate(lines)
                    if cls._SECTION_KEY.match(line)]
        text_dict = {}
        source_lines = []

        first_heading_at = headings[0][0] if headings else len(lines)
        if first_heading_at > 0:
            text_dict['free_text'] = '\n'.join(lines[:first_heading_at])

        # Each section ends where the next heading starts (or at the end).
        section_ends = [idx for idx, _ in headings[1:]] + [len(lines)]
        last = len(headings) - 1
        for n, ((start, key), end) in enumerate(zip(headings, section_ends)):
            body = lines[start + 1:end]
            if n == last and cls._SOURCE_KEY.match(key.lower()):
                source_lines = body
                continue
            joined = '\n'.join(body)
            if key in text_dict:
                # Repeated heading: keep both bodies instead of overwriting.
                joined = '\n'.join(part for part in (text_dict[key], joined) if part)
            text_dict[key] = joined
        return text_dict, source_lines

    def parse_content(self):
        '''Split the article HTML into sections and a list of fact sources.

        Returns:
            (text_dict, lines, fact_src, html) where ``text_dict`` is the
            heading -> text mapping, ``lines`` are the cleaned text lines,
            ``fact_src`` is a list of {'text', 'link'} dicts (empty when the
            article has no trailing "sumber" section) and ``html`` is the
            parsed document serialised back to a string.
        '''
        content = self.raw['content_html']
        # The first element of the stored list is the article body; a plain
        # string or an empty list is tolerated as well.
        if isinstance(content, str):
            html = content
        else:
            html = content[0] if content else ''
        soup = BeautifulSoup(html, 'html.parser')

        # Drop everything from the 'awac-wrapper' block onwards. A blank
        # wrapper is skipped: searching for its text would match at once and
        # discard real content.
        all_text = soup.text
        wrapper = soup.find('div',{'class':'awac-wrapper'})
        if wrapper is not None and wrapper.text.strip():
            cut = all_text.find(wrapper.text)
            if cut >= 0:
                all_text = all_text[:cut]
        lines = [line.strip() for line in all_text.split('\n') if line.strip() != '']

        text_dict, source_lines = self._split_sections(lines)

        fact_src = []
        for line in source_lines:
            # Link of the anchor whose text is exactly this line, if any.
            anchor = soup.find('a', href=True, text=line)
            fact_src.append({
                'text' : line,
                'link' : '' if anchor is None else anchor['href'],
            })

        return text_dict, lines, fact_src, str(soup)

    def get_label_n_confidence(self):
        '''Derive (label, confidence) from keyword hits in title and content.

        Every regex hit in the title or in a content line casts one vote for
        its label. The label with the most votes wins. Confidence is 1 when
        all votes agree, 2 when several labels were hit, and 3 when nothing
        matched (label then defaults to 1).
        '''
        title = self.title.lower()
        lowered_lines = [line.lower() for line in self.content_lines]
        keyword_found = []
        for key, patterns in self.label_map.items():
            for regex in patterns:
                if re.search(regex, title):
                    keyword_found.append(key)
                keyword_found.extend(
                    key for line in lowered_lines if re.search(regex, line))

        if not keyword_found:
            label = 1    # default label is 1
            confidence = 3 # if nothing is found give lowest confidence
        else:
            counter = collections.Counter(keyword_found)
            label = int(counter.most_common(1)[0][0])
            confidence = 1 if len(set(keyword_found)) == 1 else 2
        return (label, confidence)

    @staticmethod
    def _caption_of(figure):
        '''Caption text of a figure, or [] when it has no figcaption.'''
        caption = figure.find('figcaption')
        # Plain text is stored rather than the parser's tag object, so the
        # record holds only built-in types.
        return [] if caption is None else caption.get_text().strip()

    def get_figures(self):
        '''Collect audio and image sources (with captions) from the article.

        Returns:
            (audios, images): two lists of {'src', 'caption'} dicts.
        '''
        # Re-parse the stored HTML so the method does not rely on a
        # notebook-level ``soup`` variable.
        soup = BeautifulSoup(self.content_html, 'html.parser')
        audios = []
        images = []
        for figure in soup.find_all('figure'):
            audio = figure.find('audio')
            if audio is not None:
                audios.append({
                    'src' : audio.get('src'),
                    'caption' : self._caption_of(figure),
                })
            image = figure.find('img')
            if image is not None:
                images.append({
                    'src' : image.get('src'),
                    'caption' : self._caption_of(figure),
                })
        return audios, images

    def to_json(self):
        '''Assemble the processed fields into the dict that gets stored.'''
        json_file = dict(
            date = self.date,
            category = self.category,
            url = self.url,
            title = self.title,
            content_text = self.content_text,
            images = self.images,
            audios = self.audios,
            fact_src = self.fact_src,
            label = self.label,
            confidence = self.confidence,
            content_html = self.content_html,
        )
        return json_file

In [6]:
import pymongo
import json
from Processing.Processing import Processing

# Connect with pymongo's default client settings and select the 'news' database.
client = pymongo.MongoClient()
db = client["news"]
# Show which collections exist before picking the raw and processed ones.
print(db.list_collection_names())
coll_raw, coll_processed = db['sebenarnya_v2_test1'], db['sebenarnya_v2_proccessed1']

['sebenarnya_v2_proccessed2', 'sebenarnya_v2_proccessed1', 'sebenarnya_v2_test1', 'sebenarnya_v1_test2']


In [7]:
# Scratch check: an empty dict is falsy.
text_dict = dict()
bool(text_dict)

False

In [7]:
# Scratch check: strip() with an argument removes that character from both ends.
a = "asdsadsa:"
str.strip(a, ':')

'asdsadsa'

In [None]:
# coll_processed.delete_many({})
# Index the cursor instead of materialising the whole collection in a list
# just to look at one document.
coll_processed.find()[111]


In [8]:
# Process every raw record except the first and store the resulting dicts.
# NOTE(review): re-running this cell inserts the same records again.
raw_docs = list(coll_raw.find())
for raw_dict in raw_docs[1:]:
    cls = Processing(raw_dict)
    processed_doc = cls.json_file
    coll_processed.insert_one(processed_doc)

In [3]:
# Fetch one raw record by its exact title and show where it came from.
title_query = {'title': '4 Nota Suara Yang Menggunakan Nama MKN Dakwa Darurat Akan Diisytihar Adalah Palsu'}
x = coll_raw.find_one(title_query)
x['url']

'https://sebenarnya.my/4-nota-suara-yang-menggunakan-nama-mkn-dakwa-darurat-akan-diisytihar-adalah-palsu/'

In [206]:
# NOTE: `db_dict` is only assigned in a later cell (`db_dict = raw_dict`), so
# this check works only after that cell has been run.
type(db_dict)

dict

In [7]:
# Step-by-step replay of the text extraction on the last record left over
# from the processing loop (`raw_dict`).
db_dict = raw_dict

title = db_dict['title']
content = db_dict['content_html']
soup = BeautifulSoup(content[0], 'html.parser')

# Cut the text where the 'awac-wrapper' block begins; keep everything if the
# block is absent.
awac_wrapper = soup.find('div', {'class': 'awac-wrapper'})
if awac_wrapper:
    rm_index = soup.text.find(awac_wrapper.text)
else:
    rm_index = len(soup.text)
all_text = soup.text[:rm_index]

# Non-blank lines with surrounding whitespace removed.
lines = [stripped for stripped in map(str.strip, all_text.split('\n')) if stripped]

In [10]:
# Build a Processing instance from the record held in `db_dict`; parsing,
# labelling and figure extraction all run inside its __init__.
cls = Processing(db_dict)

['SEBENARNYA:', 'SUMBER:'] 4 Nota Suara Yang Menggunakan Nama MKN Dakwa Darurat Akan Diisytihar Adalah Palsu


In [12]:
# Title as copied from the raw record by Processing.__init__.
cls.title

'4 Nota Suara Yang Menggunakan Nama MKN Dakwa Darurat Akan Diisytihar Adalah Palsu'

In [10]:
# Collapse the extracted text onto a single line for a quick visual check.
all_text.replace('\n', ' ').strip()

'SEBENARNYA: Jabatan Kesihatan Negeri Terengganu ingin merujuk kepada satu kenyataan yang tular di aplikasi WhatsApp berkenaan kononnya terdapat 30 kes COVID-19 di Durian Burung, Kuala Terengganu berpunca daripada majlis tahlil kematian dan 30 orang ahli keluarga telah dikuarantin di Chendering. Pihak jabatan ini menafikan sepenuhnya kenyataan tersebut. Sehingga 3 April 2020, tiada kes COVID-19 yang dikesan di Kg Durian Burung, Kuala Terengganu. SUMBER: Facebook Rasmi JKNT'