## Exploratory data analysis
Getting insights from data

### 1. Import libraries

In [1]:
import os
import warnings

import pandas as pd
import pymongo
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
warnings.filterwarnings("ignore")

### 2. Get data from MongoDB

In [4]:
# The connection string (which embeds the account key) must never be stored in
# the notebook. Read it from the environment instead.
# NOTE(review): the key that was previously committed here should be rotated.
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string."
    )

client = pymongo.MongoClient(uri)
NewsDataset = client['NewsDataset']

# Get name of all existing collections in the NewsDataset
print(NewsDataset.list_collection_names())

# Collection holding the preprocessed articles used in the rest of the notebook.
VNFD = NewsDataset['VNFDPreprocessed']

['VNFD', 'VNFDPreprocessed']


### 3. Convert to a DataFrame

In [5]:
# Fetch the collection once; the previous version issued one query to read the
# first document's keys and a second one to iterate over all documents.
docs = list(VNFD.find())

# Column names are taken from the first document (empty list if the collection
# has no documents, instead of raising IndexError).
columns = list(docs[0].keys()) if docs else []

# One list per column. `.get` yields None for a document that lacks a field
# rather than aborting the whole load with KeyError.
data = {col: [doc.get(col) for doc in docs] for col in columns}

In [6]:
# Build the frame from the per-column lists and discard the `_id` column.
df = pd.DataFrame(data).drop(columns=['_id'])
df.head()

Unnamed: 0,title,content,url,label,imageURL,domain,topic,author,date
0,Bức vẽ giúp bạn đánh giá mức độ stress của bản...,"[tranh, thể, vòng, tròn, động, chậm, rãi, hình...",https://suckhoe.vnexpress.net/tin-tuc/tu-van/b...,1,https://i-suckhoe.vnecdn.net/2018/11/16/onh-15...,suckhoe.vnexpress.net,,[],2018-11-17 12:15:00
1,KHẨN CẤP: Hàng loạt trẻ em nhập viện vì ngộ độ...,"[xét, nghiệm, ban, đầu, trẻ, nhập, viện, bé, d...",http://phapluat.news/khan-cap-hang-loat-tre-em...,1,http://img-static.phapluat.news/2017/10/H1.jpg,phapluat.news,,[],2017-10-04 14:29:00
2,"Không đủ tiền tiêu hủy, Chi cục Thú y đề nghị ...","[289, ubnd, thành, phố, chi, cục, thú, y, văn,...",http://phapluat.news/khong-du-tien-tieu-huy-ch...,1,http://img-static.phapluat.news/2017/09/h1-4.jpg,phapluat.news,,[],2017-09-28 11:48:06
3,KHẨN CẤP: Xuất hiện xe bắt chó giả ở TP.HCM,"[suốt, lực, săn, bắt, chó, đi, tuần, tra, rộng...",http://autoxe.net/doi-song/khan-cap-xuat-hien-...,1,http://autoxe.net/wp-content/uploads/2017/09/u...,autoxe.net,,[],2017-09-13 15:05:28
4,Đề xuất cấm tất cả công chức Hà Nội đổ xăng tạ...,"[chiều, 1110, ubnd, tphà, nội, công, văn, hiệp...",http://phapluat.news/de-xuat-cam-tat-ca-cong-c...,1,http://img-static.phapluat.news/2017/10/h1-2.jpg,phapluat.news,,[],2017-10-11 16:28:55


In [7]:
# Summarise column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242 entries, 0 to 241
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     242 non-null    object
 1   content   242 non-null    object
 2   url       242 non-null    object
 3   label     242 non-null    int64 
 4   imageURL  242 non-null    object
 5   domain    242 non-null    object
 6   topic     0 non-null      object
 7   author    242 non-null    object
 8   date      229 non-null    object
dtypes: int64(1), object(8)
memory usage: 17.1+ KB


#### 3.1 Preprocessing (this section will be deleted once we have clean data)

In [8]:
# Remove rows whose `content` is missing and renumber the index from 0.
# A single vectorised call replaces the per-row drop loop, which re-copied the
# frame on every drop and relied on chained indexing with `== None`.
df = df.dropna(subset=['content']).reset_index(drop=True)

In [9]:
# Re-check dtypes and non-null counts after the missing-content filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242 entries, 0 to 241
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     242 non-null    object
 1   content   242 non-null    object
 2   url       242 non-null    object
 3   label     242 non-null    int64 
 4   imageURL  242 non-null    object
 5   domain    242 non-null    object
 6   topic     0 non-null      object
 7   author    242 non-null    object
 8   date      229 non-null    object
dtypes: int64(1), object(8)
memory usage: 17.1+ KB


#### 3.2 Save CSV file (optional)

In [12]:
# Save df
#df.to_csv('Dataset/VNFD.csv', index = False)

### 4. Get insights

4.1 Get brief information

4.2 Check whether the class distribution is balanced

4.3 Statistical information of content

...

#### 4.x Topic extraction

In [81]:
#Use when the content has not been preprocessed
def retrieve_topic(content, num_keywords=5, random_state=42):
    """Extract the top keywords of one raw (un-tokenised) text.

    The text is lower-cased and tokenised with NLTK, tokens that are not purely
    alphabetic are dropped, and a single-topic LDA model is fitted on that one
    document. The highest-weighted words of the topic are returned.

    NOTE: a later cell defines another ``retrieve_topic`` for pre-tokenised
    content; whichever definition is executed last is the one in effect.

    Args:
        content: Raw text of a single document (``str``), or ``None``.
        num_keywords: Maximum number of keywords to return.
        random_state: Seed forwarded to ``LdaModel`` so repeated runs yield the
            same keywords. Pass ``None`` for unseeded behaviour.

    Returns:
        A list of at most ``num_keywords`` words. Empty when the text is
        missing, blank, or contains no alphabetic tokens.
    """
    # Nothing to tokenise: avoid calling into NLTK/gensim at all.
    if content is None or not content.strip():
        return []

    tokens = word_tokenize(content.lower())
    filtered_tokens = [token for token in tokens if token.isalpha()]
    if not filtered_tokens:
        # gensim raises ValueError when asked to fit LDA on an empty vocabulary.
        return []

    dictionary = corpora.Dictionary([filtered_tokens])
    corpus = [dictionary.doc2bow(filtered_tokens)]
    lda_model = LdaModel(
        corpus,
        num_topics=1,
        id2word=dictionary,
        random_state=random_state,
    )
    # show_topic returns (word, weight) pairs; keep only the words.
    return [word for word, _ in lda_model.show_topic(0, topn=num_keywords)]

In [13]:
# Use when the content has been preprocessed
def retrieve_topic(content, num_keywords=5, random_state=42):
    """Extract the top keywords of one pre-tokenised document.

    A single-topic LDA model is fitted on the given token list and the
    highest-weighted words of that topic are returned.

    Args:
        content: Sequence of tokens for a single document, or ``None``.
        num_keywords: Maximum number of keywords to return.
        random_state: Seed forwarded to ``LdaModel`` so repeated runs yield the
            same keywords. Pass ``None`` for unseeded behaviour.

    Returns:
        A list of at most ``num_keywords`` words. Empty when the document is
        missing or has no tokens.
    """
    # gensim raises ValueError when asked to fit LDA on an empty vocabulary,
    # so a document without tokens simply has no keywords.
    if content is None or len(content) == 0:
        return []

    dictionary = corpora.Dictionary([content])
    corpus = [dictionary.doc2bow(content)]
    lda_model = LdaModel(
        corpus,
        num_topics=1,
        id2word=dictionary,
        random_state=random_state,
    )
    # show_topic returns (word, weight) pairs; keep only the words.
    return [word for word, _ in lda_model.show_topic(0, topn=num_keywords)]

In [16]:
#df['content'] = df['content'].astype('string') #Use when the content has not been preprocessed
# Compute the keywords for every document and assign the column in one step.
# The previous `df['topic_extract'][i] = ...` loop was chained assignment: it
# writes through a temporary Series and is not guaranteed to update `df`
# (and the warning about it is hidden by the global warnings filter).
df['topic_extract'] = df['content'].apply(retrieve_topic)

In [17]:
# Display the extracted keywords for each document.
df['topic_extract']

0             [ảnh, hình, động, lý, học]
1           [thuốc, heo, an, tiêm, thần]
2           [chó, hoang, viên, thú, cầm]
3               [chó, bắt, xe, cục, chi]
4       [xăng, doanh, nghiệp, dầu, việt]
                     ...                
237        [hiện, tai, trường, nạn, ảnh]
238            [sở, vi, động, dâm, hàng]
239       [nam, bay, châu, chuyến, việt]
240    [thành, đường, phố, nguyễn, gian]
241          [biển, nẵng, đà, tắm, hằng]
Name: topic_extract, Length: 242, dtype: object