-
Notifications
You must be signed in to change notification settings - Fork 1
/
thread.py
60 lines (42 loc) · 1.2 KB
/
thread.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 2 21:29:29 2019
@author: innerm
"""
import json
import pandas as pd
import re
def clear_title(data):
    """Return *data* with e-mail-like tokens and apostrophes blanked out.

    The clean-up runs in this order:

    1. every whitespace-delimited token containing ``@`` (together with one
       trailing whitespace character, if present) becomes a single space;
    2. every run of whitespace is collapsed to a single space;
    3. every apostrophe is replaced by a space.

    Step 3 runs after the collapse, so an apostrophe adjacent to a space
    leaves consecutive spaces in the result.

    data: the title string to clean.
    Returns the cleaned string.
    """
    # Raw strings: '\S' and '\s' are not valid Python string escapes and
    # trigger a warning when written in a plain string literal.
    data = re.sub(r'\S*@\S*\s?', ' ', data)
    data = re.sub(r'\s+', ' ', data)
    # A literal single character needs no regex.
    return data.replace("'", ' ')
# Input tables; judging by the variable names, one per language (EN / RU)
# -- TODO confirm against the pipeline that produces them.
file_en = 'stage41.csv'
file_ru = 'stage42.csv'

# Stack BOTH tables into a single frame.  Appending the first table twice
# would duplicate its rows and silently drop the second file.  pd.concat is
# used because DataFrame.append no longer exists in pandas >= 2.0.
df = pd.concat(
    [pd.read_csv(file_en), pd.read_csv(file_ru)],
    ignore_index=True,
)

# Keep only the rows whose real_news flag equals 1.
df = df[df['real_news'] == 1]
# (theme id, category label) pairs; the id is matched against df['theme'].
categories = [
    (1, 'society'),
    (2, 'economy'),
    (3, 'technology'),
    (4, 'entertainment'),
    (5, 'science'),
    (6, 'sport'),
    (7, 'others'),
]

# For every category, print one JSON document per thread of the form
#   {"category:<label>": {"thread:<title>": {"articles": [<files>...]}}}
for theme_id, label in categories:
    in_theme = df[df['theme'] == theme_id]
    # Sort first, then filter: threads are emitted in the row order that
    # results from sorting by rate_thread, highest first.
    ranked = in_theme.sort_values(by=['rate_thread'], ascending=False)
    ranked = ranked[ranked['rate_thread'] > 3]

    # Distinct thread ids in first-seen order, collected in a single pass
    # (counting each id with list.count would be quadratic, and the counts
    # are never used).
    thread_ids = dict.fromkeys(ranked['thread'].tolist())

    category_key = 'category:' + label
    for thread_id in thread_ids:
        members = ranked[ranked['thread'] == thread_id]
        members = members.sort_values(by=['prob'], ascending=False)

        # The thread is titled after its highest-prob row.
        title = clear_title(members['title'].tolist()[0])
        articles = members['files'].tolist()

        payload = {category_key: {'thread:' + title: {'articles': articles}}}
        print(json.dumps(payload, sort_keys=True, indent=4))