-
Notifications
You must be signed in to change notification settings - Fork 1
/
top.py
66 lines (56 loc) · 1.62 KB
/
top.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 2 22:26:54 2019
@author: innerm
"""
import json
import pandas as pd
import re
def clear_title(data: str) -> str:
    """Normalise a title string for use as an output label.

    Three substitutions are applied in order:

    1. every whitespace-delimited token containing ``@`` (plus one trailing
       whitespace character, if present) is replaced by a single space;
    2. each run of whitespace is collapsed to a single space;
    3. every apostrophe is replaced by a space.

    Step 3 runs after the collapse, so an apostrophe that sits between
    spaces leaves a run of spaces in the result.

    Args:
        data: The raw title text.

    Returns:
        The cleaned title.
    """
    # Raw strings: the regex escapes must reach the regex engine untouched
    # instead of being parsed as (invalid) string escape sequences.
    data = re.sub(r'\S*@\S*\s?', ' ', data)
    data = re.sub(r'\s+', ' ', data)
    # A literal single-character replacement needs no regex.
    return data.replace("'", " ")
# Input CSV files. Judging by the variable names these hold the English and
# Russian article sets -- NOTE(review): confirm against the producing stage.
file_en = 'stage41.csv'
file_ru = 'stage42.csv'

df1 = pd.read_csv(file_en)
df2 = pd.read_csv(file_ru)
# Combine BOTH inputs into one frame. `pd.concat` keeps each frame's own row
# index, matching what chained `DataFrame.append` calls (removed in
# pandas 2.0) used to do.
df = pd.concat([df1, df2])
del df1, df2

# Keep only rows flagged as real news.
df = df[df.real_news == 1]

# (theme id, category name) pairs; the id is matched against the `theme`
# column further down in the script.
categories = [(1, 'society'), (2, 'economy'), (3, 'technology'),
              (4, 'entertainment'), (5, 'science'), (6, 'sport'),
              (7, 'others')]

# Order rows by thread rating, best first, and drop threads rated 3 or lower.
df = df.sort_values(by=['rate_thread'], ascending=False)
df = df[df['rate_thread'] > 3]

# Distinct thread ids in order of first appearance (i.e. by descending
# rating). `dict.fromkeys` de-duplicates in one pass while preserving order.
threads = list(dict.fromkeys(df.thread.tolist()))

# Print one JSON object per thread: {"thread:<title>": {"articles": [...]}}.
for thread_id in threads:
    dd = df[df['thread'] == thread_id]
    if dd.empty:
        # A NaN thread id never compares equal to itself, so the selection
        # is empty and there is no title to report.
        continue
    # Within a thread, rows are ordered by `prob`, highest first; the title
    # of the top row labels the whole thread.
    dd = dd.sort_values(by=['prob'], ascending=False)
    tt = clear_title(dd.title.tolist()[0])
    files = dd.files.tolist()
    edict = {'articles': files}
    exp2 = 'thread:' + tt
    n = {exp2: edict}
    print(json.dumps(n, sort_keys=True, indent=3))
# Per-category report: for every (theme id, category name) pair, print one
# JSON object per thread of that theme, shaped as
# {"category:<name>": {"thread:<title>": {"articles": [...]}}}.
for theme_id, theme_name in categories:
    # Rows of this theme, best-rated threads first, rating above 3 only.
    df1 = df[df.theme == theme_id]
    df1 = df1.sort_values(by=['rate_thread'], ascending=False)
    df1 = df1[df1['rate_thread'] > 3]

    # Distinct thread ids in order of first appearance; `dict.fromkeys`
    # de-duplicates in a single pass while preserving that order.
    threads = list(dict.fromkeys(df1.thread.tolist()))
    exp = 'category:' + theme_name

    for thread_id in threads:
        dd = df1[df1['thread'] == thread_id]
        if dd.empty:
            # A NaN thread id never compares equal to itself, so the
            # selection is empty and there is no title to report.
            continue
        # The row with the highest `prob` supplies the thread's title.
        dd = dd.sort_values(by=['prob'], ascending=False)
        tt = clear_title(dd.title.tolist()[0])
        files = dd.files.tolist()
        edict = {'articles': files}
        exp2 = 'thread:' + tt
        n = {exp2: edict}
        m = {exp: n}
        print(json.dumps(m, sort_keys=True, indent=4))