-
Notifications
You must be signed in to change notification settings - Fork 52
/
Copy pathfile_util.py
179 lines (149 loc) · 5.02 KB
/
file_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Description: 文件相关处理
@Date :2022/01/22
@Author :xhunmon
@Mail :xhunmon@gmail.com
"""
import datetime
import json
import os
import re
import shutil
import cairosvg
import pandas as pd
import pypandoc # 要安装pandoc
from docx import Document
def file_name(file_dir):
    """Return the base names of every file found beneath *file_dir*.

    The directory tree is traversed with ``os.walk``; only file names (not
    their containing directories) are collected, so files with the same name
    in different sub-directories appear more than once.

    :param file_dir: root directory to scan.
    :return: flat list of file names; empty when nothing is found.
    """
    return [
        entry
        for _root, _dirs, entries in os.walk(file_dir)
        for entry in entries
    ]
def deal_one_page():
    """Strip the first paragraph from every '1篇' document and rename it.

    For each file name reported by ``file_name('100条')`` whose stem contains
    ``1篇``, the document ``html/<name>`` is opened, its first paragraph is
    removed, the result is saved as ``html/<name without '1篇'>`` and the
    original file is deleted.  Any error on a file is printed and the loop
    moves on to the next one.

    NOTE(review): names are listed from ``100条`` but the documents are opened
    under ``html/`` -- confirm that both directories are meant to hold the
    same file names.
    """
    for doc_name in file_name('100条'):
        try:
            print('正在检测【%s】' % doc_name)
            stem = os.path.splitext('%s' % doc_name)[0]
            print('正在检测【%s】' % stem)
            if '1篇' not in stem:
                continue
            renamed = re.sub(r'1篇', '', doc_name)
            document = Document(r"html/%s" % doc_name)
            leading = document.paragraphs[0]
            # Detach the paragraph's XML element from its parent node.
            leading._element.getparent().remove(leading._element)
            document.save(r"html/%s" % renamed)
            os.remove('html/%s' % doc_name)
        except Exception as e:
            print(e)
def copy_doc(src='all', dst='100条', batch_size=100):
    """Copy every file under *src* into numbered batch folders under *dst*.

    Files are distributed into ``dst/01``, ``dst/02``, ... with at most
    *batch_size* files per folder.  The folder number is derived from the
    count of files copied so far, so each folder is filled exactly once.
    (The previous counter-based version re-created ``01`` at the first
    rollover, which raised and left 101 files in the first batch.)

    :param src: directory tree to copy from (defaults to ``'all'``).
    :param dst: directory that receives the batch folders (defaults to
        ``'100条'``).
    :param batch_size: maximum number of files per batch folder.
    :raises ValueError: if *batch_size* is not positive.

    Files that cannot be copied are reported with ``print`` and skipped; they
    do not count towards the batch size.  Files sharing a name within one
    batch overwrite each other.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    # The first batch folder always exists afterwards, even for an empty src.
    os.makedirs(os.path.join(dst, '01'), exist_ok=True)

    copied = 0
    for root, _dirs, names in os.walk(src):
        for name in names:
            batch_dir = os.path.join(dst, '%02d' % (copied // batch_size + 1))
            try:
                os.makedirs(batch_dir, exist_ok=True)
                # Join with the walk root so nested files resolve correctly.
                shutil.copyfile(os.path.join(root, name),
                                os.path.join(batch_dir, name))
            except OSError as e:
                print(e)
                continue
            copied += 1
'''########文件处理相关#########'''
def html_cover_doc(in_path, out_path):
    """Convert the HTML file at *in_path* into a docx file at *out_path*.

    The parent directory of *out_path* is created first when it is missing;
    the conversion itself is delegated to ``pypandoc.convert_file``.

    :param in_path: path of the source HTML file.
    :param out_path: path of the docx file to produce.
    """
    target_dir = os.path.dirname(out_path)
    needs_dir = bool(target_dir) and not os.path.exists(target_dir)
    if needs_dir:
        os.makedirs(target_dir)
    pypandoc.convert_file(in_path, 'docx', outputfile=out_path)
def svg_cover_jpg(src, dst):
    """Render the SVG at *src* to an image file at *dst*.

    The parent directory of *dst* is created first when it is missing.
    Rendering is done by ``cairosvg.svg2png``.

    NOTE(review): despite the function name, ``svg2png`` is what gets called,
    so the bytes written are presumably PNG whatever extension *dst* has.

    :param src: location of the SVG, passed to cairosvg as ``url``.
    :param dst: output file path, passed to cairosvg as ``write_to``.
    """
    target_dir = os.path.dirname(dst)
    needs_dir = bool(target_dir) and not os.path.exists(target_dir)
    if needs_dir:
        os.makedirs(target_dir)
    cairosvg.svg2png(url=src, write_to=dst)
def html_cover_excel(content, out_path):
    """Export every HTML table found in *content* to an Excel workbook.

    Each table parsed by ``pandas.read_html`` is written to its own sheet,
    named ``表1``, ``表2``, ... in document order.  The parent directory of
    *out_path* is created when missing.

    :param content: HTML input handed unchanged to ``pandas.read_html``.
        NOTE(review): recent pandas releases deprecate passing literal HTML
        strings; wrap the text in ``io.StringIO`` at the call site if
        *content* is raw markup -- confirm against callers.
    :param out_path: path of the workbook to create.
    """
    parent = os.path.dirname(out_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    tables = pd.read_html(content, encoding='utf-8')
    # ExcelWriter.save() was removed in pandas 2.0; the context manager
    # writes the workbook to disk on exit and also closes it on error.
    with pd.ExcelWriter(out_path) as writer:
        for number, table in enumerate(tables, start=1):
            table.to_excel(writer, sheet_name='表%d' % number)
def write_to_html(content, file_path):
    """Wrap *content* in a minimal HTML page and save it to *file_path*.

    The page consists of a doctype, a ``<head>`` declaring a UTF-8 content
    type, and a ``<body>`` holding *content* verbatim.  Writing is delegated
    to :func:`write`.

    :param content: markup placed between ``<body>`` and ``</body>``.
    :param file_path: destination file path.
    """
    prologue = '''<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>'''
    epilogue = '''</body>
</html>'''
    write(prologue + content + epilogue, file_path)
def write_json(content, file_path):
    """Serialize *content* as JSON into *file_path*.

    Missing parent directories are created.  Non-ASCII characters are written
    as-is (``ensure_ascii=False``), so the file is always encoded as UTF-8
    rather than the platform's locale encoding.

    :param content: any JSON-serializable object.
    :param file_path: destination file path; overwritten if it exists.
    :raises TypeError: if *content* is not JSON-serializable.
    :raises OSError: if the directory or file cannot be created.
    """
    parent = os.path.dirname(file_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(content, f, ensure_ascii=False)
def read_json(file_path):
    """Load and return the JSON document stored in *file_path*.

    The file is decoded as UTF-8 instead of the platform's locale encoding,
    so documents containing non-ASCII text load the same way everywhere.

    :param file_path: path of the JSON file to read.
    :return: the decoded Python object.
    :raises OSError: if the file cannot be opened.
    :raises ValueError: if the file is not valid UTF-8 or not valid JSON.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def write(content, file_path):
    """Write the text *content* to *file_path*, replacing any existing file.

    Missing parent directories are created.  The file is always encoded as
    UTF-8 rather than the platform's locale encoding, so non-ASCII text is
    stored identically on every system.

    :param content: text to write.
    :param file_path: destination file path.
    :raises OSError: if the directory or file cannot be created.
    """
    parent = os.path.dirname(file_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)
def read(file_path) -> 'str | None':
    """Return the text content of *file_path*, or ``None`` if it is unreadable.

    The file is decoded as UTF-8.  This is a best-effort helper: when the
    file cannot be opened or decoded, the error is printed and ``None`` is
    returned instead of raising.

    :param file_path: path of the text file to read.
    :return: the file's text, or ``None`` on failure.
    """
    try:
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, ValueError) as e:
        # OSError covers missing or inaccessible files; ValueError covers
        # undecodable bytes (UnicodeDecodeError is a subclass).
        print(e)
        return None
def get_next_folder(dst, day_diff, folder, max_size):
    """Find the first dated folder that can still accept files.

    Candidate paths have the form ``dst/<YYYY-MM-DD>/folder``, where the date
    is today plus an offset in days that starts at *day_diff*.  A candidate
    is accepted when it does not exist yet (it is then created) or when it
    directly contains fewer than *max_size* files; otherwise the offset is
    advanced by one day and the search continues.

    :param dst: base directory holding the dated folders.
    :param day_diff: initial day offset relative to today.
    :param folder: sub-folder name inside each dated directory.
    :param max_size: number of files at which a folder counts as full.
    :return: tuple ``(day offset used, path of the chosen folder)``.
    """
    offset = day_diff
    while True:
        target_day = datetime.date.today() + datetime.timedelta(days=offset)
        candidate = os.path.join(dst, target_day.strftime('%Y-%m-%d'), folder)

        if not os.path.exists(candidate):
            os.makedirs(candidate)
            return offset, candidate

        # Only the files directly inside the folder are counted.
        _top, _subdirs, plain_files = next(os.walk(candidate))
        if len(plain_files) < max_size:
            return offset, candidate

        # Folder is full: try the following day.
        offset += 1
if __name__ == '__main__':
    # Nothing runs when the module is executed directly; the helpers above
    # are meant to be imported and called by other code.
    pass