
# 시간정보로 .srt 자막을 만들어보자


## 데이터 준비

In [48]:
import pandas as pd

In [49]:
data = pd.read_csv('time_data.csv', header=None, engine='python')
data.columns = ["log"]

print(len(data))  # 전체 데이터 개수 보기
print(type(data.log[0]))  # 데이터 타입 보기 
data.head()  # 데이터 대략 보기

1372
<class 'numpy.int64'>


Unnamed: 0,log
0,20180814163538872
1,20180814163538901
2,20180814163539070
3,20180814163539091
4,20180814163541127


## timestamp type 으로 변경후 시간 연산 작업

- 시간 연산을 다루기 위해 data type 을 변경한 다음
- 실제 동영상의 재생시간과 맞추는 작업을 한다. 

#### numpy.int64 타입을 pandas 의 Timestamp 타입으로 변경

- str 타입도 같은 방법으로 변경할 수 있다. 


In [50]:
data['time'] = pd.to_datetime(data.log, format='%Y%m%d%H%M%S%f') 
data['start_time'] = data.time

print(type(data.log[0]))
print(type(data.start_time[0]))

data.head()

<class 'numpy.int64'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


Unnamed: 0,log,time,start_time
0,20180814163538872,2018-08-14 16:35:38.872,2018-08-14 16:35:38.872
1,20180814163538901,2018-08-14 16:35:38.901,2018-08-14 16:35:38.901
2,20180814163539070,2018-08-14 16:35:39.070,2018-08-14 16:35:39.070
3,20180814163539091,2018-08-14 16:35:39.091,2018-08-14 16:35:39.091
4,20180814163541127,2018-08-14 16:35:41.127,2018-08-14 16:35:41.127


#### 동영상의 재생시간과 동기화를 맞추기

1) start_time 컬럼의 모든 값을 start_time[0] 만큼 빼준다. 첫 이벤트 자막은 동영상이 시작하자 마자 뜨게 된다.
2) 동영상이 재생되고 첫 이벤트가 발생하는 시점만큼 시간을 더해주어서 싱크를 맞춘다.
  - 실제로는 잘 맞지 않기 때문에 이 부분을 미세조정 해준다. 

In [51]:
import datetime

data.start_time = data.start_time - data.start_time[0]
data.start_time = data.start_time + datetime.timedelta(minutes=0, seconds=4.5)
data.head()

Unnamed: 0,log,time,start_time
0,20180814163538872,2018-08-14 16:35:38.872,00:00:04.500000
1,20180814163538901,2018-08-14 16:35:38.901,00:00:04.529000
2,20180814163539070,2018-08-14 16:35:39.070,00:00:04.698000
3,20180814163539091,2018-08-14 16:35:39.091,00:00:04.719000
4,20180814163541127,2018-08-14 16:35:41.127,00:00:06.755000


## 자막 종료시간 만들기

1) shift() 를 이용하여 end_time 생성

2) 맨 아래쪽 공백을 채워줌


In [52]:
data["end_time"] = data.start_time.shift(-1)
data.end_time.iloc[-1] = data.start_time.iloc[-1] + datetime.timedelta(seconds=3)
data.tail()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,log,time,start_time,end_time
1367,20180814170941351,2018-08-14 17:09:41.351,00:34:06.979000,00:34:07.510000
1368,20180814170941882,2018-08-14 17:09:41.882,00:34:07.510000,00:34:07.543000
1369,20180814170941915,2018-08-14 17:09:41.915,00:34:07.543000,00:34:11.151000
1370,20180814170945523,2018-08-14 17:09:45.523,00:34:11.151000,00:34:11.167000
1371,20180814170945539,2018-08-14 17:09:45.539,00:34:11.167000,00:34:14.167000


## 자막 시퀀스 넘버 만들기

In [53]:
data["seq_num"] = data.index.astype(int) + 1
data.seq_num = data.seq_num.astype('str')
data.head()

Unnamed: 0,log,time,start_time,end_time,seq_num
0,20180814163538872,2018-08-14 16:35:38.872,00:00:04.500000,00:00:04.529000,1
1,20180814163538901,2018-08-14 16:35:38.901,00:00:04.529000,00:00:04.698000,2
2,20180814163539070,2018-08-14 16:35:39.070,00:00:04.698000,00:00:04.719000,3
3,20180814163539091,2018-08-14 16:35:39.091,00:00:04.719000,00:00:06.755000,4
4,20180814163541127,2018-08-14 16:35:41.127,00:00:06.755000,00:00:06.789000,5


## 자막 시간정보 만들기

- 우선은 시간정보를 string 타입으로 변환한다
- 참고링크: https://stackoverflow.com/questions/50652295/i-want-to-convert-pandas-timedelta-to-string-with-format

In [54]:
data['time'] = pd.to_datetime(data['time']).dt.strftime("%H:%M:%S.%f").str[:-3] 
data['start_time'] = pd.to_datetime(data['start_time']).dt.strftime("%H:%M:%S.%f").str[:-3] 
data['end_time'] = pd.to_datetime(data['end_time']).dt.strftime("%H:%M:%S.%f").str[:-3] 

TypeError: dtype timedelta64[ns] cannot be converted to datetime64[ns]

- 그리고 SRT 의 시간포맷에 맞게 바꿔준다. (초와 밀리초 사이에 콤마가 들어감에 주의)

In [36]:
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
# make start string
data['srt_time'] = data['start_time'].str[:8] + "," +  data['start_time'].str[9:]+ " --> "
# make stop string
data['srt_time'] = data['srt_time'] + data['end_time'].str[:8] + "," + data['end_time'].str[9:]
data.head()

AttributeError: Can only use .str accessor with string values!

## 자막 만들기

- 자막의 구성은 다음과 같다.



1.   시퀀스 넘버
2.   자막 시작시간과 종료시간
3.   실제 자막: Event +seq_num + 실제 발생시간 


In [None]:
data["srt"] = data['seq_num'].str[:] + "\n" \
                    + data['srt_time'].str[:] + "\n" \
                    + "Event: " \
                    + data['seq_num'].str[:] \
                    + "(" + data['time'].str[:] + ")" \
                    + "\n\n"

In [11]:
pd.set_option('max_colwidth', 100)
data.head()

Unnamed: 0,log,time,start_time,end_time,seq_num,srt_time,srt
0,20180814163538872,16:35:38.872,00:00:04.500,00:00:04.529,1,"00:00:04,500 --> 00:00:04,529","1\n00:00:04,500 --> 00:00:04,529\nEvent: 1(16:35:38.872)\n\n"
1,20180814163538901,16:35:38.901,00:00:04.529,00:00:04.698,2,"00:00:04,529 --> 00:00:04,698","2\n00:00:04,529 --> 00:00:04,698\nEvent: 2(16:35:38.901)\n\n"
2,20180814163539070,16:35:39.070,00:00:04.698,00:00:04.719,3,"00:00:04,698 --> 00:00:04,719","3\n00:00:04,698 --> 00:00:04,719\nEvent: 3(16:35:39.070)\n\n"
3,20180814163539091,16:35:39.091,00:00:04.719,00:00:06.755,4,"00:00:04,719 --> 00:00:06,755","4\n00:00:04,719 --> 00:00:06,755\nEvent: 4(16:35:39.091)\n\n"
4,20180814163541127,16:35:41.127,00:00:06.755,00:00:06.789,5,"00:00:06,755 --> 00:00:06,789","5\n00:00:06,755 --> 00:00:06,789\nEvent: 5(16:35:41.127)\n\n"


## 자막 저장하기

- 이제 자막 저장만이 남음

1. 먼저 srt 컬럼의 정보만을 추려낸 DataFrame 을 만들고
2. srt 파일로 저장한다음
3. 불필요하게 생성된 쌍따옴표를 제거하면 된다. 

In [12]:
srt = data.srt.copy()
srt.head()

0    1\n00:00:04,500 --> 00:00:04,529\nEvent: 1(16:35:38.872)\n\n
1    2\n00:00:04,529 --> 00:00:04,698\nEvent: 2(16:35:38.901)\n\n
2    3\n00:00:04,698 --> 00:00:04,719\nEvent: 3(16:35:39.070)\n\n
3    4\n00:00:04,719 --> 00:00:06,755\nEvent: 4(16:35:39.091)\n\n
4    5\n00:00:06,755 --> 00:00:06,789\nEvent: 5(16:35:41.127)\n\n
Name: srt, dtype: object

In [None]:
srt.to_csv('subtitle.srt', header=None, mode = 'w', index=False)

In [15]:
!head subtitle.srt

"1
00:00:04,500 --> 00:00:04,529
Event: 1(16:35:38.872)

"
"2
00:00:04,529 --> 00:00:04,698
Event: 2(16:35:38.901)

"


In [16]:
# Read in the file
with open('subtitle.srt', 'r') as file :
  filedata = file.read()

# Replace the target string
filedata = filedata.replace('"', '')

# Write the file out again
with open('subtitle.srt', 'w') as file:
  file.write(filedata)
  
!head subtitle.srt
  


1
00:00:04,500 --> 00:00:04,529
Event: 1(16:35:38.872)


2
00:00:04,529 --> 00:00:04,698
Event: 2(16:35:38.901)




## SRT 파일을 Colaboratory 에서 다운로드 받자

In [None]:
from google.colab import files
files.download('subtitle.srt') 