# 데이터, 패키지 로드

In [1]:
# 패키지 로드 
import pandas as pd 
import numpy as np
import datetime

In [2]:
# 데이터 로드 
df_stock= pd.read_csv("../../data/Stock/stock+tech_yg.csv")
df_trend= pd.read_csv("../../data/Trend/trend_yg.csv")
df_sentiment= pd.read_excel("../../data/News/sentiment_yg.xlsx")
df_event= pd.read_excel("../../data/News/event_yg.xlsx")

In [3]:
df_stock.info() # 주가+기술지표 데이터

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    123 non-null    object 
 1   Open    123 non-null    int64  
 2   High    123 non-null    int64  
 3   Low     123 non-null    int64  
 4   Close   123 non-null    int64  
 5   Volume  123 non-null    int64  
 6   Change  123 non-null    float64
 7   MACD    123 non-null    float64
 8   Signal  123 non-null    float64
 9   PSAR    123 non-null    float64
 10  upper   123 non-null    float64
 11  middle  123 non-null    float64
 12  lower   123 non-null    float64
 13  SlowK   123 non-null    float64
 14  SlowD   123 non-null    float64
 15  ROC     123 non-null    float64
 16  OBV     123 non-null    float64
 17  FI      123 non-null    float64
dtypes: float64(12), int64(5), object(1)
memory usage: 17.4+ KB


In [4]:
df_trend.info() # 트렌드(유튜브, 네이버) 데이터

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               181 non-null    object 
 1   keyword            181 non-null    float64
 2   view_log_like_sum  181 non-null    float64
 3   view_log_like_avg  181 non-null    float64
 4   count              181 non-null    float64
 5   trend              181 non-null    float64
dtypes: float64(5), object(1)
memory usage: 8.6+ KB


In [5]:
df_sentiment.info() # 감정지수(뉴스) 변수 데이터

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           126 non-null    datetime64[ns]
 1   SENTIMENT_SUM  126 non-null    float64       
 2   DAY            126 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 3.1+ KB


In [6]:
df_event.info() # 이벤트 변수 데이터

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           17 non-null     datetime64[ns]
 1   SENTIMENT_SUM  17 non-null     float64       
 2   DAY            17 non-null     object        
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 536.0+ bytes


# DATE 변수타입 통일 

In [7]:
# df_stock의 'Date' 열을 datetime 형식으로 변환
df_stock['Date'] = pd.to_datetime(df_stock['Date'])

# df_trend의 'date' 열을 datetime 형식으로 변환
df_trend['date'] = pd.to_datetime(df_trend['date'])

# DATE 변수명 통일

In [8]:
# df_stock의'Date' 열의 이름을 'DATE'로 변경
df_stock = df_stock.rename(columns={'Date': 'DATE'})

# df_trend의'date' 열의 이름을 'DATE'로 변경
df_trend = df_trend.rename(columns={'date': 'DATE'})

# df_trend 데이터의 DATE 수정 

### 일자별 요일 지정
향후 분석에 용이하도록 DATE_NEW 일자를 기준으로 일자별 요일을 지정해줌.
- 2023년 1월은 일요일부터 시작함

In [9]:
weekday_dict = {0:'월', 1:'화', 2:'수', 3:'목', 4:'금', 5:'토', 6:'일'}

In [10]:
day_list = []

for i in range(len(df_trend)):
    day_list.append(weekday_dict[df_trend["DATE"][i].weekday()])

df_trend["DAY"] = day_list
df_trend.groupby("DAY").size()

DAY
금    26
목    26
수    26
월    26
일    26
토    25
화    26
dtype: int64

In [11]:
#요일 배정이 잘 되었는지 확인
for day in weekday_dict.values():
    print(day, df_trend[df_trend["DAY"] == day]["DATE"].unique())

월 ['2023-01-02T00:00:00.000000000' '2023-01-09T00:00:00.000000000'
 '2023-01-16T00:00:00.000000000' '2023-01-23T00:00:00.000000000'
 '2023-01-30T00:00:00.000000000' '2023-02-06T00:00:00.000000000'
 '2023-02-13T00:00:00.000000000' '2023-02-20T00:00:00.000000000'
 '2023-02-27T00:00:00.000000000' '2023-03-06T00:00:00.000000000'
 '2023-03-13T00:00:00.000000000' '2023-03-20T00:00:00.000000000'
 '2023-03-27T00:00:00.000000000' '2023-04-03T00:00:00.000000000'
 '2023-04-10T00:00:00.000000000' '2023-04-17T00:00:00.000000000'
 '2023-04-24T00:00:00.000000000' '2023-05-01T00:00:00.000000000'
 '2023-05-08T00:00:00.000000000' '2023-05-15T00:00:00.000000000'
 '2023-05-22T00:00:00.000000000' '2023-05-29T00:00:00.000000000'
 '2023-06-05T00:00:00.000000000' '2023-06-12T00:00:00.000000000'
 '2023-06-19T00:00:00.000000000' '2023-06-26T00:00:00.000000000']
화 ['2023-01-03T00:00:00.000000000' '2023-01-10T00:00:00.000000000'
 '2023-01-17T00:00:00.000000000' '2023-01-24T00:00:00.000000000'
 '2023-01-31T00:00:0

- **설명**
- '요일' 변수는 각 뉴스가 영향을 미치는 요일을 의미한다 
- ex1) 요일이 '금'으로 지정된 데이터 : 금요일 주가에 영향을 미치는 데이터 
- ex2) 요일이 '토'으로 지정된 데이터 : 금요일 주가에 영향을 미치는 데이터 


* **토요일, 일요일 데이터 전처리**
- 토요일과 일요일 데이터는 ***월요일*** 주가에 영향을 미친다
- 따라서 요일 변수가 토요일, 일요일로 설정된 데이터들은 그 다음주 월요일 데이터로 간주하여 날짜 및 요일 변수를 변경한다

In [12]:
 for i in range(len(df_trend)):
    if (df_trend['DAY'].iloc[i] == '토'): # 토요일 뉴스기사라면, 
        df_trend['DATE'].iloc[i]+= datetime.timedelta(days=2)  # DATE_NEW(날짜)변수에 이틀을 추가
        df_trend['DAY'].iloc[i] = '월'       #DAY(요일) 변수를 월요일로
    if (df_trend['DAY'].iloc[i] == '일'): # 일요일 뉴스기사라면, 
        df_trend['DATE'].iloc[i]+= datetime.timedelta(days=1)  # DATE_NEW(날짜)변수에 하루를 추가
        df_trend['DAY'].iloc[i] = '월'       #DAY(요일) 변수를 월요일로

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trend['DATE'].iloc[i]+= datetime.timedelta(days=1)  # DATE_NEW(날짜)변수에 하루를 추가
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trend['DAY'].iloc[i] = '월'       #DAY(요일) 변수를 월요일로
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trend['DATE'].iloc[i]+= datetime.timedelta(days=2)  # DATE_NEW(날짜)변수에 이틀을 추가
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/i

### 월요일 데이터들의 변수 값 평균 구하기

- df_trend에서 DAY 변수의 값이 '월'인 행들은 DATE 변수로 GROUPBY 한 뒤 keyword, view_log_like_sum, view_log_like_avg, count, trend 변수의 평균값을 구할 것


In [13]:
# 'DAY' 변수가 '월'인 행 필터링
df_trend_monday = df_trend[df_trend['DAY'] == '월']
df_trend_notmonday = df_trend[df_trend['DAY'] != '월']

# 'DATE' 변수로 그룹화하여 변수들의 평균값 구하기
df_trend_monday = df_trend_monday.groupby('DATE').agg({
    'keyword': 'first',  # 첫 번째 키워드 선택
    'view_log_like_sum': 'mean',
    'view_log_like_avg': 'mean',
    'count': 'mean',
    'trend': 'mean',
    'DAY': 'first'
}).reset_index()

# 결과 출력
df_trend_monday.head()

Unnamed: 0,DATE,keyword,view_log_like_sum,view_log_like_avg,count,trend,DAY
0,2023-01-02,69.27633,0.0,0.0,0.0,4.168034,월
1,2023-01-09,54.72752,128543.987852,32135.996963,1.333333,7.950054,월
2,2023-01-16,40.25447,0.0,0.0,0.0,3.723044,월
3,2023-01-23,36.1107,0.0,0.0,0.0,3.611786,월
4,2023-01-30,43.85233,0.0,0.0,0.0,3.952521,월


In [14]:
# 월요일 데이터, 월요일이 아닌 데이터 다시 합치기 
df_trend = pd.concat([df_trend_monday, df_trend_notmonday], axis=0)

# 행 인덱스 초기화
df_trend = df_trend.reset_index(drop=True)

# 'DATE' 변수로 정렬
df_trend = df_trend.sort_values(by='DATE')

In [15]:
df_trend.nunique()

DATE                 130
keyword              129
view_log_like_sum     39
view_log_like_avg     39
count                  5
trend                130
DAY                    5
dtype: int64

# df_trend, df_sentiment, df_event 의 날짜 변수를 df_stock 에 맞추기 

##  df_stock와 df_trend

In [16]:
set_trend_dates = set(df_trend['DATE'])
set_stock_dates = set(df_stock['DATE'])

# 차이를 확인하여 다른 원소를 추출
difference = set_trend_dates.symmetric_difference(set_stock_dates)

# 결과 출력
print("df_trend와 df_stock의 'DATE' 열에서 다른 원소:")
print(difference)

df_trend와 df_stock의 'DATE' 열에서 다른 원소:
{Timestamp('2023-01-24 00:00:00'), Timestamp('2023-05-05 00:00:00'), Timestamp('2023-01-23 00:00:00'), Timestamp('2023-06-06 00:00:00'), Timestamp('2023-05-29 00:00:00'), Timestamp('2023-03-01 00:00:00'), Timestamp('2023-05-01 00:00:00')}


- 1/23 : 설 연휴
- 1/24 : 설 연휴 
- 3/1 : 삼일절
- 5/1 : 노동절
- 5/5 : 어린이날
- 5/29 : 석가탄신일
- 6/6 : 현충일 
- 해당 일자들은 주식 시장이 개장하지 않는 일자임
- 따라서 df_trend 에서 1/23, 1/24, 1/25에 대한 변수들의 값을 더한 뒤 평균을 내주는 식의 방법을 사용함 

In [17]:
# 1월 23일, 1월 24일, 1월 25일 데이터 필터링
selected_dates = ['2023-01-23', '2023-01-24', '2023-01-25']
filtered_df = df_trend[df_trend['DATE'].isin(selected_dates)]

# 1월 23일, 1월 24일 데이터 삭제
df_trend = df_trend[df_trend['DATE'] != '2023-01-23']
df_trend = df_trend[df_trend['DATE'] != '2023-01-24']

# 1월 25일 데이터를 평균값으로 대체
average_values = filtered_df.mean(numeric_only=True)
df_trend.loc[df_trend['DATE'] == '2023-01-25', ['keyword', 'view_log_like_sum', 'view_log_like_avg', 'count', 'trend']] = average_values.values

In [18]:
# 작업할 날짜 쌍 정의
date_pairs = [('2023-03-01', '2023-03-02'), ('2023-05-01', '2023-05-02'), ('2023-05-05', '2023-05-08'), ('2023-05-29', '2023-05-30'), ('2023-06-06', '2023-06-07')]

# 날짜 쌍에 대한 반복 작업
for start_date, end_date in date_pairs:
    # 선택한 날짜 데이터 필터링
    selected_dates = [start_date, end_date]
    filtered_df = df_trend[df_trend['DATE'].isin(selected_dates)]

    # 시작 날짜 데이터 삭제
    df_trend = df_trend[df_trend['DATE'] != start_date]

    # 끝 날짜 데이터를 평균값으로 대체
    average_values = filtered_df.mean(numeric_only=True)
    df_trend.loc[df_trend['DATE'] == end_date, ['keyword', 'view_log_like_sum', 'view_log_like_avg', 'count', 'trend']] = average_values[['keyword', 'view_log_like_sum', 'view_log_like_avg', 'count', 'trend']].values


In [19]:
# 차이 없는지 확인

set_trend_dates = set(df_trend['DATE'])
set_stock_dates = set(df_stock['DATE'])

# 차이를 확인하여 다른 원소를 추출
difference = set_trend_dates.symmetric_difference(set_stock_dates)

# 결과 출력
print("df_trend와 df_stock의 'DATE' 열에서 다른 원소:")
print(difference)

df_trend와 df_stock의 'DATE' 열에서 다른 원소:
set()


##  df_stock와 df_sentiment

In [20]:
set_sentiment_dates = set(df_sentiment['DATE'])
set_stock_dates = set(df_stock['DATE'])

# 차이를 확인하여 다른 원소를 추출
difference = set_sentiment_dates.symmetric_difference(set_stock_dates)

# 결과 출력
print("set_sentiment_dates df_stock의 'DATE' 열에서 다른 원소:")
print(difference)

set_sentiment_dates df_stock의 'DATE' 열에서 다른 원소:
{Timestamp('2023-01-24 00:00:00'), Timestamp('2023-06-29 00:00:00'), Timestamp('2023-06-27 00:00:00'), Timestamp('2023-05-05 00:00:00'), Timestamp('2023-01-23 00:00:00'), Timestamp('2023-06-06 00:00:00'), Timestamp('2023-06-30 00:00:00'), Timestamp('2023-05-29 00:00:00'), Timestamp('2023-03-01 00:00:00'), Timestamp('2023-05-01 00:00:00'), Timestamp('2023-06-28 00:00:00')}


- 1/23 : 설 연휴
- 1/24 : 설 연휴 
- 3/1 : 삼일절
- 5/5 : 어린이날
- 5/29 : 석가탄신일
- 6/6 : 현충일 
- 해당 일자들은 주식 시장이 개장하지 않는 일자임
- 따라서 df_trend 에서 1/23, 1/24, 1/25에 대한 변수들의 값을 더한 뒤 평균을 내주는 식의 방법을 사용함 

- 3/31 : 
- 6/27 : 
- 6/28 : 
- 6/29 : 
- 6/30 : 
- 해당 일자들은 df_sentiment 값이 존재하지 않는(영상 등이 올라오지 않은) 부분

In [21]:
# 1월 23일, 1월 24일, 1월 25일 데이터 필터링
selected_dates = ['2023-01-23', '2023-01-24', '2023-01-25']
filtered_df = df_sentiment[df_sentiment['DATE'].isin(selected_dates)]

# 1월 23일, 1월 24일 데이터 삭제
df_sentiment = df_sentiment[df_sentiment['DATE'] != '2023-01-23']
df_sentiment = df_sentiment[df_sentiment['DATE'] != '2023-01-24']

# 1월 25일 데이터를 평균값으로 대체
average_values = filtered_df.mean(numeric_only=True)
df_sentiment.loc[df_sentiment['DATE'] == '2023-01-25', 'SENTIMENT_SUM'] = average_values.values

In [22]:
# 작업할 날짜 쌍 정의
date_pairs = [('2023-03-01', '2023-03-02'), ('2023-05-05', '2023-05-08'),('2023-05-29', '2023-05-30'), ('2023-06-06', '2023-06-07')]

# 날짜 쌍에 대한 반복 작업
for start_date, end_date in date_pairs:
    # 선택한 날짜 데이터 필터링
    selected_dates = [start_date, end_date]
    filtered_df = df_sentiment[df_sentiment['DATE'].isin(selected_dates)]

    # 시작 날짜 데이터 삭제
    df_sentiment = df_sentiment[df_sentiment['DATE'] != start_date]

    # 끝 날짜 데이터를 평균값으로 대체
    average_values = filtered_df.mean(numeric_only=True)
    df_sentiment.loc[df_sentiment['DATE'] == end_date, 'SENTIMENT_SUM'] = average_values.values


In [23]:
# 결측치를 제외하고 차이 없는지 확인

set_sentiment_dates = set(df_sentiment['DATE'])
set_stock_dates = set(df_stock['DATE'])

# 차이를 확인하여 다른 원소를 추출
difference = set_sentiment_dates.symmetric_difference(set_stock_dates)

# 결과 출력
print("df_sentiment와 df_stock의 'DATE' 열에서 다른 원소:")
print(difference)

df_sentiment와 df_stock의 'DATE' 열에서 다른 원소:
{Timestamp('2023-06-29 00:00:00'), Timestamp('2023-06-27 00:00:00'), Timestamp('2023-06-30 00:00:00'), Timestamp('2023-05-01 00:00:00'), Timestamp('2023-06-28 00:00:00')}


# DATE 기준으로 데이터프레임 병합

In [24]:
print(df_stock['DATE'].nunique())
print(df_trend['DATE'].nunique())
print(df_sentiment['DATE'].nunique())
print(df_event['DATE'].nunique())

123
123
120
17


In [25]:
# 데이터프레임 병합
merged_df = df_stock.merge(df_trend, on='DATE', how='outer')
merged_df = merged_df.merge(df_sentiment, on='DATE', how='outer')
merged_df = merged_df.merge(df_event, on='DATE', how='outer')

# 날짜로 정렬
merged_df['DATE'] = pd.to_datetime(merged_df['DATE'])
merged_df = merged_df.sort_values(by='DATE')

In [26]:
merged_df

Unnamed: 0,DATE,Open,High,Low,Close,Volume,Change,MACD,Signal,PSAR,...,keyword,view_log_like_sum,view_log_like_avg,count,trend,DAY_x,SENTIMENT_SUM_x,DAY_y,SENTIMENT_SUM_y,DAY
0,2023-01-02,46550.0,49000.0,45950.0,48050.0,1483247.0,0.095781,686.685188,731.113168,42450.000000,...,69.27633,0.000000e+00,0.000000,0.0,4.168034,월,126.027702,월,28.548385,월
1,2023-01-03,47500.0,50800.0,47500.0,49500.0,1176815.0,0.030177,977.776846,780.445904,42450.000000,...,60.65854,0.000000e+00,0.000000,0.0,4.105260,화,119.020563,화,-33.331533,화
2,2023-01-04,49800.0,50000.0,48350.0,48800.0,505379.0,-0.014141,1138.856898,852.128103,42784.000000,...,53.81251,0.000000e+00,0.000000,0.0,3.985506,수,209.513567,수,74.965736,수
3,2023-01-05,49250.0,50100.0,48250.0,49100.0,445150.0,0.006148,1276.012416,936.904965,43104.640000,...,51.66778,0.000000e+00,0.000000,0.0,3.944834,목,277.915244,목,,
4,2023-01-06,48700.0,49900.0,46950.0,49600.0,580137.0,0.010183,1408.815031,1031.286979,43412.454400,...,45.43369,0.000000e+00,0.000000,0.0,3.816254,금,269.611104,금,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,2023-06-26,77600.0,78700.0,76600.0,77200.0,219672.0,-0.010256,-274.988847,1400.729619,83589.187649,...,36.15345,0.000000e+00,0.000000,0.0,3.558157,월,282.396226,월,,
119,2023-06-27,76800.0,80400.0,76500.0,78900.0,438592.0,0.022021,-454.030222,1029.777651,82191.350119,...,36.76734,2.716231e+06,679057.811532,4.0,17.677466,화,,,,
120,2023-06-28,80000.0,80200.0,78100.0,78500.0,246068.0,-0.005070,-621.039502,699.614220,81053.080095,...,37.15977,0.000000e+00,0.000000,0.0,3.615227,수,,,,
121,2023-06-29,78900.0,81800.0,76800.0,78400.0,370929.0,-0.001274,-752.787063,409.133964,76500.000000,...,38.06506,0.000000e+00,0.000000,0.0,3.639297,목,,,,


In [27]:
# 변수 이름 변경
merged_df = merged_df.rename(columns={'SENTIMENT_SUM_x': 'SENTIMENT', 'SENTIMENT_SUM_y': 'EVENT'})
merged_df = merged_df.drop(columns=['DAY', 'DAY_y'])

# DAY_x 열을 DAY로 변경
merged_df = merged_df.rename(columns={'DAY_x': 'DAY'})

# 결과 출력
merged_df

Unnamed: 0,DATE,Open,High,Low,Close,Volume,Change,MACD,Signal,PSAR,...,OBV,FI,keyword,view_log_like_sum,view_log_like_avg,count,trend,DAY,SENTIMENT,EVENT
0,2023-01-02,46550.0,49000.0,45950.0,48050.0,1483247.0,0.095781,686.685188,731.113168,42450.000000,...,2124425.0,6.229637e+09,69.27633,0.000000e+00,0.000000,0.0,4.168034,월,126.027702,28.548385
1,2023-01-03,47500.0,50800.0,47500.0,49500.0,1176815.0,0.030177,977.776846,780.445904,42450.000000,...,3301240.0,1.706382e+09,60.65854,0.000000e+00,0.000000,0.0,4.105260,화,119.020563,-33.331533
2,2023-01-04,49800.0,50000.0,48350.0,48800.0,505379.0,-0.014141,1138.856898,852.128103,42784.000000,...,2795861.0,-3.537653e+08,53.81251,0.000000e+00,0.000000,0.0,3.985506,수,209.513567,74.965736
3,2023-01-05,49250.0,50100.0,48250.0,49100.0,445150.0,0.006148,1276.012416,936.904965,43104.640000,...,3241011.0,1.335450e+08,51.66778,0.000000e+00,0.000000,0.0,3.944834,목,277.915244,
4,2023-01-06,48700.0,49900.0,46950.0,49600.0,580137.0,0.010183,1408.815031,1031.286979,43412.454400,...,3821148.0,2.900685e+08,45.43369,0.000000e+00,0.000000,0.0,3.816254,금,269.611104,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,2023-06-26,77600.0,78700.0,76600.0,77200.0,219672.0,-0.010256,-274.988847,1400.729619,83589.187649,...,10771841.0,-1.757376e+08,36.15345,0.000000e+00,0.000000,0.0,3.558157,월,282.396226,
119,2023-06-27,76800.0,80400.0,76500.0,78900.0,438592.0,0.022021,-454.030222,1029.777651,82191.350119,...,11210433.0,7.456064e+08,36.76734,2.716231e+06,679057.811532,4.0,17.677466,화,,
120,2023-06-28,80000.0,80200.0,78100.0,78500.0,246068.0,-0.005070,-621.039502,699.614220,81053.080095,...,10964365.0,-9.842720e+07,37.15977,0.000000e+00,0.000000,0.0,3.615227,수,,
121,2023-06-29,78900.0,81800.0,76800.0,78400.0,370929.0,-0.001274,-752.787063,409.133964,76500.000000,...,10593436.0,-3.709290e+07,38.06506,0.000000e+00,0.000000,0.0,3.639297,목,,


In [28]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124 entries, 0 to 122
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   DATE               124 non-null    datetime64[ns]
 1   Open               123 non-null    float64       
 2   High               123 non-null    float64       
 3   Low                123 non-null    float64       
 4   Close              123 non-null    float64       
 5   Volume             123 non-null    float64       
 6   Change             123 non-null    float64       
 7   MACD               123 non-null    float64       
 8   Signal             123 non-null    float64       
 9   PSAR               123 non-null    float64       
 10  upper              123 non-null    float64       
 11  middle             123 non-null    float64       
 12  lower              123 non-null    float64       
 13  SlowK              123 non-null    float64       
 14  SlowD     

In [29]:
merged_df[merged_df['Open'].isnull()]

Unnamed: 0,DATE,Open,High,Low,Close,Volume,Change,MACD,Signal,PSAR,...,OBV,FI,keyword,view_log_like_sum,view_log_like_avg,count,trend,DAY,SENTIMENT,EVENT
123,2023-05-01,,,,,,,,,,...,,,,,,,,,33.329536,


In [30]:
# 제거되지 않은 5월 1일의 데이터를 5월 2일 데이터와 병합
a = float(merged_df[merged_df['DATE'] == '2023-05-01']['EVENT'])
b = float(merged_df[merged_df['DATE'] == '2023-05-02']['EVENT'])
merged_df[merged_df['DATE'] == '2023-05-02']['EVENT'] = (a+b)/2

c = float(merged_df[merged_df['DATE'] == '2023-05-01']['SENTIMENT'])
d = float(merged_df[merged_df['DATE'] == '2023-05-02']['SENTIMENT'])
merged_df[merged_df['DATE'] == '2023-05-02']['SENTIMENT'] = (c+d)/2

merged_df = merged_df[merged_df['DATE'] != '2023-05-01']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[merged_df['DATE'] == '2023-05-02']['EVENT'] = (a+b)/2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[merged_df['DATE'] == '2023-05-02']['SENTIMENT'] = (c+d)/2


In [31]:
merged_df[merged_df['Open'].isnull()]

Unnamed: 0,DATE,Open,High,Low,Close,Volume,Change,MACD,Signal,PSAR,...,OBV,FI,keyword,view_log_like_sum,view_log_like_avg,count,trend,DAY,SENTIMENT,EVENT


# CSV 파일로 저장

In [32]:
merged_df.to_csv("../../data/FINALDATA/yg.csv", index=False)