# 데이터 처리
---

In [0]:
import datetime
# Sample closing-price records used by the examples below.
# Each row holds a closing price, the date it was recorded and a ticker symbol.
data = [
    dict(
        closing_price=102.06,
        date=datetime.datetime(2014, 8, 29),
        symbol="AAPL",
    ),
    dict(
        closing_price=231.23,
        date=datetime.datetime(2020, 7, 13),
        symbol="LINUX",
    ),
    dict(
        closing_price=11,
        date=datetime.datetime(2014, 8, 29),
        symbol="AAPL",
    ),
    dict(
        closing_price=5932.342,
        date=datetime.datetime(2020, 2, 22, 2),
        symbol="AAPL",
    ),
]

In [6]:
data

[{'closing_price': 102.06,
  'date': datetime.datetime(2014, 8, 29, 0, 0),
  'symbol': 'AAPL'},
 {'closing_price': 231.23,
  'date': datetime.datetime(2020, 7, 13, 0, 0),
  'symbol': 'LINUX'},
 {'closing_price': 11,
  'date': datetime.datetime(2014, 8, 29, 0, 0),
  'symbol': 'AAPL'},
 {'closing_price': 5932.342,
  'date': datetime.datetime(2020, 2, 22, 2, 0),
  'symbol': 'AAPL'}]

In [0]:
# Highest closing price among the rows whose symbol is AAPL
max_aapl_price = max(
    (row for row in data if row["symbol"] == "AAPL"),
    key=lambda row: row["closing_price"],
)["closing_price"]

In [8]:
max_aapl_price

5932.342

In [10]:
from collections import defaultdict

# Build a dictionary that groups the rows by their symbol
by_symbol = defaultdict(list)
for row in data:
    by_symbol[row["symbol"]].append(row)
print("by_symbol :", by_symbol)

# Store the highest closing price found within each group
max_price_by_symbol = {
    symbol: max(grouped_rows, key=lambda entry: entry["closing_price"])["closing_price"]
    for symbol, grouped_rows in by_symbol.items()
}
print("max_price_by_symbol :", max_price_by_symbol)

by_symbol : defaultdict(<class 'list'>, {'AAPL': [{'closing_price': 102.06, 'date': datetime.datetime(2014, 8, 29, 0, 0), 'symbol': 'AAPL'}, {'closing_price': 11, 'date': datetime.datetime(2014, 8, 29, 0, 0), 'symbol': 'AAPL'}, {'closing_price': 5932.342, 'date': datetime.datetime(2020, 2, 22, 2, 0), 'symbol': 'AAPL'}], 'LINUX': [{'closing_price': 231.23, 'date': datetime.datetime(2020, 7, 13, 0, 0), 'symbol': 'LINUX'}]})
max_price_by_symbol : {'AAPL': 5932.342, 'LINUX': 231.23}


In [0]:
def picker(field_name):
    """Build an accessor that reads ``field_name`` from a row.

    Args:
        field_name: Key (or index) looked up in every row handed to the
            returned accessor.

    Returns:
        A one-argument function that returns ``row[field_name]``.
    """

    def pick(row):
        return row[field_name]

    return pick

def pluck(field_name, rows):
    """Lazily extract a single field from every row.

    Args:
        field_name: Key to read from each row.
        rows: Iterable of rows that can be indexed with ``field_name``.

    Returns:
        A ``map`` iterator yielding ``row[field_name]`` for each row, in
        input order. Wrap it in ``list(...)`` to materialise the values.
    """
    extract = picker(field_name)
    return map(extract, rows)

In [22]:
list(pluck("date", data))  # take only the "date" field from each row of data and collect the values into a list

[datetime.datetime(2014, 8, 29, 0, 0),
 datetime.datetime(2020, 7, 13, 0, 0),
 datetime.datetime(2014, 8, 29, 0, 0),
 datetime.datetime(2020, 2, 22, 2, 0)]

In [0]:
def group_by(grouper, rows, value_transform=None):
    """Group rows by a computed key, optionally reducing each group.

    Args:
        grouper: Function that maps a row to its group key.
        rows: Iterable of rows to group.
        value_transform: Optional function applied to each group's list of
            rows. When omitted, the raw groups are returned.

    Returns:
        If ``value_transform`` is ``None``, a ``defaultdict(list)`` mapping
        each key to the list of rows that produced it (in input order).
        Otherwise a plain ``dict`` mapping each key to
        ``value_transform(group_rows)``.
    """
    buckets = defaultdict(list)
    for item in rows:
        buckets[grouper(item)].append(item)

    # Without a transform the caller gets the raw groups back.
    if value_transform is None:
        return buckets

    transformed = {}
    for key, members in buckets.items():
        transformed[key] = value_transform(members)
    return transformed

In [24]:
group_by(picker("symbol"), data, lambda rows: max(pluck("closing_price", rows))) # highest closing price for each symbol

{'AAPL': 5932.342, 'LINUX': 231.23}

In [25]:
group_by(picker("symbol"), data) # rows grouped by symbol, with no per-group transform

defaultdict(list,
            {'AAPL': [{'closing_price': 102.06,
               'date': datetime.datetime(2014, 8, 29, 0, 0),
               'symbol': 'AAPL'},
              {'closing_price': 11,
               'date': datetime.datetime(2014, 8, 29, 0, 0),
               'symbol': 'AAPL'},
              {'closing_price': 5932.342,
               'date': datetime.datetime(2020, 2, 22, 2, 0),
               'symbol': 'AAPL'}],
             'LINUX': [{'closing_price': 231.23,
               'date': datetime.datetime(2020, 7, 13, 0, 0),
               'symbol': 'LINUX'}]})

In [0]:
def percent_price_change(yesterday, today):
    """Return the relative change in closing price between two rows.

    The result is a fraction rather than a percentage: ``0.1`` means the
    price rose by 10%, ``-0.5`` means it halved.

    Args:
        yesterday: Row holding the earlier ``"closing_price"``.
        today: Row holding the later ``"closing_price"``.

    Returns:
        ``today["closing_price"] / yesterday["closing_price"] - 1``.
    """
    current_close = today["closing_price"]
    previous_close = yesterday["closing_price"]
    return current_close / previous_close - 1
def day_over_day_changes(grouped_rows):
    """Compute the price change between each pair of consecutive rows.

    The rows are first ordered by their ``"date"`` field, then every row is
    compared with the one immediately before it.

    Args:
        grouped_rows: Rows with ``"symbol"``, ``"date"`` and
            ``"closing_price"`` fields.

    Returns:
        A list of dictionaries with the later row's ``"symbol"`` and
        ``"date"`` plus the ``"change"`` relative to the previous row. The
        list is empty when fewer than two rows are given.
    """
    by_date = sorted(grouped_rows, key=picker("date"))

    changes = []
    # Pair each row with its successor: (row 0, row 1), (row 1, row 2), ...
    for previous, current in zip(by_date, by_date[1:]):
        changes.append({
            "symbol": current["symbol"],
            "date": current["date"],
            "change": percent_price_change(previous, current),
        })
    return changes

In [33]:
# Compute the day-over-day price changes separately for each symbol
changes_by_symbol = group_by(picker("symbol"), data, day_over_day_changes)
changes_by_symbol

{'AAPL': [{'change': -0.8922202625906329,
   'date': datetime.datetime(2014, 8, 29, 0, 0),
   'symbol': 'AAPL'},
  {'change': 538.3038181818182,
   'date': datetime.datetime(2020, 2, 22, 2, 0),
   'symbol': 'AAPL'}],
 'LINUX': []}

# 척도조절
---


In [0]:
import numpy as np

In [0]:
def scale(data_matrix):
    """Compute the mean and standard deviation of every column.

    Args:
        data_matrix: Two-dimensional numeric data, either a NumPy array or
            anything ``np.asarray`` can convert (e.g. a list of row lists).

    Returns:
        A ``(means, stdevs)`` tuple of lists with one entry per column.
        The standard deviation is NumPy's default population form
        (``ddof=0``).

    Raises:
        ValueError: If ``data_matrix`` is not two-dimensional.
    """
    matrix = np.asarray(data_matrix)
    if matrix.ndim != 2:
        raise ValueError(
            "data_matrix must be two-dimensional, got {} dimension(s)".format(
                matrix.ndim
            )
        )

    # Reduce along axis 0 to get one statistic per column in a single call.
    means = list(np.mean(matrix, axis=0))
    stdevs = list(np.std(matrix, axis=0))
    return means, stdevs

In [0]:
def rescale(data_matrix):
    """Standardise each column to mean 0 and standard deviation 1.

    Every column whose standard deviation is positive is transformed as
    ``(value - column_mean) / column_stdev``. Columns with no deviation are
    copied through unchanged, which avoids dividing by zero.

    Args:
        data_matrix: Two-dimensional numeric data, either a NumPy array or
            anything ``np.asarray`` can convert.

    Returns:
        A new float array with the same shape as ``data_matrix``. The input
        is not modified.
    """
    matrix = np.asarray(data_matrix, dtype=float)
    means, stdevs = scale(matrix)
    means = np.asarray(means, dtype=float)
    stdevs = np.asarray(stdevs, dtype=float)

    # Only columns that actually vary can be rescaled.
    varying = stdevs > 0

    rescaled = matrix.copy()
    rescaled[:, varying] = (matrix[:, varying] - means[varying]) / stdevs[varying]
    return rescaled

In [19]:
# Sample 3x3 matrix; scale() reports the mean and standard deviation of each column
mat = np.array([[63,160,150],[67,170.2,160],[70,177.8,171]])
scale(mat)

([66.66666666666667, 169.33333333333334, 160.33333333333334],
 [2.8674417556808756, 7.2926142241463925, 8.576453553512405])

In [35]:
rescale(mat)

array([[-1.27872403, -1.27983368, -1.20484922],
       [ 0.11624764,  0.1188417 , -0.0388661 ],
       [ 1.16247639,  1.16099199,  1.24371532]])