# 2.1 파이썬(Python)

### Python은 동적언어

In [15]:
a = 10
a

10

In [16]:
type(a)

int

- 변수 타입 선언 필요 없음

In [17]:
a = 'Hello Python!'
print(a)
type(a)

Hello Python!


str

- 동일한 변수 a에 문자열을 대입 -> 변수의 타입이 int에서 str로 동적으로 바뀌는 것을 알 수 있음

---

# 2.2 넘파이(NumPy)

### NumPy로 숫자배열 생성 (1차원)

In [18]:
import numpy as np

In [19]:
# Vector: a rank-1 array.
v = np.array([0, 1, 2])  # integer dtype, inferred from the int literals
v1 = np.array([0, 1, 2], dtype=np.float32)  # float dtype requested explicitly
v2 = np.array([0.0, 1.0, 2.0])  # float dtype, inferred from the float literals

# Print the three vectors, separated by a dashed rule.
print(v, v1, v2, sep='\n------------\n')

[0 1 2]
------------
[0. 1. 2.]
------------
[0. 1. 2.]


### NumPy로 숫자배열 생성 (2차원)

In [20]:
# Matrix: a rank-2 array built from a list of rows.
m = np.array([
    [0, 1, 2],
    [3, 4, 5],
    [6, 7, 8],
])
# astype converts the element type (integer -> float) into a separate array.
m1 = m.astype(np.float64)

# Print the integer matrix and its float copy, separated by a dashed rule.
print(m, m1, sep='\n------------\n')

[[0 1 2]
 [3 4 5]
 [6 7 8]]
------------
[[0. 1. 2.]
 [3. 4. 5.]
 [6. 7. 8.]]


### NumPy로 숫자배열 생성 (3차원)

In [21]:
# 3-Tensor: a rank-3 array (three 3x2 blocks).
t = np.array([
    [[0, 1], [2, 1], [4, 1]],
    [[1, 2], [3, 2], [5, 2]],
    [[2, 3], [4, 3], [6, 3]],
])

print(t)

[[[0 1]
  [2 1]
  [4 1]]

 [[1 2]
  [3 2]
  [5 2]]

 [[2 3]
  [4 3]
  [6 3]]]


- 인덱싱   : `a[index num]`, `a[row num, col num]`
- 슬라이싱 : `a[start index : end index+1]`
- 차원확인 : `a.ndim`
- 모양확인 : `a.shape`
- 모양변경 : `a.reshape()`

### Indexing / Slicing

In [22]:
a = [2, 3, 4, 7, 11]

In [23]:
a[1]

3

In [24]:
a[2:4] # 2부터 3까지 (4제외)

[4, 7]

In [25]:
a[1:-1] # -1 제외

[3, 4, 7]

In [26]:
a[-4:]

[3, 4, 7, 11]

In [27]:
a[:-1]

[2, 3, 4, 7]

In [28]:
a[::2]

[2, 4, 11]

In [29]:
a[::-2]

[11, 4, 2]

In [30]:
arr2d = np.array([ [1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16] ])

In [31]:
print(arr2d)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]]


In [32]:
arr2d[2, :]

array([ 9, 10, 11, 12])

In [33]:
arr2d[1:3, ]

array([[ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

In [34]:
arr2d[:, 3]

array([ 4,  8, 12, 16])

In [35]:
arr2d[:, :2]

array([[ 1,  2],
       [ 5,  6],
       [ 9, 10],
       [13, 14]])

In [36]:
arr2d[3, 2]

15

### 차원 확인

In [37]:
# ndim : 차원 확인
print("v", v.ndim)
print("m", m.ndim)
print("t", t.ndim)

v 1
m 2
t 3


### 모양 확인 : shape

In [38]:
# Shape : 모양 확인
print("v:", v.shape)
print("m:", m.shape)
print("t:", t.shape)

v: (3,)
m: (3, 3)
t: (3, 3, 2)


In [39]:
print("rownum:", m.shape[0])
print('colnum:', m.shape[1])

rownum: 3
colnum: 3


### 모양 재변경 : reshape

In [40]:
# Reshape: give each array a new shape and print "old shape => new shape".
for arr, new_shape in ((v, (3, 1)), (m, (1, 9)), (t, (1, 2, 9))):
    reshaped = arr.reshape(new_shape)
    print(reshaped, "\n", arr.shape, "=>", reshaped.shape, "\n")

[[0]
 [1]
 [2]] 
 (3,) => (3, 1) 

[[0 1 2 3 4 5 6 7 8]] 
 (3, 3) => (1, 9) 

[[[0 1 2 1 4 1 1 2 3]
  [2 5 2 2 3 4 3 6 3]]] 
 (3, 3, 2) => (1, 2, 9) 



### Broadcasting
- Array 연산은 기본적으로 동일한 크기(shape)의 array 끼리 연산
- 크기가 달라도 Broadcast 기능으로 크기가 다른 array 간 연산 가능

In [41]:
# Broadcasting: a core strength of NumPy arrays. It enables vectorized
# computation, which is designed to keep the overhead of the operation low.
# Scalar case: 2 is added to every element of v (the label now says "+ 2" to
# match the expression; it previously read "* 2").
print(v, "+ 2 =", v + 2, "\n")

# Array case: the 1-D vector v is added to each row of the 2-D matrix m.
print(m, "+", v, "\n=", m + v)

[0 1 2] * 2 = [2 3 4] 

[[0 1 2]
 [3 4 5]
 [6 7 8]] + [0 1 2] 
= [[ 0  2  4]
 [ 3  5  7]
 [ 6  8 10]]


---

# 2.3 판다스(pandas)
- Series : 데이터를 분석하기 위한 1차원의 데이터 자료구조
- Series는 왼쪽에 인덱스, 오른쪽에 그 인덱스에 해당하는 값을 표 형태로 표현함

In [42]:
from pandas import Series

price = [42500, 42550, 41800]
year = ["2020", "2021", "2022"]

# Series: one-dimensional values (price) labelled by an index (year).
s = Series(data=price, index=year)

# Print the Series, its index labels and its raw values, separated by a rule.
print(s, s.index, s.values, sep='\n------------------------------------------------\n')

2020    42500
2021    42550
2022    41800
dtype: int64
------------------------------------------------
Index(['2020', '2021', '2022'], dtype='object')
------------------------------------------------
[42500 42550 41800]


In [43]:
import pandas as pd

data = {
    'price' : [42500, 42550, 41800],
    'year' : ["2020", "2021", "2022"]
}

df = pd.DataFrame(data, index=data['year'])
df

Unnamed: 0,price,year
2020,42500,2020
2021,42550,2021
2022,41800,2022


---

# 2.4 데이터프레임(DataFrame)

## 확인하기

### 2차원의 데이터 자료구조

In [44]:
import pandas as pd

# Build the student table from per-column lists, then use '학번' as the row index.
df = pd.DataFrame({
    '학번': ['123456', '345678', '125690'],
    '성명': ['홍길동', '김영희', '박철수'],
    '학년': [1, 2, 3],
}).set_index('학번')
df

Unnamed: 0_level_0,성명,학년
학번,Unnamed: 1_level_1,Unnamed: 2_level_1
123456,홍길동,1
345678,김영희,2
125690,박철수,3


In [45]:
# 특정 열
df['성명']

학번
123456    홍길동
345678    김영희
125690    박철수
Name: 성명, dtype: object

In [46]:
# 특정 열
df.성명

학번
123456    홍길동
345678    김영희
125690    박철수
Name: 성명, dtype: object

In [47]:
# 특정 행
df.iloc[1: , :]

Unnamed: 0_level_0,성명,학년
학번,Unnamed: 1_level_1,Unnamed: 2_level_1
345678,김영희,2
125690,박철수,3


In [48]:
# 특정 행
df.loc['345678':'125690', '성명':'학년']

Unnamed: 0_level_0,성명,학년
학번,Unnamed: 1_level_1,Unnamed: 2_level_1
345678,김영희,2
125690,박철수,3


In [49]:
# 특정 값
df.iloc[0,0]

'홍길동'

In [50]:
# Conditional selection: keep the rows whose '학년' value is greater than 1
# (the comparison yields a boolean mask that is used to index df).
df[df['학년'] > 1]

Unnamed: 0_level_0,성명,학년
학번,Unnamed: 1_level_1,Unnamed: 2_level_1
345678,김영희,2
125690,박철수,3


In [51]:
# Conditional selection with two conditions: keep the rows where '학년' > 1
# OR '성명' equals '홍길동'. Each comparison is parenthesised and the two
# boolean masks are combined element-wise with |.
df[(df['학년'] > 1) | (df['성명'] == '홍길동')]

Unnamed: 0_level_0,성명,학년
학번,Unnamed: 1_level_1,Unnamed: 2_level_1
123456,홍길동,1
345678,김영희,2
125690,박철수,3


- | : or

## 조작하기

### 2차원의 데이터 자료구조

In [52]:
import pandas as pd

# Rebuild the student table from per-column lists, with '학번' as the row index.
df = pd.DataFrame({
    '학번': ['123456', '345678', '125690'],
    '성명': ['홍길동', '김영희', '박철수'],
    '학년': [1, 2, 3],
}).set_index('학번')
df

Unnamed: 0_level_0,성명,학년
학번,Unnamed: 1_level_1,Unnamed: 2_level_1
123456,홍길동,1
345678,김영희,2
125690,박철수,3


In [53]:
# 열 추가
import pandas as pd

s = pd.Series(data=[95, 75, 80], index=df.index)

df['학점'] = s

df

Unnamed: 0_level_0,성명,학년,학점
학번,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
123456,홍길동,1,95
345678,김영희,2,75
125690,박철수,3,80


In [54]:
# Add a row.
# The new student is a Series keyed by the existing columns; its name
# ('098765') becomes the row label. '학년' is the int 4 (not the string '4')
# so the column stays consistent with the other rows.
s = pd.Series(data=['아이유', 4, 99], index=df.columns, name='098765')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# Series is turned into a one-row frame and concatenated instead.
# infer_objects() restores per-column dtypes after the transpose, and
# rename_axis() gives the new row the same index name as df.
new_row = s.to_frame().T.infer_objects().rename_axis(df.index.name)
df = pd.concat([df, new_row])
df

Unnamed: 0_level_0,성명,학년,학점
학번,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
123456,홍길동,1,95
345678,김영희,2,75
125690,박철수,3,80
98765,아이유,4,99


In [55]:
# 변경
df.loc['098765', '학점'] = 96
df

Unnamed: 0_level_0,성명,학년,학점
학번,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
123456,홍길동,1,95
345678,김영희,2,75
125690,박철수,3,80
98765,아이유,4,96


In [56]:
df.iloc[3,2] = 97
df

Unnamed: 0_level_0,성명,학년,학점
학번,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
123456,홍길동,1,95
345678,김영희,2,75
125690,박철수,3,80
98765,아이유,4,97


In [57]:
# Drop a column (axis=1).
# NOTE: the result of drop() is not assigned here, so df is unchanged and the
# next line still shows the '학점' column. To keep the change, reassign
# (df = df.drop(...)) or pass inplace=True.
df.drop("학점", axis=1)
df

Unnamed: 0_level_0,성명,학년,학점
학번,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
123456,홍길동,1,95
345678,김영희,2,75
125690,박철수,3,80
98765,아이유,4,97


In [58]:
# Drop a row by its index label (axis=0).
# NOTE: the result of drop() is not assigned here, so df is unchanged and the
# next line still shows row "123456". To keep the change, reassign
# (df = df.drop(...)) or pass inplace=True.
df.drop("123456", axis=0)
df

Unnamed: 0_level_0,성명,학년,학점
학번,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
123456,홍길동,1,95
345678,김영희,2,75
125690,박철수,3,80
98765,아이유,4,97


In [59]:
df.rename(columns = {'성명' : '이름'}, inplace=True)
df

Unnamed: 0_level_0,이름,학년,학점
학번,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
123456,홍길동,1,95
345678,김영희,2,75
125690,박철수,3,80
98765,아이유,4,97


## Merge
- 두 데이터프레임을 특정 컬럼의 값을 기준으로 데이터 병합

In [60]:
df2 = pd.DataFrame({'학번' : ['123456', '345678', '125690', "098765"],
                    'HP' : ['010-1234-5678', '010-1111-2222', '010-4545-5656', '010-0987-1234']})
df2 = df2.set_index('학번')
df2

Unnamed: 0_level_0,HP
학번,Unnamed: 1_level_1
123456,010-1234-5678
345678,010-1111-2222
125690,010-4545-5656
98765,010-0987-1234


In [61]:
df

Unnamed: 0_level_0,이름,학년,학점
학번,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
123456,홍길동,1,95
345678,김영희,2,75
125690,박철수,3,80
98765,아이유,4,97


In [62]:
df2

Unnamed: 0_level_0,HP
학번,Unnamed: 1_level_1
123456,010-1234-5678
345678,010-1111-2222
125690,010-4545-5656
98765,010-0987-1234


In [63]:
df_merge = pd.merge(df, df2, on='학번')
df_merge

Unnamed: 0_level_0,이름,학년,학점,HP
학번,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
123456,홍길동,1,95,010-1234-5678
345678,김영희,2,75,010-1111-2222
125690,박철수,3,80,010-4545-5656
98765,아이유,4,97,010-0987-1234


- on : 두 데이터프레임 병합의 기준열
 - 기준열명이 상이할 경우, left_on / right_on 명시
- JOIN 방식 : left, right, inner, outer

## Pivot
- 인덱스, 컬럼 및 데이터를 지정하여 데이터를 재구조화

In [64]:
import pandas as pd

# Long-format sample table used as the input for the pivot examples.
df = pd.DataFrame(dict(
    foo=['one', 'one', 'one', 'two', 'two'],
    bar=['A', 'B', 'C', 'A', 'B'],
    baz=['1', '2', '3', '4', '5'],
    zoo=['x', 'y', 'z', 'q', 'w'],
))
df

Unnamed: 0,foo,bar,baz,zoo
0,one,A,1,x
1,one,B,2,y
2,one,C,3,z
3,two,A,4,q
4,two,B,5,w


In [65]:
pd.pivot(data=df, index='foo', columns='bar', values='baz')

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3.0
two,4,5,


In [66]:
df.pivot(index='foo', columns='bar', values='baz')

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3.0
two,4,5,


## 파일 관리

In [85]:
# Load data from files.
# NOTE: '파일이름.csv' / '파일이름.xlsx' appear to be placeholder names; replace
# them with real paths before running. Running this cell rebinds df and df2.
import pandas as pd
import numpy as np
import csv

# index_col=False: do not use any column of the CSV file as the row index.
df = pd.read_csv('파일이름.csv', index_col=False)

# NOTE(review): read_excel documents index_col as int / str / list of int / None;
# confirm that False is accepted by the installed pandas version.
df2 = pd.read_excel('파일이름.xlsx', index_col=False)

In [68]:
# 데이터 파일로 저장
df_merge

Unnamed: 0_level_0,이름,학년,학점,HP
학번,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
123456,홍길동,1,95,010-1234-5678
345678,김영희,2,75,010-1111-2222
125690,박철수,3,80,010-4545-5656
98765,아이유,4,97,010-0987-1234


In [69]:
# Save the merged student table shown in the previous cell.
# df_merge is written instead of df because df has been rebound to other data
# (pivot sample / loaded file) by this point. The index is kept because it
# holds the '학번' values.
df_merge.to_csv('학생정보.csv', index=True)

**해당경로에 파일 저장 확인가능**

---

# 2.5 데이터 관련 함수

## Data Type & Value
- `df.shape` : 데이터프레임의 형태
- `df.info()` : 데이터프레임을 구성하는 컬럼과 데이터 타입 정보
- `df.nunique()` : 데이터프레임 컬럼별 고유한(unique) 데이터 값의 개수
- `df['컬럼명'].value_counts()` : 특정 컬럼에 포함된 데이터 값과 그 값별 개수
- `df.head()` : 데이터 값 확인
- `df.dtypes` : dataframe내의 컬럼 별 데이터 타입을 확인

In [70]:
import seaborn as sns

titanic = sns.load_dataset('titanic')
print(titanic.head())

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [71]:
titanic.shape

(891, 15)

In [72]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [73]:
titanic.nunique()

survived         2
pclass           3
sex              2
age             88
sibsp            7
parch            7
fare           248
embarked         3
class            3
who              3
adult_male       2
deck             7
embark_town      3
alive            2
alone            2
dtype: int64

In [74]:
titanic['survived'].value_counts()

0    549
1    342
Name: survived, dtype: int64

## 요약 통계량 확인
- `df.describe()` : 각 열의 개수, 평균, 표준편차, 최소값, 최대값, 4분위수의 요약 통계량 확인
- `df.describe(include=['object'])` : 범주형 데이터의 요약 통계량 확인
- `df.describe(percentiles = [.30, .75, .95, .99])` : 특정 비율에 따른 요약 통계량 확인

In [75]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


## 결측치 확인 & 처리
- `df.isnull().sum()`: pandas의 isnull(), sum() 메소드를 호출하여 dataframe에 컬럼별 결측치 개수
- `df.fillna(0)`: 값이 없는 것은 0으로 변환
- `df.replace()`: 첫번째 인자에 해당하는 문자열을 두번째 인자 문자열로 치환

In [76]:
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [81]:
# Replace the missing values in the 'age' column with that column's median, using fillna().
titanic['age'] = titanic['age'].fillna(titanic['age'].median()) 

In [84]:
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [82]:
titanic['age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [88]:
# Bucket the 'age' column into labelled groups and store the result in a new
# 'AgeGroup' column. pd.cut assigns each age to the interval it falls in, and
# the intervals are labelled one-to-one with group_names.

# Named age_bins rather than bin so the built-in bin() is not shadowed.
age_bins = [0, 18, 25, 35, 60, 100]
group_names = ['Baby', 'Youth', 'YoungAdult', 'MiddleAged', 'Senior']
titanic['AgeGroup'] = pd.cut(titanic['age'], bins=age_bins, labels=group_names)
titanic['AgeGroup'].value_counts()

YoungAdult    373
MiddleAged    195
Youth         162
Baby          139
Senior         22
Name: AgeGroup, dtype: int64

In [89]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,AgeGroup
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,Youth
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,MiddleAged
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,YoungAdult
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,YoungAdult
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,YoungAdult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,YoungAdult
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,Youth
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,YoungAdult
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,YoungAdult


## 집계분석
- `df.groupby('컬럼명').count()` : 개수
- `df.groupby('컬럼명').mean()` : 평균
- `df.groupby('컬럼명').sum()` : 합계
- `df[(df['컬럼명']==1)]` : 컬럼의 값 = 1 검색

In [77]:
titanic.groupby('survived').count()

Unnamed: 0_level_0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,549,549,424,549,549,549,549,549,549,549,67,549,549,549
1,342,342,290,342,342,342,340,342,342,342,136,340,342,342


In [79]:
# numeric_only=True limits the mean to numeric/boolean columns. pandas 2.0+
# no longer drops the text and categorical columns of titanic silently, so
# the call fails without it.
titanic.groupby('survived').mean(numeric_only=True)

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,adult_male,alone
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2.531876,30.626179,0.553734,0.32969,22.117887,0.817851,0.681239
1,1.950292,28.34369,0.473684,0.464912,48.395408,0.25731,0.476608


In [80]:
# numeric_only=True limits the sum to numeric/boolean columns. pandas 2.0+
# no longer drops the text and categorical columns of titanic silently, so
# they must be excluded explicitly.
titanic.groupby('survived').sum(numeric_only=True)

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,adult_male,alone
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1390,12985.5,304,181,12142.7199,449,374
1,667,8219.67,162,159,16551.2294,88,163
