# 배열 데이터 NumPy


In [3]:
import pandas as pd

In [6]:
import numpy as np

In [9]:
s1 = pd.Series([1, 3, 5, np.nan, 6, 8])
s1


0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [10]:
s1.index


RangeIndex(start=0, stop=6, step=1)

In [11]:
s1.values


array([ 1.,  3.,  5., nan,  6.,  8.])

In [12]:

s1 = pd.Series([np.nan, np.inf, 0, 1, 2, 3, 'a', 'b', 'c'])
s1

0    NaN
1    inf
2      0
3      1
4      2
5      3
6      a
7      b
8      c
dtype: object

In [13]:
s1.values


array([nan, inf, 0, 1, 2, 3, 'a', 'b', 'c'], dtype=object)

## index 설정하기


In [14]:
index_seq = range(10,20)
seq_data = range(10)
s2 = pd.Series(data=seq_data, index=index_seq)
s2

10    0
11    1
12    2
13    3
14    4
15    5
16    6
17    7
18    8
19    9
dtype: int64

In [16]:
s2.values


array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64)

In [15]:
index_date = ['2018-10-17', '2018-10-18', '2018-10-19', '2018-10-20']
s3 = pd.Series([200, 195, np.nan, 205], index=index_date)
s3

2018-10-17    200.0
2018-10-18    195.0
2018-10-19      NaN
2018-10-20    205.0
dtype: float64

In [17]:
pd.date_range('2019-05-25', '2019-05-28')


DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28'], dtype='datetime64[ns]', freq='D')

In [18]:
pd.date_range(start="2019-05-25", periods=8, freq='2D')


DatetimeIndex(['2019-05-25', '2019-05-27', '2019-05-29', '2019-05-31',
               '2019-06-02', '2019-06-04', '2019-06-06', '2019-06-08'],
              dtype='datetime64[ns]', freq='2D')

In [20]:
table_data = {'연도':list(range(2010, 2020)),
              '매출액': [v**2 for i, v in enumerate(range(1,11))],
              '종업원 수': list(range(2, 30, 3))}


df = pd.DataFrame(table_data)
df


Unnamed: 0,연도,매출액,종업원 수
0,2010,1,2
1,2011,4,5
2,2012,9,8
3,2013,16,11
4,2014,25,14
5,2015,36,17
6,2016,49,20
7,2017,64,23
8,2018,81,26
9,2019,100,29


In [21]:
df.head()

Unnamed: 0,연도,매출액,종업원 수
0,2010,1,2
1,2011,4,5
2,2012,9,8
3,2013,16,11
4,2014,25,14


In [22]:
df.tail(3)


Unnamed: 0,연도,매출액,종업원 수
7,2017,64,23
8,2018,81,26
9,2019,100,29


In [23]:
df.index


RangeIndex(start=0, stop=10, step=1)

In [24]:
df.columns


Index(['연도', '매출액', '종업원 수'], dtype='object')

In [25]:
df1 = pd.DataFrame({'봄': [256.5, 264.3, 215.9, 223.2, 312.8],
                    '여름': [770.6, 567.5, 599.8, 387.1, 446.2],
                    '가을': [363.5, 231.2, 293.1, 247.7, 381.6],
                    '겨울': [139.3, 59.9, 76.9, 109.1, 108.1]},
                    index=['2012', '2013', '2014', '2015', '2016'])
df1

Unnamed: 0,봄,여름,가을,겨울
2012,256.5,770.6,363.5,139.3
2013,264.3,567.5,231.2,59.9
2014,215.9,599.8,293.1,76.9
2015,223.2,387.1,247.7,109.1
2016,312.8,446.2,381.6,108.1


In [26]:
df1.sum()


봄     1272.7
여름    2771.2
가을    1517.1
겨울     493.3
dtype: float64

In [27]:
df1.sum(axis=1)


2012    1529.9
2013    1122.9
2014    1185.7
2015     967.1
2016    1248.7
dtype: float64

In [28]:
df1.cumsum()


Unnamed: 0,봄,여름,가을,겨울
2012,256.5,770.6,363.5,139.3
2013,520.8,1338.1,594.7,199.2
2014,736.7,1937.9,887.8,276.1
2015,959.9,2325.0,1135.5,385.2
2016,1272.7,2771.2,1517.1,493.3


In [29]:
df1.describe()

Unnamed: 0,봄,여름,가을,겨울
count,5.0,5.0,5.0,5.0
mean,254.54,554.24,303.42,98.66
std,38.628267,148.888895,67.358496,30.925523
min,215.9,387.1,231.2,59.9
25%,223.2,446.2,247.7,76.9
50%,256.5,567.5,293.1,108.1
75%,264.3,599.8,363.5,109.1
max,312.8,770.6,381.6,139.3


In [30]:
df1.info()


<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 2012 to 2016
Data columns (total 4 columns):
봄     5 non-null float64
여름    5 non-null float64
가을    5 non-null float64
겨울    5 non-null float64
dtypes: float64(4)
memory usage: 360.0+ bytes


In [31]:
df1.loc['2013':, '봄':]

Unnamed: 0,봄,여름,가을,겨울
2013,264.3,567.5,231.2,59.9
2014,215.9,599.8,293.1,76.9
2015,223.2,387.1,247.7,109.1
2016,312.8,446.2,381.6,108.1


In [35]:
df1 = pd.DataFrame({'Class1': [95, 92, 98, 100],
                    'Class2': [91, 93, 98, 100]})

df4 = pd.DataFrame({'Class3': [92,93,94,91,99]})
df4

Unnamed: 0,Class3
0,92
1,93
2,94
3,91
4,99


In [36]:
df1.join(df4)


Unnamed: 0,Class1,Class2,Class3
0,95,91,92
1,92,93,93
2,98,98,94
3,100,100,91


In [37]:
df1.join(df4, how = 'outer')

Unnamed: 0,Class1,Class2,Class3
0,95.0,91.0,92
1,92.0,93.0,93
2,98.0,98.0,94
3,100.0,100.0,91
4,,,99


In [None]:
import os

file_list = []

for i in range(30):
    i+1 = 1
    if i == 1 :

In [66]:
import os 
#df1 = pd.read_csv('./2019-04-30.csv')
cur_dir = os.getcwd()
#data = pd.DataFrame()

for i in range(30):
    i += 1
    if i == 1 :
        data= pd.read_csv('./201904/2019-04-01.csv', error_bad_lines = False, header = None)
    elif i <10 : 
        path = './201904/2019-04-0' + str(i) +'.csv'
        data_path = os.path.join(cur_dir, path)
        data =data.append(pd.read_csv(data_path , error_bad_lines = False, header = None))
        print(data.info())
    else:
        path = './201904/2019-04-' + str(i) +'.csv'
        data_path = os.path.join(cur_dir, path)
        data = data.append(pd.read_csv(data_path , error_bad_lines = False, header = None))

print(data.info())
#df1 = pd.read_csv('.201904/2019-04-01.csv', error_bad_lines = False, header = None)

#df1.head()

b'Skipping line 36560: expected 8 fields, saw 11\n'
b'Skipping line 222462: expected 8 fields, saw 11\n'


<class 'pandas.core.frame.DataFrame'>
Int64Index: 464862 entries, 0 to 256114
Data columns (total 8 columns):
0    464862 non-null object
1    464862 non-null int64
2    464862 non-null object
3    464862 non-null object
4    464862 non-null object
5    464862 non-null int64
6    95788 non-null float64
7    95788 non-null float64
dtypes: float64(2), int64(2), object(4)
memory usage: 31.9+ MB
None


b'Skipping line 27894: expected 8 fields, saw 11\n'


<class 'pandas.core.frame.DataFrame'>
Int64Index: 724849 entries, 0 to 259986
Data columns (total 8 columns):
0    724849 non-null object
1    724849 non-null int64
2    724849 non-null object
3    724849 non-null object
4    724849 non-null object
5    724849 non-null int64
6    165971 non-null float64
7    165971 non-null float64
dtypes: float64(2), int64(2), object(4)
memory usage: 49.8+ MB
None


b'Skipping line 45907: expected 8 fields, saw 11\n'


<class 'pandas.core.frame.DataFrame'>
Int64Index: 950356 entries, 0 to 225506
Data columns (total 8 columns):
0    950356 non-null object
1    950356 non-null int64
2    950356 non-null object
3    950356 non-null object
4    950356 non-null object
5    950356 non-null int64
6    202045 non-null float64
7    202045 non-null float64
dtypes: float64(2), int64(2), object(4)
memory usage: 65.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1174964 entries, 0 to 224607
Data columns (total 8 columns):
0    1174964 non-null object
1    1174964 non-null int64
2    1174964 non-null object
3    1174964 non-null object
4    1174964 non-null object
5    1174964 non-null int64
6    238411 non-null float64
7    238411 non-null float64
dtypes: float64(2), int64(2), object(4)
memory usage: 80.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1430920 entries, 0 to 255955
Data columns (total 8 columns):
0    1430920 non-null object
1    1430920 non-null int64
2    1430920 non-null

b'Skipping line 45538: expected 8 fields, saw 11\n'
b'Skipping line 89789: expected 8 fields, saw 11\n'
b'Skipping line 219259: expected 8 fields, saw 11\n'
b'Skipping line 129303: expected 8 fields, saw 11\n'
b'Skipping line 272309: expected 8 fields, saw 11\n'
b'Skipping line 211460: expected 8 fields, saw 11\n'
b'Skipping line 57879: expected 8 fields, saw 11\n'
b'Skipping line 170099: expected 8 fields, saw 11\n'
b'Skipping line 168991: expected 8 fields, saw 11\n'
b'Skipping line 61227: expected 8 fields, saw 11\n'
b'Skipping line 139967: expected 8 fields, saw 11\n'
b'Skipping line 61480: expected 8 fields, saw 11\n'
b'Skipping line 77728: expected 8 fields, saw 11\n'
b'Skipping line 271889: expected 8 fields, saw 11\n'
b'Skipping line 223096: expected 8 fields, saw 11\n'
b'Skipping line 104855: expected 8 fields, saw 11\n'
b'Skipping line 18081: expected 8 fields, saw 11\n'


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7972751 entries, 0 to 298849
Data columns (total 8 columns):
0    object
1    int64
2    object
3    object
4    object
5    int64
6    float64
7    float64
dtypes: float64(2), int64(2), object(4)
memory usage: 547.4+ MB
None


In [None]:
import os

cur_dir = os.getcwd()
path = './201904/2019-04-' + str(i) +'.csv'
        data_path = os.path.join(cur_dir, path)
        data.append(pd.read_csv(data_path , error_bad_lines = False, header = None))



source_dir = source_dir


input_files = sorted([os.path.join(cur_dir, f) \
for f in os.listdir(cur_dir) \
    if (f.lower().endswith('.csv')) \
        os.path.isfile(os.path.join(self.source_dir, f))])

assert len(self.input_files) > 0



In [64]:
import os

cur_dir = os.getcwd()

os.path.isfile(os.path.join(cur_dir, 



SyntaxError: unexpected EOF while parsing (<ipython-input-64-4f6362c5ee83>, line 6)

In [None]:
data_range('2019-0')