In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# A Series built from a plain list gets a default integer index (0, 1, 2, ...).
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:
# Swap the default integer labels for string labels, one per element.
data.index = list('abcd')

In [4]:
# Display the Series again to confirm the new string labels.
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [5]:
# Label-based lookup of a single element.
data.loc['b']

0.5

In [6]:
# State name -> population count; insertion order is kept by the dict.
population_dict = {
    'California': 38_332_521,
    'Texas': 26_448_193,
    'New York': 19_651_127,
    'Florida': 19_552_860,
    'Illinois': 12_882_135,
}

In [7]:
# Dict keys become the Series index, dict values become the data.
population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [8]:
# Dictionary-style access by state name.
population.loc['California']

38332521

In [9]:
# Label slicing includes BOTH endpoints, unlike positional slicing.
population.loc['California':'Illinois']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [10]:
# Column name -> list of column values.
d = dict(col1=[1, 2], col2=[3, 4])

In [11]:
# Each dict key becomes a column; rows get a default integer index.
df = pd.DataFrame(d)
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


In [12]:
# Seed NumPy's global RNG before the first random draw in the notebook so this
# frame (and later np.random calls) is reproducible on Restart & Run All.
np.random.seed(0)

# 5x5 frame of random integers drawn from [0, 10), with labelled columns.
df2 = pd.DataFrame(
    np.random.randint(low=0, high=10, size=(5, 5)),
    columns=['a', 'b', 'c', 'd', 'e'],
)
df2

Unnamed: 0,a,b,c,d,e
0,4,5,2,8,8
1,4,2,7,3,8
2,7,5,7,6,4
3,8,2,8,0,0
4,4,1,4,1,9


In [13]:
# State name -> area, keyed identically to population_dict.
area_dict = {
    'California': 423_967,
    'Texas': 695_662,
    'New York': 141_297,
    'Florida': 170_312,
    'Illinois': 149_995,
}

In [14]:
# Same dict-to-Series conversion as for the population data.
area = pd.Series(data=area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [15]:
# Combine the two Series into one frame; each Series becomes a column and the
# shared state names become the row index.
states = pd.DataFrame(dict(population=population, area=area))
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [16]:
# Six consecutive calendar days starting 2013-01-01.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [17]:
# 6x4 frame of standard-normal samples, indexed by the dates built above.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,2.758689,0.278113,2.494974,0.010741
2013-01-02,0.325493,-0.970221,-0.881164,1.210782
2013-01-03,1.354842,0.313634,1.224231,-0.235177
2013-01-04,0.885647,-0.297321,-1.628925,0.472148
2013-01-05,-0.883835,0.699636,0.397156,1.072433
2013-01-06,0.888353,2.125925,-1.507256,-1.243995


In [18]:
# First five rows.
df.head(5)

Unnamed: 0,A,B,C,D
2013-01-01,2.758689,0.278113,2.494974,0.010741
2013-01-02,0.325493,-0.970221,-0.881164,1.210782
2013-01-03,1.354842,0.313634,1.224231,-0.235177
2013-01-04,0.885647,-0.297321,-1.628925,0.472148
2013-01-05,-0.883835,0.699636,0.397156,1.072433


In [19]:
# Last three rows.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2013-01-04,0.885647,-0.297321,-1.628925,0.472148
2013-01-05,-0.883835,0.699636,0.397156,1.072433
2013-01-06,0.888353,2.125925,-1.507256,-1.243995


In [20]:
# Row labels of the frame (here the DatetimeIndex it was built with).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [21]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [22]:
# Underlying data as a 2-D NumPy array; row and column labels are not included.
df.values

array([[ 2.7586886 ,  0.27811306,  2.49497409,  0.01074096],
       [ 0.3254933 , -0.97022098, -0.88116387,  1.21078182],
       [ 1.3548424 ,  0.31363401,  1.22423075, -0.23517736],
       [ 0.88564719, -0.29732101, -1.62892456,  0.47214846],
       [-0.88383453,  0.69963593,  0.39715609,  1.07243288],
       [ 0.88835281,  2.1259253 , -1.5072558 , -1.24399463]])

In [23]:
# Per-column summary: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.888198,0.358294,0.016503,0.214489
std,1.197767,1.043479,1.648114,0.912792
min,-0.883835,-0.970221,-1.628925,-1.243995
25%,0.465532,-0.153462,-1.350733,-0.173698
50%,0.887,0.295874,-0.242004,0.241445
75%,1.23822,0.603135,1.017462,0.922362
max,2.758689,2.125925,2.494974,1.210782


In [24]:
# Reorder the columns by label, descending (D, C, B, A); rows are untouched.
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.010741,2.494974,0.278113,2.758689
2013-01-02,1.210782,-0.881164,-0.970221,0.325493
2013-01-03,-0.235177,1.224231,0.313634,1.354842
2013-01-04,0.472148,-1.628925,-0.297321,0.885647
2013-01-05,1.072433,0.397156,0.699636,-0.883835
2013-01-06,-1.243995,-1.507256,2.125925,0.888353


In [25]:
# Reorder the rows by the values in column B, ascending.
df.sort_values('B')

Unnamed: 0,A,B,C,D
2013-01-02,0.325493,-0.970221,-0.881164,1.210782
2013-01-04,0.885647,-0.297321,-1.628925,0.472148
2013-01-01,2.758689,0.278113,2.494974,0.010741
2013-01-03,1.354842,0.313634,1.224231,-0.235177
2013-01-05,-0.883835,0.699636,0.397156,1.072433
2013-01-06,0.888353,2.125925,-1.507256,-1.243995


In [26]:
# Select one row by its label (the first date); the result is a Series
# indexed by column name.
df.loc[dates[0]]

A    2.758689
B    0.278113
C    2.494974
D    0.010741
Name: 2013-01-01 00:00:00, dtype: float64

In [27]:
# Keep every row, but only columns A and B.
df[['A', 'B']]

Unnamed: 0,A,B
2013-01-01,2.758689,0.278113
2013-01-02,0.325493,-0.970221
2013-01-03,1.354842,0.313634
2013-01-04,0.885647,-0.297321
2013-01-05,-0.883835,0.699636
2013-01-06,0.888353,2.125925


In [28]:
# Label slice on the date index using date strings; both endpoints are
# included in the result. Columns are restricted to A and B.
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,0.325493,-0.970221
2013-01-03,1.354842,0.313634
2013-01-04,0.885647,-0.297321


In [29]:
# Select by integer position: the row at position 3 (the fourth row).
df.iloc[3]

A    0.885647
B   -0.297321
C   -1.628925
D    0.472148
Name: 2013-01-04 00:00:00, dtype: float64

In [30]:
# Positional slice: rows 3-4 and the first two columns (end-exclusive).
df.iloc[3:5, :2]

Unnamed: 0,A,B
2013-01-04,0.885647,-0.297321
2013-01-05,-0.883835,0.699636


In [31]:
# Boolean-mask the whole frame: cells where the condition is False are shown
# as missing values, the frame's shape is preserved.
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,2.758689,0.278113,2.494974,0.010741
2013-01-02,0.325493,,,1.210782
2013-01-03,1.354842,0.313634,1.224231,
2013-01-04,0.885647,,,0.472148
2013-01-05,,0.699636,0.397156,1.072433
2013-01-06,0.888353,2.125925,,


In [32]:
# Work on an independent copy so the next cells do not modify df.
df2 = df.copy(deep=True)

In [33]:
# Add a string column E, one value per row.
df2['E'] = [
    'one', 'one', 'two',
    'three', 'four', 'three',
]
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,2.758689,0.278113,2.494974,0.010741,one
2013-01-02,0.325493,-0.970221,-0.881164,1.210782,one
2013-01-03,1.354842,0.313634,1.224231,-0.235177,two
2013-01-04,0.885647,-0.297321,-1.628925,0.472148,three
2013-01-05,-0.883835,0.699636,0.397156,1.072433,four
2013-01-06,0.888353,2.125925,-1.507256,-1.243995,three


In [34]:
# Keep only the rows whose E value is one of the listed labels.
df2.loc[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,1.354842,0.313634,1.224231,-0.235177,two
2013-01-05,-0.883835,0.699636,0.397156,1.072433,four


In [35]:
# Integer Series on a daily index that starts one day AFTER df's index.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20130102', periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [36]:
# Assigning a Series as a new column aligns it on the index: dates that s1
# does not cover become missing values (so F is shown as float), and s1
# entries whose date is not in df's index are left out.
df['F']=s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,2.758689,0.278113,2.494974,0.010741,
2013-01-02,0.325493,-0.970221,-0.881164,1.210782,1.0
2013-01-03,1.354842,0.313634,1.224231,-0.235177,2.0
2013-01-04,0.885647,-0.297321,-1.628925,0.472148,3.0
2013-01-05,-0.883835,0.699636,0.397156,1.072433,4.0
2013-01-06,0.888353,2.125925,-1.507256,-1.243995,5.0


In [37]:
# Set a single cell by row label and column label.
# NOTE: this mutates df in place; re-display to show the updated value.
df.at[dates[0], 'A']=0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.278113,2.494974,0.010741,
2013-01-02,0.325493,-0.970221,-0.881164,1.210782,1.0
2013-01-03,1.354842,0.313634,1.224231,-0.235177,2.0
2013-01-04,0.885647,-0.297321,-1.628925,0.472148,3.0
2013-01-05,-0.883835,0.699636,0.397156,1.072433,4.0
2013-01-06,0.888353,2.125925,-1.507256,-1.243995,5.0


In [38]:
# Set a single cell by integer position: row 0, column 1 (column B).
# NOTE: this mutates df in place; re-display to show the updated value.
df.iat[0,1]=0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,2.494974,0.010741,
2013-01-02,0.325493,-0.970221,-0.881164,1.210782,1.0
2013-01-03,1.354842,0.313634,1.224231,-0.235177,2.0
2013-01-04,0.885647,-0.297321,-1.628925,0.472148,3.0
2013-01-05,-0.883835,0.699636,0.397156,1.072433,4.0
2013-01-06,0.888353,2.125925,-1.507256,-1.243995,5.0


In [39]:
# Replace column D with a constant integer array.
# Plain column assignment is used instead of `df.loc[:, 'D'] = ...`: in recent
# pandas versions the .loc form writes into the existing float column in place
# (values become 5.0), whereas df['D'] = ... always swaps in the new array, so
# the column takes the array's integer dtype regardless of pandas version.
df['D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,2.494974,5,
2013-01-02,0.325493,-0.970221,-0.881164,5,1.0
2013-01-03,1.354842,0.313634,1.224231,5,2.0
2013-01-04,0.885647,-0.297321,-1.628925,5,3.0
2013-01-05,-0.883835,0.699636,0.397156,5,4.0
2013-01-06,0.888353,2.125925,-1.507256,5,5.0


In [40]:
# Mean of each column (reduce down the rows).
df.mean(axis=0)

A    0.428417
B    0.311942
C    0.016503
D    5.000000
F    3.000000
dtype: float64

In [41]:
# Mean of each row (reduce across the columns).
df.mean(axis='columns')

2013-01-01    1.873744
2013-01-02    0.894822
2013-01-03    1.978541
2013-01-04    1.391880
2013-01-05    1.842591
2013-01-06    2.301404
Freq: D, dtype: float64

In [42]:
# Running total down each column. The built-in DataFrame.cumsum is used
# instead of df.apply(np.cumsum): it yields the same per-column cumulative sum
# (missing values stay missing and are skipped) without routing a NumPy
# function through apply.
df.cumsum()

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,2.494974,5,
2013-01-02,0.325493,-0.970221,1.61381,10,1.0
2013-01-03,1.680336,-0.656587,2.838041,15,3.0
2013-01-04,2.565983,-0.953908,1.209116,20,6.0
2013-01-05,1.682148,-0.254272,1.606272,25,10.0
2013-01-06,2.570501,1.871653,0.099017,30,15.0


In [43]:
# Range (max minus min) of every column.
df.max() - df.min()

A    2.238677
B    3.096146
C    4.123899
D    0.000000
F    4.000000
dtype: float64

In [44]:
# Count how many times each distinct value occurs in column A.
df['A'].value_counts()

 0.000000    1
 0.325493    1
 1.354842    1
 0.885647    1
-0.883835    1
 0.888353    1
Name: A, dtype: int64

In [45]:
# Left-hand table for the merge example: a join key plus one value column.
left = pd.DataFrame(data=dict(key=['foo', 'bar'], lval=[1, 2]))

In [46]:
# Right-hand table for the merge example, sharing the same join key.
right = pd.DataFrame(data=dict(key=['foo', 'bar'], rval=[4, 5]))

In [47]:
# Show the left-hand table before merging.
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [48]:
# Show the right-hand table before merging.
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [49]:
# Join the two tables on their shared 'key' column.
pd.merge(left=left, right=right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


In [50]:
# Fresh 8x4 frame of standard-normal samples with a default integer index.
df = pd.DataFrame(
    data=np.random.randn(8, 4),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
0,0.250835,0.005711,-0.308808,1.321909
1,0.76312,1.994763,-0.921085,-1.089254
2,2.143601,-0.466768,-0.616257,0.41356
3,1.762505,-1.635557,-0.338073,0.347718
4,-0.531461,0.313589,1.760925,-0.193183
5,0.636217,-0.046852,0.357874,-0.505763
6,-1.126569,-1.530734,-1.378126,1.321969
7,1.681342,0.524044,0.613941,-0.392417


In [51]:
# Take the row at position 3 as a Series (to be appended back below).
s = df.iloc[3, :]

In [52]:
# Append the row held in `s` to the end of df and renumber the index 0..n.
# DataFrame.append no longer exists in pandas 2.x, so the Series is turned
# into a one-row frame (to_frame().T) and stacked with pd.concat, which works
# on old and new pandas alike. df itself is not modified.
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,0.250835,0.005711,-0.308808,1.321909
1,0.76312,1.994763,-0.921085,-1.089254
2,2.143601,-0.466768,-0.616257,0.41356
3,1.762505,-1.635557,-0.338073,0.347718
4,-0.531461,0.313589,1.760925,-0.193183
5,0.636217,-0.046852,0.357874,-0.505763
6,-1.126569,-1.530734,-1.378126,1.321969
7,1.681342,0.524044,0.613941,-0.392417
8,1.762505,-1.635557,-0.338073,0.347718


In [53]:
# Frame for the groupby examples: two string key columns (A, B) and two
# columns of standard-normal samples (C drawn first, then D).
df = pd.DataFrame({
    'A': ['foo', 'bar'] * 4,
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

In [54]:
# Display the frame that the groupby cells below operate on.
df

Unnamed: 0,A,B,C,D
0,foo,one,-1.187785,0.095267
1,bar,one,-0.157241,-1.432112
2,foo,two,-0.239852,-0.254292
3,bar,three,-0.437276,-0.131091
4,foo,two,0.520234,0.58249
5,bar,two,-0.014814,-0.890389
6,foo,one,0.659944,0.643184
7,bar,three,-0.917873,0.792226


In [55]:
# Grouping on its own returns a DataFrameGroupBy object rather than a table;
# an aggregation such as .sum() has to be applied to get a result.
df.groupby('A')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000011A988B12B0>

In [56]:
# Sum the numeric columns (C, D) within each group of A.
# numeric_only=True is passed explicitly: newer pandas versions no longer drop
# non-numeric columns automatically, so without it the string column B would
# be "summed" (concatenated) as well.
df.groupby('A').sum(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.527204,-1.661365
foo,-0.247458,1.06665


In [57]:
# Group on two keys; the result carries a two-level (A, B) row index.
df.groupby(by=['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.157241,-1.432112
bar,three,-1.355149,0.661136
bar,two,-0.014814,-0.890389
foo,one,-0.527841,0.738452
foo,two,0.280383,0.328198


In [60]:
# Load the CSV and bind it to `df`. Previously the loaded frame was only
# displayed and then discarded, while the next cell's df.head() shows these
# CSV columns - which can only work through state left over from a cell that
# is no longer in the notebook. Assigning here makes a fresh
# Restart & Run All produce the displayed result.
df = pd.read_csv('dataset.csv')
df

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageCars,GarageArea,WoodDeckSF,SalePrice
0,60,8450,7,5,2003,2003,706,0,150,856,856,854,1710,2,548,0,208500
1,20,9600,6,8,1976,1976,978,0,284,1262,1262,0,1262,2,460,298,181500
2,60,11250,7,5,2001,2002,486,0,434,920,920,866,1786,2,608,0,223500
3,70,9550,7,5,1915,1970,216,0,540,756,961,756,1717,3,642,0,140000
4,60,14260,8,5,2000,2000,655,0,490,1145,1145,1053,2198,3,836,192,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,7917,6,5,1999,2000,0,0,953,953,953,694,1647,2,460,0,175000
1456,20,13175,6,6,1978,1988,790,163,589,1542,2073,0,2073,2,500,349,210000
1457,70,9042,7,9,1941,2006,275,0,877,1152,1188,1152,2340,1,252,0,266500
1458,20,9717,5,6,1950,1996,49,1029,0,1078,1078,0,1078,1,240,366,142125


In [59]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageCars,GarageArea,WoodDeckSF,SalePrice
0,60,8450,7,5,2003,2003,706,0,150,856,856,854,1710,2,548,0,208500
1,20,9600,6,8,1976,1976,978,0,284,1262,1262,0,1262,2,460,298,181500
2,60,11250,7,5,2001,2002,486,0,434,920,920,866,1786,2,608,0,223500
3,70,9550,7,5,1915,1970,216,0,540,756,961,756,1717,3,642,0,140000
4,60,14260,8,5,2000,2000,655,0,490,1145,1145,1053,2198,3,836,192,250000
