In [1]:
import pandas as pd

# pandas series

In [5]:
# A Series whose values and index labels both mix integers and strings,
# which is why pandas reports dtype object.
ser = pd.Series(
    [100, 'foo', 300, 400, 500],
    index=[666, 'bob', 'nancy', 'dan', 'eric'],
)
ser

666      100
bob      foo
nancy    300
dan      400
eric     500
dtype: object

In [4]:
# Index labels of the Series.
ser.index

Index([666, 'bob', 'nancy', 'dan', 'eric'], dtype='object')

In [8]:
# Label-based lookup: the value stored under the label 'bob'.
ser['bob']

'foo'

In [9]:
# A list of labels selects several entries and returns a Series.
ser[['bob', 'nancy']]

bob      foo
nancy    300
dtype: object

In [10]:
# Select the entries at positions 1 and 3. Integer keys inside plain [] on a
# Series with non-integer labels are ambiguous (label vs. position) and rely
# on a positional fallback; .iloc states the positional intent explicitly.
ser.iloc[[1, 3]]

bob    foo
dan    400
dtype: object

In [11]:
# Position-based lookup: the element at position 2 (the 'nancy' entry).
ser.iloc[2]

300

In [12]:
# Membership test against the index labels ('bob' is a label of ser).
'bob' in ser

True

In [13]:
# Element-wise multiplication. With dtype object each element uses its own
# `*`, so the string 'foo' is repeated while the integers are doubled.
ser * 2

666         200
bob      foofoo
nancy       600
dan         800
eric       1000
dtype: object

In [14]:
# Select two entries by label, then square them element-wise.
ser[['nancy', 'eric']] ** 2

nancy     90000
eric     250000
dtype: object

# pandas data frames

In [16]:
# Two Series with partly overlapping labels, keyed by the column names they
# will get when the dict is turned into a DataFrame.
d = {
    'col_one': pd.Series([100, 200, 300], index=['apple', 'orange', 'banana']),
    'col_two': pd.Series([111, 222, 333, 444], index=['apple', 'ball', 'clock', 'water']),
}

In [17]:
# Build a DataFrame from the dict of Series. As the printed result shows, the
# rows are the union of both Series' labels and missing entries become NaN.
df = pd.DataFrame(d)
print(df)

        col_one  col_two
apple     100.0    111.0
ball        NaN    222.0
banana    300.0      NaN
clock       NaN    333.0
orange    200.0      NaN
water       NaN    444.0


In [18]:
# Row labels of the DataFrame.
df.index

Index(['apple', 'ball', 'banana', 'clock', 'orange', 'water'], dtype='object')

In [19]:
# Column labels of the DataFrame.
df.columns

Index(['col_one', 'col_two'], dtype='object')

In [20]:
# Passing index= keeps only the listed row labels from the Series in d.
pd.DataFrame(d, index = ['apple', 'ball', 'orange'])

Unnamed: 0,col_one,col_two
apple,100.0,111.0
ball,,222.0
orange,200.0,


In [22]:
# Restrict both rows and columns. 'five' is not a key of d, so that column
# comes out entirely empty (NaN).
pd.DataFrame(d, index = ['apple', 'ball', 'orange'], columns = ['col_two', 'five'])

Unnamed: 0,col_two,five
apple,111.0,
ball,222.0,
orange,,


## Create a DataFrame from a list of Python dicts

In [23]:
# One dict per future row; the dicts deliberately have different keys.
data = [
    {'alex': 1, 'joe': 2},
    {'ema': 5, 'dora': 10, 'alice': 20},
]

In [25]:
# Each dict becomes one row; the keys become columns and keys absent from a
# dict show up as NaN in that row.
df2 = pd.DataFrame(data)
print(df2)

   alex  joe  ema  dora  alice
0   1.0  2.0  NaN   NaN    NaN
1   NaN  NaN  5.0  10.0   20.0


In [27]:
# Same data, with explicit row labels instead of the default 0, 1.
df3 = pd.DataFrame(data, index=['row1', 'row2'])
df3

Unnamed: 0,alex,joe,ema,dora,alice
row1,1.0,2.0,,,
row2,,,5.0,10.0,20.0


In [30]:
# columns= keeps only the listed keys, in the given order.
df4 = pd.DataFrame(data, columns=['joe','dora', 'alice'])
df4

Unnamed: 0,joe,dora,alice
0,2.0,,
1,,10.0,20.0


In [36]:
# `in` on a DataFrame tests the column labels. 'apple' is a row label, not a
# column, so the result is False (df['apple'] would raise KeyError for the
# same reason, whereas df['col_one'] is a valid column lookup).
'apple' in df

False

In [43]:
# New column computed element-wise from two existing ones, aligned on the
# row labels. Any row where either operand is NaN yields NaN.
df['col_three'] = df['col_one'] + df['col_two']
df

Unnamed: 0,col_one,col_two,col_three
apple,100.0,111.0,211.0
ball,,222.0,
banana,300.0,,
clock,,333.0,
orange,200.0,,
water,,444.0,


In [38]:
# Boolean column from a comparison. Rows where col_two is NaN compare as
# False.
df['flag'] = df['col_two'] > 300
df

Unnamed: 0,col_one,col_two,col_three,flag
apple,100.0,111.0,211.0,False
ball,,222.0,,False
banana,300.0,,,False
clock,,333.0,,True
orange,200.0,,,False
water,,444.0,,True


In [44]:
# pop() removes the column from df and returns it as a Series.
three = df.pop('col_three')
three

apple     211.0
ball        NaN
banana      NaN
clock       NaN
orange      NaN
water       NaN
Name: col_three, dtype: float64

In [45]:
# del removes the column from df in place.
del df['flag']
df

Unnamed: 0,col_one,col_two
apple,100.0,111.0
ball,,222.0
banana,300.0,
clock,,333.0
orange,200.0,
water,,444.0


In [49]:
# Remove a 'copy_of_one' column left over from an earlier run of this cell,
# if there is one. An unconditional df.pop('copy_of_one') raises KeyError on
# a fresh kernel, and df.insert() refuses a column name that already exists.
if 'copy_of_one' in df.columns:
    df.pop('copy_of_one')
# Insert a copy of col_one as the first column (position 0).
df.insert(0, 'copy_of_one', df['col_one'])
df

Unnamed: 0,copy_of_one,col_one,col_two
apple,100.0,100.0,111.0
ball,,,222.0
banana,300.0,300.0,
clock,,,333.0
orange,200.0,200.0,
water,,,444.0


In [54]:
# Assign only the first two entries (by position) of col_one. The assignment
# aligns on row labels, so every other row of the new column is NaN.
df['upper_one'] = df['col_one'].iloc[:2]
df

Unnamed: 0,copy_of_one,col_one,col_two,upper_one
apple,100.0,100.0,111.0,100.0
ball,,,222.0,
banana,300.0,300.0,,
clock,,,333.0,
orange,200.0,200.0,,
water,,,444.0,


# Case study

In [56]:
!cat ./movielens/movies.csv | wc -l

    9743


In [58]:
# Load the movies table (comma is read_csv's default separator) and preview
# the first rows.
movies = pd.read_csv('./movielens/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [59]:
# Load the tags table (comma is read_csv's default separator) and preview
# the first rows.
tags = pd.read_csv('./movielens/tags.csv')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [61]:
# Load the ratings table and preview the first rows.
# The timestamp column holds plain integers (presumably Unix epoch seconds),
# and the displayed output shows parse_dates left them unconverted, so that
# argument is omitted here. If real datetimes are needed, convert explicitly:
#     pd.to_datetime(ratings['timestamp'], unit='s')
ratings = pd.read_csv('./movielens/ratings.csv', sep = ',')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [62]:
# Drop the timestamp columns before the analysis. drop(..., errors='ignore')
# keeps this cell re-runnable: `del frame['timestamp']` raises KeyError when
# the cell is executed a second time.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')
tags = tags.drop(columns=['timestamp'], errors='ignore')

In [67]:
# The first row of tags (by position) as a Series whose labels are the
# column names. dtype is object because the row mixes numbers and a string.
row0 = tags.iloc[0]
print(row0)

userId         2
movieId    60756
tag        funny
Name: 0, dtype: object


In [75]:
# Two ways to read the same element of the row; only the last expression is
# displayed. Integer keys in plain [] on a label-indexed Series are ambiguous
# (label vs. position), so the positional lookup uses .iloc explicitly.
row0['userId']
row0.iloc[0]

2

In [76]:
# Membership test against row0's labels (the column names of tags).
'apple' in row0

False

In [82]:
# Only the last expression of a cell is displayed, so this shows
# tags.columns; the head() and index results are computed and discarded.
tags.head()
tags.index
tags.columns

Index(['userId', 'movieId', 'tag'], dtype='object')

In [84]:
# Select the rows at positions 0, 11 and 200.
tags.iloc[[0,11,200]]

Unnamed: 0,userId,movieId,tag
0,2,60756,funny
11,18,431,gangster
200,62,60074,bad script


In [85]:
# Summary statistics (count, mean, std, min, quartiles, max) of the ratings.
ratings['rating'].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

# statistics

In [87]:
# Summary statistics for the single rating column.
ratings['rating'].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [89]:
# The same summary statistics, computed for every column of ratings.
ratings.describe()

Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042529
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


In [90]:
# Mean of the rating column.
ratings['rating'].mean()

3.501556983616962

In [91]:
# Column-wise means of the whole frame, returned as a Series.
ratings.mean()

userId       326.127564
movieId    19435.295718
rating         3.501557
dtype: float64

In [92]:
# Smallest rating value.
ratings['rating'].min()

0.5

In [93]:
# Largest rating value.
ratings['rating'].max()

5.0

In [94]:
# Standard deviation of the rating column.
ratings['rating'].std()

1.0425292390605359

In [95]:
# Most frequent rating value; mode() returns its result as a Series.
ratings['rating'].mode()

0    4.0
dtype: float64

In [96]:
# Pairwise correlation between the columns of ratings.
ratings.corr()

Unnamed: 0,userId,movieId,rating
userId,1.0,0.006773,-0.049348
movieId,0.006773,1.0,-0.004061
rating,-0.049348,-0.004061,1.0


In [99]:
# Boolean Series marking ratings above 5. The .any() result is not shown
# because only the last expression of the cell (the type) is displayed.
filter1 = ratings['rating'] > 5
filter1.any()
type(filter1)

pandas.core.series.Series

In [100]:
# Boolean Series marking ratings above 0; .all() checks that every rating
# satisfies the condition.
filter2 = ratings['rating'] > 0
filter2.all()

True