In [1]:
import numpy as np
import pandas as pd
import altair as alt

In [2]:
# Small example mapping: string keys -> integer values.
d = dict(a=5, b=10, chris=30)

In [3]:
d["b"]

10

In [4]:
# Mixed mapping: two list-valued entries of equal length and one scalar entry.
e = dict(a=[5, 6, 7], b=10, chris=[30, 2.4, -20])

In [5]:
# Build a DataFrame from the dict: each key becomes a column, the list values
# supply the rows, and the scalar stored under "b" is repeated on every row.
df = pd.DataFrame(e)

In [6]:
df

Unnamed: 0,a,b,chris
0,5,10,30.0
1,6,10,2.4
2,7,10,-20.0


In [7]:
# Attribute-style column access; gives the same Series as df["a"].
df.a

0    5
1    6
2    7
Name: a, dtype: int64

In [8]:
df["a"]

0    5
1    6
2    7
Name: a, dtype: int64

In [9]:
# A flat dict converts to a Series: the keys become the index, the values the data.
pd.Series(d)

a         5
b        10
chris    30
dtype: int64

In [10]:
df

Unnamed: 0,a,b,chris
0,5,10,30.0
1,6,10,2.4
2,7,10,-20.0


In [11]:
df.columns

Index(['a', 'b', 'chris'], dtype='object')

In [12]:
# The quoted 'true' is a str, not the bool True, so this builds a list of
# three strings rather than three booleans.
['true' for x in range(3)]

['true', 'true', 'true']

In [13]:
["true" for x in range(len(df))]

['true', 'true', 'true']

In [14]:
# Intentional error: unquoted lowercase `true` is looked up as a variable name,
# and no such name is defined, so this raises NameError. The bool literal is True.
[true for x in range(len(df))]

NameError: name 'true' is not defined

In [15]:
[True for x in range(len(df))]

[True, True, True]

In [16]:
# Add a column from a list that has exactly one entry per row (len(df) entries).
df["new column"] = [True for x in range(len(df))]

In [17]:
df

Unnamed: 0,a,b,chris,new column
0,5,10,30.0,True
1,6,10,2.4,True
2,7,10,-20.0,True


In [18]:
# Assigning a single scalar fills the whole new column with that value.
df["new column 2"] = False

In [19]:
df

Unnamed: 0,a,b,chris,new column,new column 2
0,5,10,30.0,True,False
1,6,10,2.4,True,False
2,7,10,-20.0,True,False


In [20]:
# iloc is position-based: row position 1, column position 0 (column "a").
df.iloc[1,0] = 20
df

Unnamed: 0,a,b,chris,new column,new column 2
0,5,10,30.0,True,False
1,20,10,2.4,True,False
2,7,10,-20.0,True,False


In [21]:
# loc is label-based: row label 1, column label 'a'. This addresses the same
# cell as df.iloc[1, 0], so the displayed frame does not change.
df.loc[1,'a'] = 20
df

Unnamed: 0,a,b,chris,new column,new column 2
0,5,10,30.0,True,False
1,20,10,2.4,True,False
2,7,10,-20.0,True,False


In [22]:
df["a"]

0     5
1    20
2     7
Name: a, dtype: int64

In [23]:
df["a"].sort_values()

0     5
2     7
1    20
Name: a, dtype: int64

In [24]:
df["a"].sort_values(ascending=False)

1    20
2     7
0     5
Name: a, dtype: int64

In [25]:
# Sort descending and take position 1 -> the second-largest value in column "a".
df["a"].sort_values(ascending=False).iloc[1]

7

In [26]:
# Ascending sort with position -2 (second from the end) gives the same
# second-largest value.
df["a"].sort_values().iloc[-2]

7

In [27]:
df

Unnamed: 0,a,b,chris,new column,new column 2
0,5,10,30.0,True,False
1,20,10,2.4,True,False
2,7,10,-20.0,True,False


In [28]:
# Median of each column (axis=0). The boolean columns are included and are
# reported as 1.0 / 0.0 in the float64 result.
df.median(axis=0)

a                7.0
b               10.0
chris            2.4
new column       1.0
new column 2     0.0
dtype: float64

In [29]:
# 2x3 integer array, given row by row.
A = np.array([(2, 5, 1), (3, 1, 10)])

In [30]:
# Flatten the 2-D array into one dimension, row after row.
A.ravel()

array([ 2,  5,  1,  3,  1, 10])

In [31]:
df

Unnamed: 0,a,b,chris,new column,new column 2
0,5,10,30.0,True,False
1,20,10,2.4,True,False
2,7,10,-20.0,True,False


In [32]:
# Row label 1, every second column (step-2 slice across the columns):
# "a", "chris" and "new column 2".
df.loc[1,::2]

a                  20
chris             2.4
new column 2    False
Name: 1, dtype: object

In [33]:
# Overwrite those three cells of row 1 with 0, 1, 4 (i**2 for i = 0, 1, 2).
df.loc[1,::2] = [i**2 for i in range(3)]

In [34]:
# The same assignment spelled out with literal values.
# Note that the 4 lands in the boolean column "new column 2", which afterwards
# shows 4 next to False.
df.loc[1,::2] = [0,1,4]

In [35]:
df

Unnamed: 0,a,b,chris,new column,new column 2
0,5,10,30.0,True,False
1,0,10,1.0,True,4
2,7,10,-20.0,True,False


In [36]:
# Take the first three columns as an independent copy. df2 is assigned to in a
# later cell (df2.loc[1] = ...); without .copy() that write targets a slice of
# df, which pandas flags with SettingWithCopyWarning and which leaves it unclear
# whether df itself is modified.
df2 = df.iloc[:, :3].copy()
df2

Unnamed: 0,a,b,chris
0,5,10,30.0
1,0,10,1.0
2,7,10,-20.0


In [37]:
# Replace row 1 with its element-wise square. The cell reads and writes the same
# row, so running it a second time squares the row again.
df2.loc[1] = df2.loc[1]**2

In [38]:
df2

Unnamed: 0,a,b,chris
0,5,10,30.0
1,0,100,1.0
2,7,10,-20.0


In [39]:
df2**2

Unnamed: 0,a,b,chris
0,25,100,900.0
1,0,10000,1.0
2,49,100,400.0


In [40]:
# applymap calls the function once per cell; here it reproduces df2**2.
# NOTE(review): recent pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map — confirm against the pandas version used for this notebook.
df2.applymap(lambda x: x**2)

Unnamed: 0,a,b,chris
0,25,100,900.0
1,0,10000,1.0
2,49,100,400.0


In [41]:
df2

Unnamed: 0,a,b,chris
0,5,10,30.0
1,0,100,1.0
2,7,10,-20.0


In [42]:
# axis=0: the function receives one column at a time -> one sum per column.
df2.apply(lambda c: c.sum(), axis = 0)

a         12.0
b        120.0
chris     11.0
dtype: float64

In [43]:
# axis=1: the function receives one row at a time -> one sum per row.
df2.apply(lambda r: r.sum(), axis = 1)

0     45.0
1    101.0
2     -3.0
dtype: float64

In [44]:
# Intentional error: applymap hands the lambda individual cell values (plain
# numbers), and those have no .sum method, so this raises AttributeError.
df2.applymap(lambda a: a.sum())

AttributeError: 'int' object has no attribute 'sum'

In [45]:
# The same failure in isolation: a bare float has no .sum method.
a = 5.1
a.sum()

AttributeError: 'float' object has no attribute 'sum'

In [46]:
# Load the Spotify dataset. na_values=" " makes cells that contain a single
# space count as missing. This rebinds df, so the small example frame built
# earlier is no longer reachable under that name.
df = pd.read_csv("../data/spotify_dataset.csv", na_values=" ")

In [47]:
df.columns

Index(['Index', 'Highest Charting Position', 'Number of Times Charted',
       'Week of Highest Charting', 'Song Name', 'Streams', 'Artist',
       'Artist Followers', 'Song ID', 'Genre', 'Release Date', 'Weeks Charted',
       'Popularity', 'Danceability', 'Energy', 'Loudness', 'Speechiness',
       'Acousticness', 'Liveness', 'Tempo', 'Duration (ms)', 'Valence',
       'Chord'],
      dtype='object')

In [48]:
df.Genre

0                  ['indie rock italiano', 'italian pop']
1                                  ['australian hip hop']
2                                                 ['pop']
3                                       ['pop', 'uk pop']
4                           ['lgbtq+ hip hop', 'pop rap']
                              ...                        
1551                       ['dance pop', 'pop', 'uk pop']
1552             ['sertanejo', 'sertanejo universitario']
1553    ['dance pop', 'electropop', 'pop', 'post-teen ...
1554                       ['brega funk', 'funk carioca']
1555                             ['pop', 'post-teen pop']
Name: Genre, Length: 1556, dtype: object

In [49]:
df.Release Date
# Intentional error: dot access only works when the column name is a valid
# Python identifier. "Release Date" contains a space, so the line above is a
# SyntaxError; use df["Release Date"] instead.

SyntaxError: invalid syntax (3791632194.py, line 1)

In [50]:
df["Release Date"]

0       2017-12-08
1       2021-07-09
2       2021-05-21
3       2021-06-25
4       2021-07-23
           ...    
1551    2017-06-02
1552    2019-10-11
1553    2018-01-12
1554    2019-09-25
1555    2019-11-13
Name: Release Date, Length: 1556, dtype: object

In [51]:
# Parse the date strings and extract the day of the month.
# The result is float64 rather than an integer dtype — presumably because the
# missing release dates come through as NaN (confirm).
pd.to_datetime(df["Release Date"]).dt.day

0        8.0
1        9.0
2       21.0
3       25.0
4       23.0
        ... 
1551     2.0
1552    11.0
1553    12.0
1554    25.0
1555    13.0
Name: Release Date, Length: 1556, dtype: float64

In [52]:
# Release year of every row, taken from the parsed dates.
year_series = pd.to_datetime(df["Release Date"]).dt.year

In [53]:
# Count the rows released in 2019: each True in the comparison adds 1 to the sum.
year_series.eq(2019).sum()

181

In [54]:
# Intentional error: missing release dates are float NaN, and a float cannot be
# sliced, so the lambda raises TypeError.
# NOTE(review): s[:4] is a str, so comparing it with the int 2019 would never be
# True even without the missing values; compare with "2019" or convert via int().
df["Release Date"].map(lambda s: s[:4] == 2019)

TypeError: 'float' object is not subscriptable

In [55]:
# The same error in isolation: np.nan is a float, so it does not support slicing.
np.nan[:4]

TypeError: 'float' object is not subscriptable

In [56]:
# Keep only the release dates that are present (missing entries removed).
clean = df["Release Date"].dropna()

In [57]:
type(clean)

pandas.core.series.Series

In [58]:
clean

0       2017-12-08
1       2021-07-09
2       2021-05-21
3       2021-06-25
4       2021-07-23
           ...    
1551    2017-06-02
1552    2019-10-11
1553    2018-01-12
1554    2019-09-25
1555    2019-11-13
Name: Release Date, Length: 1545, dtype: object

In [59]:
# Count the dates whose first four characters are "2019".
clean.str[:4].eq("2019").sum()

181

In [60]:
# Same count, comparing the four-character year prefix as an integer.
clean.str[:4].astype(int).eq(2019).sum()

181