In [16]:
import pandas as pd
from datetime import date, timedelta
# An Index and a ndarray assembles a Series

# A couple of Series that sharing the same Index can then form a DataFrame

In [2]:
# An Index is like a combination of tuple and set. So 'immutable'

prime_indices = pd.Index([2, 3, 5, 7, 11, 13, 17, 19, 23, 29])

# Index has the characteristics of a set
odd_indices = pd.Index(range(1, 30, 2))
print(prime_indices.intersection(odd_indices))         # prime_indices & odd_indices
print(prime_indices.union(odd_indices))                # prime_indices | odd_indices
print(prime_indices.symmetric_difference(odd_indices)) # prime_indices ^ odd_indices
print(prime_indices.difference(odd_indices))
print(odd_indices.difference(prime_indices))

In [3]:
prime_series = pd.Series([2, 3, 5, 7, 11, 13, 17, 19, 23, 29])
print(type(prime_series.index))
print(type(prime_series.values))

<class 'pandas.core.indexes.range.RangeIndex'>
<class 'numpy.ndarray'>


In [5]:
movie_df = pd.DataFrame()
movie_df["title"] = ["The Shawshank Redemption", "The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
movie_df["imdb_rating"] = [9.3, 9.0, 8.9, 8.8, 8.7]

print(type(movie_df.index))
print(type(movie_df["title"]))

<class 'pandas.core.indexes.range.RangeIndex'>
<class 'pandas.core.series.Series'>


In [19]:
def get_latest_daily_report():    
    today = date.today()
    day_delta = timedelta(days=1)
    data_date = today
    while True:
        data_date_str = date.strftime(data_date, '%m-%d-%Y')
        print("Try importing {} data...".format(data_date_str))
        daily_report_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/{}.csv".format(data_date_str)
        try:
            daily_report = pd.read_csv(daily_report_url)
            print("Successfully imported {} data!".format(data_date_str))
            break
        except HTTPError:
            data_date -= day_delta
    return daily_report

daily_report = get_latest_daily_report()

Try importing 05-07-2021 data...


NameError: name 'HTTPError' is not defined

另外，pandas支援import各種檔案格式，包含excel, sql, json, html...

In [18]:
request_url = "https://www.imdb.com/chart/top"
html_tables = pd.read_html(request_url)
print(type(html_tables))

html_tables[0]

<class 'list'>


Unnamed: 0.1,Unnamed: 0,Rank & Title,IMDb Rating,Your Rating,Unnamed: 4
0,,1. 刺激1995 (1994),9.2,12345678910 NOT YET RELEASED Seen,
1,,2. 教父 (1972),9.1,12345678910 NOT YET RELEASED Seen,
2,,3. 教父第二集 (1974),9.0,12345678910 NOT YET RELEASED Seen,
3,,4. 黑暗騎士 (2008),9.0,12345678910 NOT YET RELEASED Seen,
4,,5. 十二怒漢 (1957),8.9,12345678910 NOT YET RELEASED Seen,
...,...,...,...,...,...
245,,246. 誤殺瞞天記 (2015),8.0,12345678910 NOT YET RELEASED Seen,
246,,247. 阿爾及爾之戰 (1966),8.0,12345678910 NOT YET RELEASED Seen,
247,,248. Ratsasan (2018),8.0,12345678910 NOT YET RELEASED Seen,
248,,249. 故土 (2004),8.0,12345678910 NOT YET RELEASED Seen,


In [None]:
print(daily_report.shape)
print(daily_report.dtypes)
print(daily_report.index)
print(daily_report.columns)

daily_report.head()
# tail(), describe(), info(), set_index('list of keys to replace current index')
# reset_index(), 

In [None]:
print(daily_report['Country_Region']) # Selecting a column as Series

daily_report[['Country_Region']] # Selecting a column as df !!

cols = ['Country_Region', 'Province_State']
daily_report[cols] ## df with 2 var

is_taiwan = daily_report['Country_Region'] == 'Taiwan*'
daily_report[is_taiwan]

# Subsetting columns and rows simultaneously
cols_to_select = ['Country_Region', 'Confirmed']
rows_to_filter = daily_report['Country_Region'] == 'Taiwan*'
daily_report[rows_to_filter][cols_to_select]

In [None]:
# loc[] is indexing DataFrame with Index

taiwan_row_index = daily_report[is_taiwan].index[0]
print(daily_report.loc[taiwan_row_index, ['Country_Region', 'Confirmed']]) # as Series
daily_report.loc[[taiwan_row_index], ['Country_Region', 'Confirmed']] # as DataFrame

# iloc[] is indexing DataFrame with absolute position

print(daily_report.iloc[taiwan_row_index, [3, 7]]) # as Series
daily_report.iloc[[taiwan_row_index], [3, 7]] # as DataFrame

daily_report.sort_values(['Country_Region', 'Confirmed'])

daily_report.sort_index(ascending=False)

In [None]:
# Deriving categorical from numerical with pd.cut
import numpy as np

cut_bins = [0, 1000, 10000, 100000, np.Inf]
cut_labels = ['Less than 1000', 'Between 1000 and 10000', 'Between 10000 and 100000', 'Above 100000']
confirmed_categorical = pd.cut(daily_report['Confirmed'], bins=cut_bins, labels=cut_labels, right=False)
print(confirmed_categorical)

# Deriving categorical from categorical with map
country_name = {
    'Taiwan*': 'Taiwan'
}
daily_report_tw = daily_report[is_taiwan]
daily_report_tw['Country_Region'].map(country_name)
# 法二: Passing a function (recommended)
def is_us(x):
    if x == 'US':
        return 'US'
    else:
        return 'Not US'
daily_report['Country_Region'].map(is_us)
# Passing a lambda expression)
daily_report['Country_Region'].map(lambda x: 'US' if x == 'US' else 'Not US')

In [None]:
daily_report['Confirmed'].sum()
daily_report.groupby('Country_Region')['Confirmed'].sum() # !!

In [None]:
print(daily_report['Province_State'].size)
print(daily_report['Province_State'].isnull().sum())
print(daily_report['Province_State'].notnull().sum())

print(daily_report.dropna().shape)
print(daily_report['FIPS'].fillna(0))

print(daily_report['Country_Region'].nunique())
print(daily_report['Country_Region'].unique())

In [None]:
daily_report['Country_Region'].value_counts().sort_values(ascending=False) # !!

In [None]:
# Splitting strings with str.split as a Series

html_table['Rank & Title'].str.split()
split_dataframe = html_table['Rank & Title'].str.split(expand=True)
ranks = split_dataframe[0].str.replace(".", '') # '0' means the first column

years = split_dataframe[2].str.replace("\(|\)", '') # delete parentheses

print(daily_report['Country_Region'].str.contains('land').sum())
daily_report[daily_report['Country_Region'].str.contains('land')]

In [21]:
ts_confirmed_global_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
ts_confirmed_global = pd.read_csv(ts_confirmed_global_url)
ts_confirmed_global

idVars = ['Province/State', 'Country/Region', 'Lat', 'Long']
ts_confirmed_global_long = pd.melt(ts_confirmed_global,
                                   id_vars=idVars,
                                   var_name='Date',
                                   value_name='Confirmed')
ts_confirmed_global_long

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed
0,,Afghanistan,33.939110,67.709953,1/22/20,0
1,,Albania,41.153300,20.168300,1/22/20,0
2,,Algeria,28.033900,1.659600,1/22/20,0
3,,Andorra,42.506300,1.521800,1/22/20,0
4,,Angola,-11.202700,17.873900,1/22/20,0
...,...,...,...,...,...,...
129245,,Vietnam,14.058324,108.277199,5/5/21,3030
129246,,West Bank and Gaza,31.952200,35.233200,5/5/21,299736
129247,,Yemen,15.552727,48.516388,5/5/21,6414
129248,,Zambia,-13.133897,27.849332,5/5/21,91849


In [None]:
def get_top_rated_movies():
    request_url = "https://www.imdb.com/chart/top"
    html_tables = pd.read_html(request_url)
    html_table = html_tables[0]
    split_dataframe = html_table['Rank & Title'].str.split(expand=True)
    ranks = split_dataframe[0].str.replace(".", '').values
    years = split_dataframe[2].str.replace("\(|\)", '').values
    titles = split_dataframe[1].values
    ratings = html_table['IMDb Rating'].values
    out = pd.DataFrame()
    out['rank'] = ranks
    out['title'] = titles
    out['year'] = years
    out['rating'] = ratings
    return out

top_rated_movies = get_top_rated_movies()

In [None]:
left_df = top_rated_movies.loc[:9, ['title', 'year']]
right_df = top_rated_movies[top_rated_movies['title'].str.contains('魔戒')][['title', 'rating']].reset_index(drop=True)
left_df

# default: inner join
pd.merge(left_df, right_df)
pd.merge(left_df, right_df, how='left') # or right join

In [None]:
# join dataframes on index 這邊開始混沌 = =
left_df.join(right_df, lsuffix='_x', rsuffix='_y')

left_df = left_df.set_index('title')
right_df = right_df.set_index('title')
# default: left join
left_df.join(right_df)

left_df.join(right_df, how='inner')