In [77]:
import pandas as pd
import numpy as np
from datetime import datetime

### read from a CSV file

In [7]:
# Load the CSV into a DataFrame and preview it.
# NOTE: the path uses a Windows-style "\\" separator.
file_name_string = "file_path\\wc-20140609-140000.csv"
world_cup_prediction = pd.read_csv(filepath_or_buffer=file_name_string)
world_cup_prediction.head(n=10)  # preview: first 10 rows only

Unnamed: 0,country,country_id,group,spi,spi_offense,spi_defense,win_group,sixteen,quarter,semi,cup,win
0,Algeria,ALG,h,63.43,1.1208,1.1636,0.0631,0.2032,0.038517,0.007996,0.001021,0.000126
1,Argentina,ARG,f,90.0,2.8541,0.4494,0.735,0.9279,0.669904,0.468159,0.281758,0.127799
2,Australia,AUS,b,69.45,1.6395,1.2349,0.0151,0.0762,0.009646,0.002943,0.000671,9.3e-05
3,Belgium,BEL,h,81.97,2.1373,0.741,0.4781,0.7688,0.351536,0.148459,0.054136,0.014904
4,Bosnia and Herzegovina,BIH,f,80.31,2.3113,0.9861,0.1599,0.5589,0.26195,0.112098,0.031611,0.008964
5,Brazil,BRA,a,91.83,3.3686,0.5175,0.95,0.994,0.804479,0.69759,0.565823,0.453437
6,Chile,CHI,b,86.72,2.7113,0.7043,0.3392,0.6951,0.324846,0.205733,0.110337,0.041899
7,Ivory Coast,CIV,c,78.9,2.2467,1.048,0.2317,0.5287,0.243748,0.067887,0.025096,0.006044
8,Cameroon,CMR,a,71.33,1.4798,0.9904,0.0079,0.2272,0.050187,0.016983,0.00438,0.000585
9,Colombia,COL,c,85.82,2.1813,0.4528,0.5156,0.7794,0.457763,0.149779,0.073364,0.02803


### Grouping
Grouping ("group by") refers to a process involving one or more of the following steps:
- Splitting the data into groups based on some criteria
- Applying a function to each group independently
- Combining the results into a data structure

### group by country

In [10]:
# Sum the numeric columns per country. numeric_only=True keeps the text
# columns (country_id, group) out of the aggregation; without it, newer
# pandas versions would try to "sum" (concatenate) those strings as well.
world_cup_prediction.groupby('country').sum(numeric_only=True).head(10)

Unnamed: 0_level_0,spi,spi_offense,spi_defense,win_group,sixteen,quarter,semi,cup,win
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Algeria,63.43,1.1208,1.1636,0.0631,0.2032,0.038517,0.007996,0.001021,0.000126
Argentina,90.0,2.8541,0.4494,0.735,0.9279,0.669904,0.468159,0.281758,0.127799
Australia,69.45,1.6395,1.2349,0.0151,0.0762,0.009646,0.002943,0.000671,9.3e-05
Belgium,81.97,2.1373,0.741,0.4781,0.7688,0.351536,0.148459,0.054136,0.014904
Bosnia and Herzegovina,80.31,2.3113,0.9861,0.1599,0.5589,0.26195,0.112098,0.031611,0.008964
Brazil,91.83,3.3686,0.5175,0.95,0.994,0.804479,0.69759,0.565823,0.453437
Cameroon,71.33,1.4798,0.9904,0.0079,0.2272,0.050187,0.016983,0.00438,0.000585
Chile,86.72,2.7113,0.7043,0.3392,0.6951,0.324846,0.205733,0.110337,0.041899
Colombia,85.82,2.1813,0.4528,0.5156,0.7794,0.457763,0.149779,0.073364,0.02803
Costa Rica,74.07,1.3227,0.6993,0.0945,0.2622,0.10005,0.018858,0.005468,0.001069


### Categorical Data

Categoricals are a pandas data type, which correspond to categorical variables in statistics: a variable, which can take
on only a limited, and usually fixed, number of possible values (categories; levels in R). Examples are gender, social
class, blood types, country affiliations, observation time or ratings via Likert scales.

In contrast to statistical categorical variables, categorical data might have an order (e.g. ‘strongly agree’ vs ‘agree’ or
‘first observation’ vs. ‘second observation’), but numerical operations (additions, divisions, ...) are not possible.

All values of categorical data are either in categories or np.nan. Order is defined by the order of categories, not lexical
order of the values.

documentation: http://pandas.pydata.org/pandas-docs/stable/categorical.html

In [12]:
world_cup_prediction['country'] # categorical in the statistical sense, but stored with dtype object (not the pandas 'category' dtype)

0                    Algeria
1                  Argentina
2                  Australia
3                    Belgium
4     Bosnia and Herzegovina
5                     Brazil
6                      Chile
7                Ivory Coast
8                   Cameroon
9                   Colombia
10                Costa Rica
11                   Croatia
12                   Ecuador
13                   England
14                     Spain
15                    France
16                   Germany
17                     Ghana
18                    Greece
19                  Honduras
20                      Iran
21                     Italy
22                     Japan
23               South Korea
24                    Mexico
25               Netherlands
26                   Nigeria
27                  Portugal
28                    Russia
29               Switzerland
30                   Uruguay
31                       USA
Name: country, dtype: object

In [16]:
#world_cup_prediction.groupby('country').count()

### Resampling
documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.resample.html

For arguments to 'freq' parameter, please see [Offset Aliases](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases)

Resample time-series data.

Convenience method for frequency conversion and resampling of time series. Object must have a datetime-like index (DatetimeIndex, PeriodIndex, or TimedeltaIndex), or pass datetime-like values to the on or level keyword.

In [18]:
# 'min' is the offset alias for a one-minute frequency
my_index = pd.date_range(start='9/1/2016', periods=9, freq='min')

In [19]:
my_index

DatetimeIndex(['2016-09-01 00:00:00', '2016-09-01 00:01:00',
               '2016-09-01 00:02:00', '2016-09-01 00:03:00',
               '2016-09-01 00:04:00', '2016-09-01 00:05:00',
               '2016-09-01 00:06:00', '2016-09-01 00:07:00',
               '2016-09-01 00:08:00'],
              dtype='datetime64[ns]', freq='T')

create a time series that includes a simple pattern

In [20]:
# Values 0..8, one per timestamp in my_index
my_series = pd.Series(data=np.arange(9), index=my_index)

In [21]:
my_series

2016-09-01 00:00:00    0
2016-09-01 00:01:00    1
2016-09-01 00:02:00    2
2016-09-01 00:03:00    3
2016-09-01 00:04:00    4
2016-09-01 00:05:00    5
2016-09-01 00:06:00    6
2016-09-01 00:07:00    7
2016-09-01 00:08:00    8
Freq: T, dtype: int32

Downsample the series into 3 minute bins and sum the values of the timestamps falling into a bin

In [35]:
my_series.resample('3min').sum()

# The series is divided into 3-minute intervals, so:
# 0 + 1 + 2 = 3
# 3 + 4 + 5 = 12
# 6 + 7 + 8 = 21

2016-09-01 00:00:00     3
2016-09-01 00:03:00    12
2016-09-01 00:06:00    21
Freq: 3T, dtype: int32

Downsample the series into 3 minute bins as above, but label each bin using the right edge instead of the left

Notice the difference in the time indices; the sum in each bin is the same

In [36]:
my_series.resample('3min', label='right').sum()

2016-09-01 00:03:00     3
2016-09-01 00:06:00    12
2016-09-01 00:09:00    21
Freq: 3T, dtype: int32

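Downsample the series into 3 minute bins as above, but close the right side of the bin interval

With `closed='right'` each bin includes its right edge and excludes its left edge, so the bins are effectively "counted backwards" from each label: the bin labelled 00:03 covers the interval (00:00, 00:03] and sums 1 + 2 + 3 = 6, while the value at 00:00 falls into a bin of its own.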

In [37]:
my_series.resample('3min', label='right', closed='right').sum()

2016-09-01 00:00:00     0
2016-09-01 00:03:00     6
2016-09-01 00:06:00    15
2016-09-01 00:09:00    15
Freq: 3T, dtype: int32

Upsample the series into 30 second bins

[asfreq()](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.asfreq.html)

In [38]:
# Upsample to 30-second bins and show the first 5 rows. The newly created
# half-minute slots have no source value, so asfreq() leaves them as NaN.
# The lowercase 's' alias is used because the uppercase 'S' alias is
# deprecated in pandas >= 2.2.
my_series.resample('30s').asfreq().head(5)

2016-09-01 00:00:00    0.0
2016-09-01 00:00:30    NaN
2016-09-01 00:01:00    1.0
2016-09-01 00:01:30    NaN
2016-09-01 00:02:00    2.0
Freq: 30S, dtype: float64

##### define a custom function to use with resampling

In [39]:
def custom_arithmetic(array_like):
    """Return three times the sum of ``array_like``, plus five.

    ``array_like`` is anything ``np.sum`` accepts (list, ndarray, Series, ...).
    """
    total = np.sum(array_like)
    return 3 * total + 5

##### apply custom resampling function

In [41]:
my_series.resample('3min').apply(custom_arithmetic)

2016-09-01 00:00:00    14
2016-09-01 00:03:00    41
2016-09-01 00:06:00    68
Freq: 3T, dtype: int32

### Series
Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers,
Python objects, etc.). The axis labels are collectively referred to as the index.

documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html

##### Create series from NumPy array
number of labels in 'index' must be the same as the number of elements in array

In [43]:
# Eight random values, one per label; the label count must match the data length
my_simple_series = pd.Series(
    data=np.random.randn(8),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
)
my_simple_series

a   -0.091488
b    0.293198
c   -0.868758
d    0.526521
e   -1.440553
f    0.959056
g    0.805063
h   -0.614099
dtype: float64

In [44]:
my_simple_series.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'], dtype='object')

##### Create series from NumPy array, without explicit index

In [45]:
# No index is supplied, so the default integer labels 0..4 are used
my_simple_series = pd.Series(data=np.random.randn(5))
my_simple_series

0   -0.390853
1   -1.103981
2    0.604553
3   -1.673487
4    1.450193
dtype: float64

Access a series like a NumPy array

In [46]:
my_simple_series[:3]

0   -0.390853
1   -1.103981
2    0.604553
dtype: float64

##### Create series from Python dictionary

In [48]:
# Dictionary keys become the index labels; dictionary values become the data
my_dictionary = {
    'a': 45.,
    'b': -19.5,
    'c': 4444,
}
my_second_series = pd.Series(data=my_dictionary)
my_second_series

a      45.0
b     -19.5
c    4444.0
dtype: float64

Access a series like a dictionary

In [49]:
my_second_series['b']

-19.5

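Note the order in the display: it is the same as the order of the labels passed to "index".

Note the NaN: label 'd' has no entry in the dictionary, so its value is missing.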

In [50]:
pd.Series(my_dictionary, index=['b', 'c', 'd', 'a'])

b     -19.5
c    4444.0
d       NaN
a      45.0
dtype: float64

In [51]:
my_second_series.get('a')

45.0

In [52]:
# 'f' is not a label of my_second_series; .get() returns None for it
unknown = my_second_series.get('f')
type(unknown)

NoneType

##### Create series from scalar
If data is a scalar value, an index must be provided. The value will be repeated to match the length of index

In [54]:
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

<h2> Vectorized operations </h2>

In [56]:
# Rebuild a small labelled Series to demonstrate vectorized operations
my_dictionary = {
    'a': 45.,
    'b': -19.5,
    'c': 4444,
}
my_series = pd.Series(data=my_dictionary)
my_series

a      45.0
b     -19.5
c    4444.0
dtype: float64

###### add Series without loop

In [57]:
my_series + my_series

a      90.0
b     -39.0
c    8888.0
dtype: float64

In [58]:
my_series

a      45.0
b     -19.5
c    4444.0
dtype: float64

##### Series within arithmetic expression

In [59]:
my_series + 5

a      50.0
b     -14.5
c    4449.0
dtype: float64

##### Series used as argument to NumPy function

In [61]:
np.exp(my_series)

a    3.493427e+19
b    3.398268e-09
c             inf
dtype: float64

A key difference between Series and ndarray is that operations between Series automatically align the data based on
label. Thus, you can write computations without giving consideration to whether the Series involved have the same labels.

In [63]:
my_series[1:]

b     -19.5
c    4444.0
dtype: float64

In [64]:
my_series[:-1]

a    45.0
b   -19.5
dtype: float64

In [65]:
my_series[1:] + my_series[:-1]

a     NaN
b   -39.0
c     NaN
dtype: float64

### Apply Python functions on an element-by-element basis

In [71]:
def multiply_by_ten(input_element):
    """Return ``input_element`` multiplied by the float factor 10.0."""
    factor = 10.0
    return input_element * factor

In [72]:
my_series.map(multiply_by_ten)

a      450.0
b     -195.0
c    44440.0
dtype: float64

### Vectorized string methods
Series is equipped with a set of string processing methods that make it easy to operate on each element of the array. Perhaps most importantly, these methods exclude missing/NA values automatically. 

In [75]:
series_of_strings = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

In [76]:
series_of_strings.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

### Date Arithmetic
documentation: http://pandas.pydata.org/pandas-docs/stable/timeseries.html#timeseries-offsets

| Type      |   | Description                                                       |
|-----------|---|-------------------------------------------------------------------|
| date      |   | Store calendar date (year, month, day) using a Gregorian Calendar |
| datetime  |   | Store both date and time                                          |
| timedelta |   | Difference between two datetime values                            |

##### common date arithmetic operations
- calculate differences between dates
- generate sequences of dates and time spans
- convert time series to a particular frequency

### Date, time, functions
documentation: http://pandas.pydata.org/pandas-docs/stable/api.html#top-level-dealing-with-datetimelike

| to_datetime(*args, **kwargs)                      | Convert argument to datetime.                                               |   |
|---------------------------------------------------|-----------------------------------------------------------------------------|---|
| to_timedelta(*args, **kwargs)                     | Convert argument to timedelta                                               |   |
| date_range([start, end, periods, freq, tz, ...])  | Return a fixed frequency datetime index, with day (calendar) as the default |   |
| bdate_range([start, end, periods, freq, tz, ...]) | Return a fixed frequency datetime index, with business day as the default   |   |
| period_range([start, end, periods, freq, name])   | Return a fixed frequency PeriodIndex, with day (calendar) as the default    |   |
| timedelta_range([start, end, periods, freq, ...]) | Return a fixed frequency timedelta index, with day as the default           |   |
| infer_freq(index[, warn])                         | Infer the most likely frequency given the input index.                      |   |

In [79]:
now = datetime.now()
now

datetime.datetime(2019, 2, 15, 11, 24, 0, 109573)

In [80]:
now.year, now.month, now.day

(2019, 2, 15)

##### delta
source: http://pandas.pydata.org/pandas-docs/stable/timedeltas.html

In [81]:
delta = now - datetime(2001, 1, 1)
delta

datetime.timedelta(6619, 41040, 109573)

In [82]:
delta.days

6619

### Parsing Timedelta
##### from string

In [85]:
pd.Timedelta('4 days 10.15 hours')

Timedelta('4 days 10:09:00')

##### named keyword arguments

In [87]:
# note: these MUST be specified as keyword arguments
pd.Timedelta(days=1, seconds=1)

Timedelta('1 days 00:00:01')

##### integers with a unit

In [88]:
pd.Timedelta(1, unit='d')

Timedelta('1 days 00:00:00')

##### create a range of dates from Timedelta

In [91]:
# Two calendar dates; subtracting one datetime from another yields a timedelta
Indian_independence_day = datetime(2019, 8, 15)
Indian_republic_day = datetime(2019, 1, 26)
summer_time = Indian_independence_day - Indian_republic_day
print(Indian_independence_day, Indian_republic_day, summer_time, sep='\n')
type(summer_time)

2019-08-15 00:00:00
2019-01-26 00:00:00
201 days, 0:00:00


datetime.timedelta

In [92]:
# One daily timestamp per day of the span, starting on Indian_republic_day
# (the range stops the day before Indian_independence_day)
indian_summer_time_range = pd.date_range(
    start=Indian_republic_day,
    periods=summer_time.days,
    freq='D',
)

In [93]:
indian_summer_time_range

DatetimeIndex(['2019-01-26', '2019-01-27', '2019-01-28', '2019-01-29',
               '2019-01-30', '2019-01-31', '2019-02-01', '2019-02-02',
               '2019-02-03', '2019-02-04',
               ...
               '2019-08-05', '2019-08-06', '2019-08-07', '2019-08-08',
               '2019-08-09', '2019-08-10', '2019-08-11', '2019-08-12',
               '2019-08-13', '2019-08-14'],
              dtype='datetime64[ns]', length=201, freq='D')

##### summer_time time series with random data

In [97]:
# One random value per date in the range; no seed is set, so values differ on every run
indian_summer_time_series = pd.Series(
    data=np.random.rand(summer_time.days),
    index=indian_summer_time_range,
)
indian_summer_time_series.tail()  # last 5 rows

2019-08-10    0.550405
2019-08-11    0.736088
2019-08-12    0.095968
2019-08-13    0.015928
2019-08-14    0.028460
Freq: D, dtype: float64