# pandas Foundations

##### pandas DataFrames are the most widely used in-memory representation of complex data collections within Python

## Data Ingestion and Inspection

A pandas DataFrame is a tabular data structure with labeled columns and labeled rows. The row labels form the index: a tailored list of labels that permits fast look-up and relational operations.  

In [3]:
import pandas as pd
# Load the CSV into a DataFrame; the file's first column (position 0) becomes the row index.
world_pop_csv = "./data/world_ind_pop_data.csv"
df = pd.read_csv(world_pop_csv, index_col=0)

In [None]:
# First look at the frame, printed one item at a time with a blank line after each:
#   - the type of the object itself
#   - shape, an attribute (not a method) of the dataframe
#   - columns, the labels of every column (this attribute is an index object)
#   - the type of that columns attribute
for summary in (type(df), df.shape, df.columns, type(df.columns)):
    print(summary, "\n")

##### DataFrame slicing

In [None]:
# .iloc selects by integer position; when only a row slice is given, every column is kept.
df_slice1 = df.iloc[0:5]        # the first five rows (positions 0 - 4), all columns
print(df_slice1, "\n")
df_slice2 = df.iloc[-5:]        # the last five rows, all columns
print(df_slice2)

In [None]:
# Show the first five rows (five is also the default when n is omitted).
df.head(n=5)

In [None]:
# Show the last five rows (five is also the default when n is omitted).
df.tail(n=5)

In [4]:
# Print a summary of the frame: index type and range, the non-null count and dtype
# of every column, and the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13374 entries, Arab World to Zimbabwe
Data columns (total 4 columns):
CountryCode                      13374 non-null object
Year                             13374 non-null int64
Total Population                 13374 non-null float64
Urban population (% of total)    13374 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 522.4+ KB


#### A pandas DataFrame is a collection of columns, where each column is a pandas Series

In [15]:
population_column = "Total Population"

# Selecting with a single label returns that column as a pandas Series ...
sr_total_population = df[population_column]
print(sr_total_population, "\n")

# ... whereas selecting with a list of labels (even a one-element list) returns a pandas DataFrame.
df_total_population = df[[population_column]]

CountryName
Arab World                                 9.249590e+07
Caribbean small states                     4.190810e+06
Central Europe and the Baltics             9.140158e+07
East Asia & Pacific (all income levels)    1.042475e+09
East Asia & Pacific (developing only)      8.964930e+08
                                               ...     
Virgin Islands (U.S.)                      1.041700e+05
West Bank and Gaza                         4.294682e+06
Yemen, Rep.                                2.618368e+07
Zambia                                     1.572134e+07
Zimbabwe                                   1.524586e+07
Name: Total Population, Length: 13374, dtype: float64 



In [16]:
# .values exposes the data of the Series as a numpy array (without the row labels)
total_population_values = sr_total_population.values

In [21]:
import numpy as np
# numpy functions such as log10 are applied to every value of a pandas object
# and keep the container type: a Series in gives a Series out ...
sr_log10_total_population = np.log10(sr_total_population)
# ... and a DataFrame in gives a DataFrame out.
df_log10_total_population = np.log10(df_total_population)

In [30]:
# Report the type of each object created in the cells above.
# The objects are referenced directly instead of being looked up with eval(name):
# no string is executed as code, and a misspelled name is reported on the exact line
# where it is written.
named_objects = [
    ("sr_total_population", sr_total_population),
    ("df_total_population", df_total_population),
    ("total_population_values", total_population_values),
    ("sr_log10_total_population", sr_log10_total_population),
    ("df_log10_total_population", df_log10_total_population),
]
object_types = [name + " has type " + str(type(obj)) for name, obj in named_objects]
for ot in object_types:
    print(ot)

sr_total_population has type <class 'pandas.core.series.Series'>
df_total_population has type <class 'pandas.core.frame.DataFrame'>
total_population_values has type <class 'numpy.ndarray'>
sr_log10_total_population has type <class 'pandas.core.series.Series'>
df_log10_total_population has type <class 'pandas.core.frame.DataFrame'>


#### Constructing a pandas DataFrame from scratch

In [39]:
list_keys = ["Country", "Medals"]                                           # column labels of the dataframe we are going to create
list_countries = ["United States", "Soviet Union", "United Kingdom"]        # values for the "Country" column
list_medals = [1118, 473, 273]                                              # values for the "Medals" column
list_values = [list_countries, list_medals]                                 # one list of values per column, in the same order as list_keys

print(list_keys, "\n")
print(list_values, "\n")

# create a zip object: a lazy, single-use iterator that pairs each key with its list of values
data_zip = zip(list_keys, list_values)
print(data_zip, "\n")

# convert the zip object into a list (this consumes the iterator)
data_zip_list = list(data_zip)                # this is a list of key-value tuples
print(data_zip_list, "\n")

# convert the list of tuples into a dictionary
data_dict = dict(data_zip_list)
print(data_dict, "\n")

# create a dataframe from the dict object: each key becomes a column label and
# each list becomes that column's values.
# (The original passed the undefined name `data`, which only works if a variable
# of that name happens to be left over in the kernel.)
# NOTE: this rebinds `df`, which previously held the world population data.
df = pd.DataFrame(data_dict)
print(df)

['Country', 'Medals'] 

[['United States', 'Soviet Union', 'United Kingdom'], [1118, 473, 273]] 

<zip object at 0x00000223AC8582C8> 

[('Country', ['United States', 'Soviet Union', 'United Kingdom']), ('Medals', [1118, 473, 273])] 

{'Country': ['United States', 'Soviet Union', 'United Kingdom'], 'Medals': [1118, 473, 273]} 

          Country  Medals
0   United States    1118
1    Soviet Union     473
2  United Kingdom     273


In [40]:
# renaming the columns in a dataframe: assign a list holding one new label per
# existing column, in column order, to the .columns attribute
print(df, "\n")                                  # frame before the rename
new_column_names = ["countries", "medals"]       # variable name was misspelled "new_column_nanmes"
df.columns = new_column_names
print(df)                                        # frame after the rename

          Country  Medals
0   United States    1118
1    Soviet Union     473
2  United Kingdom     273 

        countries  medals
0   United States    1118
1    Soviet Union     473
2  United Kingdom     273


In [41]:
# broadcasting: assigning a single scalar to a new column label adds that column
# and fills every row with the same value
year_value = 2018
df["year"] = year_value
print(df)

        countries  medals  year
0   United States    1118  2018
1    Soviet Union     473  2018
2  United Kingdom     273  2018


In [45]:
file_messy = "./data/messy_stock_data.tsv"
file_clean_csv = "./data/file_clean.csv"
file_clean_excel = "./data/file_clean.xlsx"

# First attempt: read the file with the default settings to see how it comes out.
df1 = pd.read_csv(file_messy)
print(df1.head())

# Second attempt: tell the parser how the file is actually laid out.
df2 = pd.read_csv(
    file_messy,
    sep=" ",          # fields are separated by single spaces
    header=3,         # the column labels are on line 3 (counting from 0)
    comment="#",      # everything from a "#" to the end of the line is ignored
)
print(df2.head())

# Save the cleaned frame, leaving out the row index, as CSV and as an Excel workbook.
df2.to_csv(file_clean_csv, index=False)
df2.to_excel(file_clean_excel, index=False)

                                                   The following stock data was collect on 2016-AUG-25 from an unknown source
These kind of ocmments are not very useful                                                  are they?                        
probably should just throw this line away too          but not the next since those are column labels                        
name Jan Feb Mar Apr May Jun Jul Aug Sep Oct No...                                                NaN                        
# So that line you just read has all the column...                                                NaN                        
IBM 156.08 160.01 159.81 165.22 172.25 167.15 1...                                                NaN                        
     name     Jan     Feb     Mar     Apr     May     Jun     Jul     Aug  \
0     IBM  156.08  160.01  159.81  165.22  172.25  167.15  164.75  152.77   
1    MSFT   45.51   43.08   42.13   43.47   47.53   45.96   45.61   45.51   
2  GOOGLE  51

## Time Series in pandas

In [46]:
temperature_list = [50.2, 50.3, 49.8]
date_list = ["20100112 06:00", "20100112 07:00", "20100112 08:00"]

# The strings are written as YYYYMMDD HH:MM with no dashes in the date, so the
# format must not contain dashes either. The previous "%Y-%m-%d %H:%M" does not
# describe these strings; recent pandas versions match the format strictly and
# raise a ValueError for it.
time_format = "%Y%m%d %H:%M"
my_datetimes = pd.to_datetime(date_list, format=time_format)

# Use the parsed datetimes as the index, giving a time series of temperatures.
time_series = pd.Series(temperature_list, index=my_datetimes)

print(time_series)

2010-01-12 06:00:00    50.2
2010-01-12 07:00:00    50.3
2010-01-12 08:00:00    49.8
dtype: float64


In [47]:
import pandas as pd
stock_file = "./data/tech_stocks_monthly_data.csv"
stock_file_column_names = ["ticker", "date", "close_price", "market_cap", "volume"]

# Read the stock data, replacing the file's own header row with the labels above
# and using the parsed "date" column as the row index.
stocks = pd.read_csv(
    stock_file,
    header=0,
    names=stock_file_column_names,
    index_col="date",
    parse_dates=True,
)

In [49]:
# the stocks dataframe has a DatetimeIndex: the "date" column was parsed as dates
# and used as the index when the file was read
stocks.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 18270 entries, 1990-12-31 to 2019-11-29
Data columns (total 4 columns):
ticker         18270 non-null object
close_price    17878 non-null float64
market_cap     18270 non-null object
volume         18270 non-null object
dtypes: float64(1), object(3)
memory usage: 713.7+ KB


In [53]:
# slice on a DatetimeIndex
stocks_1992 = stocks["1992"]
print(stocks_1992)

stocks_1998_first_half = stocks["1998-01":"1998-06"]
print(stocks_1998_first_half)

stocks_1998_first_half2 = stocks.loc["1998-01":"1998-06"]
print(stocks_1998_first_half2)

           ticker  close_price  market_cap        volume
date                                                    
1992-01-31   MXIM       1.0439      351.3    49,438,400 
1992-02-28   MXIM       1.0866      373.5    16,766,400 
1992-03-31   MXIM       0.8351      287.1    38,395,200 
1992-04-30   MXIM       1.0249      352.3    32,465,600 
1992-05-29   MXIM       0.9015      315.8    25,286,400 
...           ...          ...         ...           ...
1992-08-31    MSI      18.7677   23,801.0    15,447,767 
1992-09-30    MSI      19.3319   24,463.1    16,894,109 
1992-10-30    MSI      21.1494   26,763.0    24,579,593 
1992-11-30    MSI      22.6640   28,679.6    17,979,766 
1992-12-31    MSI      23.0703   29,132.7    15,764,052 

[392 rows x 4 columns]
           ticker  close_price market_cap         volume
date                                                    
1998-01-30   MXIM      10.5146   4,501.7     69,706,200 
1998-02-27   MXIM      12.2608   5,286.9     53,564,400 
1998-03

In [79]:
# Both files are read the same way: no header row, with column 0 parsed as
# dates and used as the index.
ts_read_options = dict(header=None, parse_dates=True, index_col=0)
ts1 = pd.read_csv("./data/ts1.csv", **ts_read_options)
ts2 = pd.read_csv("./data/ts2.csv", **ts_read_options)

In [80]:
# Print both series one after the other to compare their dates:
# ts2 has no rows for some of the dates present in ts1.
print(ts1, ts2, sep="\n")

             1
0             
2016-07-01   0
2016-07-02   1
2016-07-03   2
2016-07-04   3
2016-07-05   4
2016-07-06   5
2016-07-07   6
2016-07-08   7
2016-07-09   8
2016-07-10   9
2016-07-11  10
2016-07-12  11
2016-07-13  12
2016-07-14  13
2016-07-15  14
2016-07-16  15
2016-07-17  16
             1
0             
2016-07-01   0
2016-07-04   1
2016-07-05   2
2016-07-06   3
2016-07-07   4
2016-07-08   5
2016-07-11   6
2016-07-12   7
2016-07-13   8
2016-07-14   9
2016-07-15  10


In [88]:
# Re-express ts2 on the dates of ts1, filling the dates ts2 does not have in three ways.
target_index = ts1.index
ts3 = ts2.reindex(target_index)                      # no fill: missing dates become NaN
ts4 = ts2.reindex(target_index, method="ffill")      # forward fill: carry the last earlier value forward
ts5 = ts2.reindex(target_index, method="bfill")      # backward fill: take the next later value

In [89]:
# Show the three reindexed frames (no fill, forward fill, backward fill),
# each followed by a blank line.
for reindexed in (ts3, ts4, ts5):
    print(reindexed, "\n")

               1
0               
2016-07-01   0.0
2016-07-02   NaN
2016-07-03   NaN
2016-07-04   1.0
2016-07-05   2.0
2016-07-06   3.0
2016-07-07   4.0
2016-07-08   5.0
2016-07-09   NaN
2016-07-10   NaN
2016-07-11   6.0
2016-07-12   7.0
2016-07-13   8.0
2016-07-14   9.0
2016-07-15  10.0
2016-07-16   NaN
2016-07-17   NaN 

             1
0             
2016-07-01   0
2016-07-02   0
2016-07-03   0
2016-07-04   1
2016-07-05   2
2016-07-06   3
2016-07-07   4
2016-07-08   5
2016-07-09   5
2016-07-10   5
2016-07-11   6
2016-07-12   7
2016-07-13   8
2016-07-14   9
2016-07-15  10
2016-07-16  10
2016-07-17  10 

               1
0               
2016-07-01   0.0
2016-07-02   1.0
2016-07-03   1.0
2016-07-04   1.0
2016-07-05   2.0
2016-07-06   3.0
2016-07-07   4.0
2016-07-08   5.0
2016-07-09   6.0
2016-07-10   6.0
2016-07-11   6.0
2016-07-12   7.0
2016-07-13   8.0
2016-07-14   9.0
2016-07-15  10.0
2016-07-16   NaN
2016-07-17   NaN 



In [90]:

# Add the two series as they are. pandas aligns them on the index, so dates that
# exist only in ts1 have no partner value and the sum there is NaN.
# (The assignment was missing from this cell, so it only ran if sum12 was left
# over in the kernel.)
sum12 = ts1 + ts2
print(sum12)

               1
0               
2016-07-01   0.0
2016-07-02   NaN
2016-07-03   NaN
2016-07-04   4.0
2016-07-05   6.0
2016-07-06   8.0
2016-07-07  10.0
2016-07-08  12.0
2016-07-09   NaN
2016-07-10   NaN
2016-07-11  16.0
2016-07-12  18.0
2016-07-13  20.0
2016-07-14  22.0
2016-07-15  24.0
2016-07-16   NaN
2016-07-17   NaN


In [91]:
# Element-wise sum of ts1 and the unfilled reindex of ts2:
# wherever ts3 holds NaN the sum is NaN as well.
sum13 = ts1.add(ts3)
print(sum13)

               1
0               
2016-07-01   0.0
2016-07-02   NaN
2016-07-03   NaN
2016-07-04   4.0
2016-07-05   6.0
2016-07-06   8.0
2016-07-07  10.0
2016-07-08  12.0
2016-07-09   NaN
2016-07-10   NaN
2016-07-11  16.0
2016-07-12  18.0
2016-07-13  20.0
2016-07-14  22.0
2016-07-15  24.0
2016-07-16   NaN
2016-07-17   NaN


In [93]:
# Element-wise sum of ts1 and the forward-filled reindex of ts2:
# ts4 has a value for every date, so the sum has no gaps.
sum14 = ts1.add(ts4)
print(sum14)

             1
0             
2016-07-01   0
2016-07-02   1
2016-07-03   2
2016-07-04   4
2016-07-05   6
2016-07-06   8
2016-07-07  10
2016-07-08  12
2016-07-09  13
2016-07-10  14
2016-07-11  16
2016-07-12  18
2016-07-13  20
2016-07-14  22
2016-07-15  24
2016-07-16  25
2016-07-17  26
