In [0]:
import pandas as pd
import numpy as np

# Assignment 1: Series Basics

The code has been provided to create an array, `oil_array`, from a dataframe column.

* Convert `oil_array` into a Pandas Series, called `oil_series`. Give it a name!
* Return the name, dtype, size, and index of `oil_series`.

Take the mean of the values array. 

Then, convert the series to integer datatype and recalculate the mean. 


In [0]:
# Load the oil price file and discard any rows that contain missing values
oil = pd.read_csv("../retail/oil.csv").dropna()

# Take positions 1000-1099 of the price column and copy them into a NumPy array
price_window = oil["dcoilwtico"].iloc[1000:1100]
oil_array = price_window.to_numpy(copy=True)

oil_array

In [0]:
# Wrap the NumPy array in a named Series (default integer index)
oil_series = pd.Series(data=oil_array, name="oil_prices")

oil_series

In [0]:
# Report the requested attributes, one "<label>: <value>" line per attribute
for label, attr in (
    ("Name", "name"),
    ("dtype", "dtype"),
    ("size", "size"),
    ("index", "index"),
):
    print(f"{label}: {getattr(oil_series, attr)}")

In [0]:
# Mean computed on the underlying NumPy array rather than on the Series
oil_series.to_numpy().mean()

In [0]:
# Inspect the index labels currently attached to oil_series
oil_series.index

In [0]:
# Check the dtype of the oil_series index
oil_series.index.dtype

In [0]:
# Cast prices to integers (dropping the decimal portion), then average the array
int_prices = oil_series.astype("int")
int_prices.to_numpy().mean()

# Assignment 2:  Accessing Series Data

* Set the date series, which has been created below, to be the index of the oil price series created in assignment 1.


* Then, take the mean of the first 10 and last 10 prices of the series.


* Finally, grab all oil prices from January 1st, 2017 - January 7th, 2017 (inclusive) and set the index to the default integer index.

In [0]:
# Same 100 positions as the price slice, taken from the date column
# (oil["date"] is already a Series, so no extra wrapping is needed)
dates = oil["date"].iloc[1000:1100]

In [0]:
# Use the date strings as the index labels of the price series
oil_series = oil_series.set_axis(dates)

oil_series

In [0]:
# First 10 prices by position
oil_series.head(10)

In [0]:
# Average of the 10 earliest rows in the series

oil_series.head(10).mean()

In [0]:
# Average of the 10 final rows in the series

oil_series.tail(10).mean()

In [0]:
# Label-based slice with loc (both endpoints are included)
first_week = oil_series.loc["2017-01-01":"2017-01-07"]

# Discard the date labels so the result carries the default integer index
first_week.reset_index(drop=True)

# Assignment 3: Sorting and Filtering Series

* First, get the 10 lowest prices from the data. 
* Sort the 10 lowest prices by date, starting with the most recent and ending with the oldest price.

* Finally, use the list of provided dates. Select only rows with these dates that had a price of less than 50 dollars per barrel.

In [0]:
# list of dates to be used to solve bullet 3
# NOTE: this rebinds `dates`, which an earlier cell assigned to a Series of dates

dates = [
    "2016-12-22",
    "2017-05-03",
    "2017-01-06",
    "2017-03-05",
    "2017-02-12",
    "2017-03-21",
    "2017-04-14",
    "2017-04-15",
]

In [0]:
# Sort prices ascending and keep the first 10 rows (the 10 lowest prices)
lowest_ten = oil_series.sort_values().head(10)

# Reorder those rows by date label, most recent first
lowest_ten.sort_index(ascending=False)

In [0]:
# Create mask to filter to only dates in the list of dates and oil price < 50
# (strictly less than 50, as the assignment asks; a price of exactly 50 is excluded)

mask = oil_series.index.isin(dates) & (oil_series < 50)

oil_series.loc[mask]


# Assignment 4: Series Operations

* Increase the prices in the oil series by 10%, and add an additional 2 dollars per barrel on top of that.

* Then, create a series that represents the difference between each price and max price.

* Finally, extract the month from the string dates in the index and store them as an integer in their own series.

In [0]:
# Multiply oil series values by 1.1 (10% increase), then add 2 to each row

# with Pandas methods (not displayed: only the last expression of a cell is shown)
oil_series.mul(1.1).add(2)

# with Python operators
oil_series * 1.1 + 2

In [0]:
# Get max price, store in variable so it can be reused in the next calculation

max_price = oil_series.max()

max_price

In [0]:
# Subtract max price from all rows in oil_series (returns a Series).
# This is the plain difference in dollars, with no division by max_price,
# so every value is <= 0 and the row holding the max price is exactly 0.
oil_series - max_price

In [0]:
# Create a series from the index of oil_series so the date labels can be
# handled as Series values
string_dates = pd.Series(oil_series.index)

In [0]:
# Characters 5-6 of each date string hold the month; cut them out and cast to int
string_dates.str.slice(5, 7).astype("int")

In [0]:
# Same month extraction without the intermediate variable
pd.Series(oil_series.index).str.slice(start=5, stop=7).astype("int")

# Assignment 5: Series Aggregations

* Calculate the sum and mean of prices in the month of March. 

* Next, calculate how many prices were recorded in January and February.

* Then, calculate the 10th and 90th percentiles across all data.

* Finally, how often did each integer dollar value (e.g. 51, 52) occur in the data? Normalize this to a percentage.

In [0]:
# Filter series to March (month 3), calculate sum of prices, and round.
# Compare the full two-digit month ("03") instead of only its last character,
# using the same month slice as the January/February filter.

oil_series[oil_series.index.str[5:7] == "03"].sum().round(2)

In [0]:
# Filter series to March, calculate mean.
# Compare the full two-digit month ("03") instead of only its last character,
# using the same month slice as the January/February filter.

oil_series[oil_series.index.str[5:7] == "03"].mean()

In [0]:
# Boolean flag per row: is the two-character month "01" or "02"?
in_jan_or_feb = oil_series.index.str[5:7].isin(["01", "02"])

# Count the prices recorded in those two months
oil_series[in_jan_or_feb].count()

In [0]:
# 10th and 90th percentiles of all prices, returned as a Series keyed by quantile

oil_series.quantile(q=[0.1, 0.9])

In [0]:
# Reduce each price to its integer dollar value
whole_dollars = oil_series.astype("int")

# Share of rows per integer dollar value (normalize=True returns fractions of the total)
whole_dollars.value_counts(normalize=True)

# Assignment 6: Missing Data

There were some erroneous prices in our data, so they were filled in with missing values.

Can you confirm the number of missing values in the price column? 

Once you’ve done that, fill the prices in with the median of the oil price series.


In [0]:
# Replace two erroneous prices with missing values.
# np.nan is the native missing-value marker for float data, so the Series keeps
# its numeric dtype (pd.NA may turn it into object dtype on some pandas versions,
# which would break the numeric comparisons made in later cells).
oil_series = oil_series.where(~oil_series.isin([51.44, 47.83]), np.nan)

In [0]:
# Count missing values: isna() flags each missing entry, sum() totals the flags

oil_series.isna().sum()

In [0]:
# Median of the recorded prices
median_price = oil_series.median()

# Show the series with gaps filled by the median
# (the result is displayed only; oil_series itself is not reassigned)
oil_series.fillna(median_price)

# Assignment 7: Apply and Where

Write a function that outputs ‘Buy’ if the price is less than the 90th percentile and ‘Wait’ if it’s not. Apply it to the oil series.

Then, create a series that multiplies price by .9 if the date is ‘2016-12-23’ or ‘2017-05-10’, and 1.1 for all other dates. 

In [0]:
# Define a function that returns 'Buy' if price below limit, 'Wait' if not.

def buy_bool(price, limit):
    """Label a price relative to a threshold.

    Args:
        price: Value to evaluate.
        limit: Threshold the price is compared against.

    Returns:
        "Buy" when ``price < limit`` evaluates truthy, otherwise "Wait"
        (so a price equal to the limit yields "Wait").
    """
    return "Buy" if price < limit else "Wait"

In [0]:
# 90th percentile of the series, used as the buy/wait threshold
limit_90 = oil_series.quantile(0.9)

# Apply function to oil series; extra positional arguments go through args=,
# which must be a list or tuple (hence the one-element tuple)
oil_series.apply(buy_bool, args=(limit_90,))

In [0]:
# Lambda function version of Wait/Buy
# Compute the 90th percentile once up front; calling quantile() inside the
# lambda would recompute it for every row of the series.

p90 = oil_series.quantile(0.9)

oil_series.apply(lambda x: "Buy" if x < p90 else "Wait")

In [0]:
# Flag the rows whose date label is one of the two special dates
is_special_date = oil_series.index.isin(["2016-12-23", "2017-05-10"])

# where() keeps a value when the condition is True and substitutes otherwise:
# step 1 - rows that are NOT special dates are replaced with price * 1.1
# step 2 - rows that ARE special dates are replaced with price * .9
raised = oil_series.where(is_special_date, oil_series * 1.1)
raised.where(~is_special_date, oil_series * .9)

In [0]:
# Use NumPy where to modify price based on dates.
# if date in list, multiply by .9
# if date not in list, multiply by 1.1
# np.where returns a plain NumPy array, so convert it back to a Series and
# reattach the date index and name of oil_series.
# (NumPy is already imported at the top of the notebook.)

pd.Series(
    np.where(
        oil_series.index.isin(["2016-12-23", "2017-05-10"]),
        oil_series * 0.9,
        oil_series * 1.1,
    ),
    index=oil_series.index,
    name=oil_series.name,
)