The original dataset was in Microsoft Excel, and the following changes were made there before exporting to CSV:

- Remove data from before 1990
- Edit column names to include relevant information

### Note: All prices are represented in cents per pound

The following code cleans the data and exports it to a new CSV file for later use.

In [1]:
import pandas as pd

In [2]:
# Load the CSV that was exported from the trimmed Excel workbook.
# The date column arrives under the pandas-generated name "Unnamed: 0";
# it is renamed in a later cell.
file_path = "beef.csv"
beef_df = pd.read_csv(filepath_or_buffer=file_path)
beef_df

Unnamed: 0.1,Unnamed: 0,Byproduct (B),Gross farm value (C),Net farm value (D = C - B),Wholesale value (E),Retail value (F),Total Spread (F -D),Farm-wholesale Spread (E - D),Wholesale-retail (F - E),All fresh beef retail value
0,January-90,21.8,189.4,167.6,187.1,274.4,106.8,19.5,87.3,252.4
1,February-90,21.5,188.7,167.2,186.0,271.0,103.8,18.8,85.0,259.1
2,March-90,21.1,190.4,169.3,187.7,272.5,103.2,18.4,84.8,257.3
3,April-90,21.2,192.0,170.8,190.1,277.9,107.1,19.3,87.8,261.4
4,May-90,20.7,187.9,167.2,191.6,283.6,116.4,24.4,92.0,259.2
...,...,...,...,...,...,...,...,...,...,...
391,August-22,33.6,336.0,302.4,397.5,757.6,455.2,95.1,360.1,732.4
392,September-22,38.7,341.6,302.9,380.5,760.1,457.1,77.6,379.6,732.4
393,October-22,42.7,355.2,312.5,378.7,742.2,429.6,66.1,363.5,724.8
394,November-22,38.9,364.2,325.2,389.2,736.5,411.3,64.0,347.3,714.9


In [4]:
# Give the unlabeled date column a proper name and add the missing "Spread"
# to the wholesale-retail header so it matches the other spread columns.
column_renames = {
    "Unnamed: 0": "Date",
    "Wholesale-retail (F - E)": "Wholesale-retail Spread (F - E)",
}
beef_df = beef_df.rename(columns=column_renames)
beef_df.head()

Unnamed: 0,Date,Byproduct (B),Gross farm value (C),Net farm value (D = C - B),Wholesale value (E),Retail value (F),Total Spread (F -D),Farm-wholesale Spread (E - D),Wholesale-retail Spread (F - E),All fresh beef retail value
0,January-90,21.8,189.4,167.6,187.1,274.4,106.8,19.5,87.3,252.4
1,February-90,21.5,188.7,167.2,186.0,271.0,103.8,18.8,85.0,259.1
2,March-90,21.1,190.4,169.3,187.7,272.5,103.2,18.4,84.8,257.3
3,April-90,21.2,192.0,170.8,190.1,277.9,107.1,19.3,87.8,261.4
4,May-90,20.7,187.9,167.2,191.6,283.6,116.4,24.4,92.0,259.2


In [5]:
# Inspect the dtype of every column. The price columns load as float64, while
# "Date" is a plain object (string) column that still needs converting to datetime.
beef_df.dtypes

Date                                object
Byproduct (B)                      float64
Gross farm value (C)               float64
Net farm value (D = C - B)         float64
Wholesale value (E)                float64
Retail value (F)                   float64
Total Spread (F -D)                float64
Farm-wholesale Spread (E - D)      float64
Wholesale-retail Spread (F - E)    float64
All fresh beef retail value        float64
dtype: object

In [6]:
# "Date" holds text such as "January-90". As the first step towards a real
# datetime column, break it at the hyphen into a month name and a two-digit year.
month_year = beef_df["Date"].str.split(pat="-", expand=True)
month_year.columns = ["Month", "Year"]

beef_df["Month"] = month_year["Month"]
beef_df["Year"] = month_year["Year"]

beef_df.sample(20)

Unnamed: 0,Date,Byproduct (B),Gross farm value (C),Net farm value (D = C - B),Wholesale value (E),Retail value (F),Total Spread (F -D),Farm-wholesale Spread (E - D),Wholesale-retail Spread (F - E),All fresh beef retail value,Month,Year
368,September-20,16.9,249.9,233.0,336.7,637.1,404.1,103.8,300.4,627.9,September,20
3,April-90,21.2,192.0,170.8,190.1,277.9,107.1,19.3,87.8,261.4,April,90
123,April-00,17.3,176.2,158.9,191.0,305.4,146.5,32.1,114.4,272.5,April,0
210,July-07,25.5,216.3,190.7,218.1,414.3,223.5,27.3,196.2,379.1,July,7
200,September-06,20.6,216.7,196.0,224.9,392.6,196.6,28.9,167.7,357.9,September,6
203,December-06,21.8,207.1,185.3,221.6,392.1,206.8,36.4,170.4,361.7,December,6
207,April-07,24.0,234.4,210.4,251.9,428.5,218.1,41.5,176.6,376.7,April,7
378,July-21,25.8,284.9,259.1,414.0,752.9,493.8,155.0,338.9,710.2,July,21
198,July-06,17.9,195.0,177.1,228.4,387.8,210.7,51.3,159.4,353.6,July,6
72,January-96,20.0,153.4,133.4,156.2,281.5,148.1,22.8,125.3,257.7,January,96


In [7]:
# Confirm that the new "Month" and "Year" columns exist; both are object
# (string) columns at this stage, so "Year" still needs converting to int.
beef_df.dtypes

Date                                object
Byproduct (B)                      float64
Gross farm value (C)               float64
Net farm value (D = C - B)         float64
Wholesale value (E)                float64
Retail value (F)                   float64
Total Spread (F -D)                float64
Farm-wholesale Spread (E - D)      float64
Wholesale-retail Spread (F - E)    float64
All fresh beef retail value        float64
Month                               object
Year                                object
dtype: object

In [8]:
# Expand the two-digit "Year" into a four-digit "New Year" column.
# The dataset starts in 1990 (earlier rows were removed before export), so
# two-digit years of 90 and above belong to the 1900s and everything below
# belongs to the 2000s.
CENTURY_PIVOT = 90

# "Year" comes out of the string split as text; store it as an integer.
two_digit_year = beef_df["Year"].astype(int)
beef_df["Year"] = two_digit_year

# Vectorised century selection: keep the 1900s value wherever the pivot test
# holds, otherwise fall back to the 2000s value. This replaces a per-row
# Python loop and produces the same integer column.
beef_df["New Year"] = (two_digit_year + 1900).where(
    two_digit_year >= CENTURY_PIVOT, two_digit_year + 2000
)

beef_df.sample(10)

Unnamed: 0,Date,Byproduct (B),Gross farm value (C),Net farm value (D = C - B),Wholesale value (E),Retail value (F),Total Spread (F -D),Farm-wholesale Spread (E - D),Wholesale-retail Spread (F - E),All fresh beef retail value,Month,Year,New Year
33,October-92,20.0,180.1,160.1,177.5,285.6,125.5,17.4,108.1,266.9,October,92,1992
130,November-00,19.9,172.3,152.4,182.8,310.3,157.8,30.3,127.5,279.6,November,0,2000
352,May-19,19.0,279.0,260.0,342.5,617.3,357.3,82.5,274.8,590.6,May,19,2019
89,June-97,20.7,153.9,133.2,156.1,277.6,144.4,22.9,121.5,251.7,June,97,1997
221,June-08,28.3,229.4,201.1,243.1,429.6,228.4,42.0,186.5,393.5,June,8,2008
363,April-20,13.9,248.3,234.4,402.0,644.4,409.9,167.5,242.4,622.2,April,20,2020
51,April-94,19.9,180.7,160.8,176.8,287.1,126.3,16.0,110.3,269.9,April,94,1994
160,May-03,17.3,189.7,172.4,219.9,361.4,189.0,47.5,141.5,317.5,May,3,2003
379,August-21,24.8,291.2,266.3,486.5,763.9,497.5,220.1,277.4,714.1,August,21,2021
140,September-01,17.6,164.5,146.9,186.6,337.6,190.8,39.8,151.0,301.2,September,1,2001


In [9]:
# Build a single datetime column from the four-digit year and the month name.
#
# Approach adapted from:
# https://stackoverflow.com/questions/50663700/convert-year-and-month-name-into-datetime-column-for-pandas-dataframe
# Format codes reference:
# https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior

# Concatenate to text like "1990January", which "%Y%B" (4-digit year followed
# by the full month name) can parse; the day defaults to the 1st of the month.
year_text = beef_df["New Year"].astype(str)
month_text = beef_df["Month"]
beef_df["DateTime"] = pd.to_datetime(year_text + month_text, format="%Y%B")
beef_df.sample(10)

Unnamed: 0,Date,Byproduct (B),Gross farm value (C),Net farm value (D = C - B),Wholesale value (E),Retail value (F),Total Spread (F -D),Farm-wholesale Spread (E - D),Wholesale-retail Spread (F - E),All fresh beef retail value,Month,Year,New Year,DateTime
166,November-03,24.4,241.6,217.2,257.4,431.7,214.5,40.2,174.3,362.5,November,3,2003,2003-11-01
343,August-18,20.8,266.0,245.2,320.7,608.2,363.0,75.5,287.5,579.1,August,18,2018,2018-08-01
366,July-20,15.1,228.9,213.9,311.3,683.8,470.0,97.4,372.5,674.3,July,20,2020,2020-07-01
196,May-06,16.6,189.7,173.2,229.3,395.1,221.9,56.1,165.8,359.1,May,6,2006,2006-05-01
180,January-05,18.9,215.1,196.2,231.4,408.5,212.3,35.3,177.0,359.6,January,5,2005,2005-01-01
169,February-04,19.9,189.3,169.4,198.5,398.9,229.5,29.1,200.4,360.3,February,4,2004,2004-02-01
305,June-15,31.8,362.8,331.1,382.4,640.6,309.6,51.4,258.2,610.5,June,15,2015,2015-06-01
29,June-92,18.1,177.5,159.4,180.8,287.1,127.7,21.4,106.3,265.3,June,92,1992,1992-06-01
147,April-02,16.3,161.9,145.6,182.8,333.5,187.9,37.2,150.7,306.5,April,2,2002,2002-04-01
140,September-01,17.6,164.5,146.9,186.6,337.6,190.8,39.8,151.0,301.2,September,1,2001,2001-09-01


In [10]:
# Spot-check a random sample of rows to confirm that "DateTime" agrees with
# the original "Date" text. No random_state is passed, so the rows shown
# differ on every run.
beef_df.sample(20)

Unnamed: 0,Date,Byproduct (B),Gross farm value (C),Net farm value (D = C - B),Wholesale value (E),Retail value (F),Total Spread (F -D),Farm-wholesale Spread (E - D),Wholesale-retail Spread (F - E),All fresh beef retail value,Month,Year,New Year,DateTime
207,April-07,24.0,234.4,210.4,251.9,428.5,218.1,41.5,176.6,376.7,April,7,2007,2007-04-01
289,February-14,40.0,350.4,310.4,327.7,557.5,247.1,17.3,229.9,527.1,February,14,2014,2014-02-01
282,July-13,36.7,284.7,248.0,290.3,533.1,285.1,42.3,242.7,495.5,July,13,2013,2013-07-01
50,March-94,19.6,180.2,160.6,176.9,288.3,127.7,16.3,111.4,267.7,March,94,1994,1994-03-01
148,May-02,16.0,157.4,141.4,180.7,333.5,192.1,39.3,152.8,309.0,May,2,2002,2002-05-01
177,October-04,19.0,204.0,184.9,212.7,402.1,217.1,27.8,189.3,360.0,October,4,2004,2004-10-01
191,December-05,19.8,231.6,211.8,244.2,406.0,194.2,32.4,161.8,364.5,December,5,2005,2005-12-01
361,February-20,22.5,284.5,262.0,317.8,604.5,342.4,55.7,286.7,593.5,February,20,2020,2020-02-01
143,December-01,15.6,153.4,137.8,177.3,330.3,192.5,39.5,153.0,303.3,December,1,2001,2001-12-01
308,September-15,29.3,326.5,297.2,348.8,623.0,325.9,51.6,274.2,605.9,September,15,2015,2015-09-01


In [11]:
# Build the cleaned frame: drop the helper/raw date columns and lead with the
# parsed "DateTime" column (it keeps the name "DateTime"; it is not renamed).

remove_cols = ["Date", "Month", "Year", "New Year"]

# Keep every column that is not one of the intermediates, in original order.
new_columns = [col for col in beef_df.columns if col not in remove_cols]

# "DateTime" was appended last, so rotate the final entry to the front.
new_columns = new_columns[-1:] + new_columns[:-1]

cleaned_beef_df = beef_df[new_columns]

cleaned_beef_df.head()

Unnamed: 0,DateTime,Byproduct (B),Gross farm value (C),Net farm value (D = C - B),Wholesale value (E),Retail value (F),Total Spread (F -D),Farm-wholesale Spread (E - D),Wholesale-retail Spread (F - E),All fresh beef retail value
0,1990-01-01,21.8,189.4,167.6,187.1,274.4,106.8,19.5,87.3,252.4
1,1990-02-01,21.5,188.7,167.2,186.0,271.0,103.8,18.8,85.0,259.1
2,1990-03-01,21.1,190.4,169.3,187.7,272.5,103.2,18.4,84.8,257.3
3,1990-04-01,21.2,192.0,170.8,190.1,277.9,107.1,19.3,87.8,261.4
4,1990-05-01,20.7,187.9,167.2,191.6,283.6,116.4,24.4,92.0,259.2


In [12]:
# Verify the cleaned frame's dtypes: "DateTime" should be datetime64[ns] and
# every price/spread column should be float64.
cleaned_beef_df.dtypes

DateTime                           datetime64[ns]
Byproduct (B)                             float64
Gross farm value (C)                      float64
Net farm value (D = C - B)                float64
Wholesale value (E)                       float64
Retail value (F)                          float64
Total Spread (F -D)                       float64
Farm-wholesale Spread (E - D)             float64
Wholesale-retail Spread (F - E)           float64
All fresh beef retail value               float64
dtype: object

In [13]:
# Write the cleaned frame to disk for downstream use.
# The row index is written too (the to_csv default, spelled out here), so the
# file's first column has an empty header; readers can pass index_col=0 to
# avoid getting an "Unnamed: 0" column on reload.
output_path = "cleaned_beef.csv"
cleaned_beef_df.to_csv(path_or_buf=output_path, index=True)