In [None]:
# -------------------------------------------------------- Imports --------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

In [None]:
# -------------------------------------------------------- Load Data --------------------------------------------------------
# Read 'US-pumpkins.csv' (path is relative to the working directory) into a
# DataFrame and preview its first five rows as the cell output.
pumpkins = pd.read_csv(filepath_or_buffer='US-pumpkins.csv')
pumpkins.head(n=5)

In [None]:
# -------------------------------------------------------- Shape Data --------------------------------------------------------
# Keep only rows whose package description mentions 'bushel'. na=False treats a
# missing Package as "no match" instead of yielding a NaN mask that cannot be
# used for boolean indexing.
pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel', case=True, regex=True, na=False)]
columns_to_select = ['Package', 'Variety', 'City Name', 'Low Price', 'High Price', 'Date']
pumpkins = pumpkins.loc[:, columns_to_select]

# Mid-point of the quoted low/high prices, still per package as sold.
price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2

# Parse the dates once and derive both calendar features from the same Series,
# so they share the frame's index.
dates = pd.to_datetime(pumpkins['Date'])
month = dates.dt.month
day_of_year = dates.dt.dayofyear - 1  # zero-based: 1 January -> 0

new_pumpkins = pd.DataFrame(
    {'Month': month,
     'DayOfYear': day_of_year,
     'Variety': pumpkins['Variety'],
     'City': pumpkins['City Name'],
     'Package': pumpkins['Package'],
     'Low Price': pumpkins['Low Price'],
     'High Price': pumpkins['High Price'],
     'Price': price})

# Normalise Price to a single bushel. A "1 1/9 bushel" package holds 1 + 1/9
# bushels (dividing by 1.1 would overstate the price by about 1%); a "1/2 bushel"
# package holds half a bushel, so its price is doubled. The right-hand Series
# is aligned to the selected rows by index.
new_pumpkins.loc[new_pumpkins['Package'].str.contains('1 1/9'), 'Price'] = price / (1 + 1/9)
new_pumpkins.loc[new_pumpkins['Package'].str.contains('1/2'), 'Price'] = price * 2
new_pumpkins.head()

In [None]:
# -------------------------------------------------------- Visualize --------------------------------------------------------
# Month and DayOfYear live on very different x-scales, so each gets its own
# axes; drawing both with plt.scatter in one cell would overlay them on a
# single shared x-axis. The price axis is shared for easy comparison.
fig, (ax_month, ax_day) = plt.subplots(1, 2, figsize=(12, 4), sharey=True)

ax_month.scatter('Month', 'Price', data=new_pumpkins)
ax_month.set(xlabel='Month', ylabel='Price', title='Price by month')

ax_day.scatter('DayOfYear', 'Price', data=new_pumpkins)
ax_day.set(xlabel='DayOfYear', title='Price by day of year')

plt.show()

In [None]:
# ----- Correlation -----
# Print the correlation of each time feature with Price, one value per line
# (Month first, then DayOfYear).
for feature in ('Month', 'DayOfYear'):
    print(new_pumpkins[feature].corr(new_pumpkins['Price']))

In [None]:
# ----- Scatter Graph All Types -----
# Overlay one scatter series per variety on a shared axes object.
ax = None
colors = ['red', 'blue', 'green', 'yellow']
# dropna(): a missing Variety never compares equal to itself, so it would
# select an empty frame below.
for i, var in enumerate(new_pumpkins['Variety'].dropna().unique()):
    df = new_pumpkins[new_pumpkins['Variety'] == var]
    # Wrap around the palette so more varieties than colours cannot raise
    # IndexError (colours repeat instead).
    ax = df.plot.scatter('DayOfYear', 'Price', ax=ax, c=colors[i % len(colors)], label=var)

In [None]:
# ----- Bar Chart All Types -----
# Mean Price per Variety, drawn as one bar per variety.
new_pumpkins.groupby('Variety')['Price'].agg('mean').plot.bar()

In [None]:
# ----- Scatter Graph Pie Type -----
# Restrict to rows whose Variety is exactly 'PIE TYPE' and plot their Price
# against DayOfYear.
is_pie_type = new_pumpkins['Variety'] == 'PIE TYPE'
pie_pumpkins = new_pumpkins[is_pie_type]
pie_pumpkins.plot.scatter(x='DayOfYear', y='Price')

In [None]:
# Drop rows with any missing value. Rebinding the name (rather than
# dropna(inplace=True)) avoids mutating a frame that was produced by filtering
# another one, which is what triggers pandas' SettingWithCopyWarning.
pie_pumpkins = pie_pumpkins.dropna()
pie_pumpkins.info()

In [None]:
# ---------------------------------------------- Linear Regression ----------------------------------------------

In [None]:
# ----- Imports -----
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# ----- Clean & Split Data -----
# Single feature (DayOfYear) as an (n, 1) column array, target is Price.
X = pie_pumpkins['DayOfYear'].to_numpy()[:, np.newaxis]
y = pie_pumpkins['Price']
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# ----- Model & Train -----
# Fit an ordinary least-squares model on the training split; fit() returns the
# estimator itself, which is then shown as the cell output.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [None]:
# ----- Predict -----
# Predictions of the fitted linear model for the held-out inputs.
pred = lin_reg.predict(X=X_test)

In [None]:
# ----- Evaluate -----
# RMSE on the held-out split: the square root of the mean squared error, so it
# is in the same units as the target. It is also reported as a percentage of
# the mean predicted value.
rmse = np.sqrt(mean_squared_error(y_test, pred))
print(f'Mean error: {rmse:3.3} ({rmse/np.mean(pred)*100:3.3}%)')
# Coefficient of determination (R^2). Note: this is computed on the TRAINING
# split, so it describes goodness of fit rather than generalisation.
score = lin_reg.score(X_train, y_train)
print('Model determination: ', score)

In [None]:
# ----- Visualize -----
# Held-out observations as points, with the model's predictions for the same
# inputs drawn as a line on top.
plt.scatter(x=X_test, y=y_test)
plt.plot(X_test, pred)

In [None]:
# ---------------------------------------------- Polynomial Regression ----------------------------------------------

In [None]:


from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Degree-2 polynomial expansion followed by a linear regression, fitted on the
# current training split; the fitted pipeline is shown as the cell output.
pipeline = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
pipeline.fit(X_train, y_train)


In [None]:

# ----- Split Data -----
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# ----- Shape Training Data -----
X = pd.get_dummies(new_pumpkins['Variety']) \
        .join(new_pumpkins['Month']) \
        .join(pd.get_dummies(new_pumpkins['City'])) \
        .join(pd.get_dummies(new_pumpkins['Package']))
y = new_pumpkins['Price']

In [None]:
# ----- Model & Train -----
# Rebuild the degree-2 polynomial + linear regression pipeline and fit it on
# the current training split; the fitted pipeline is the cell output.
pipeline = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
pipeline.fit(X_train, y_train)

In [None]:
# ----- Predict -----
# Predictions of the fitted pipeline for the held-out inputs.
pred = pipeline.predict(X=X_test)

In [None]:
# ----- Evaluate -----
# RMSE on the held-out split: the square root of the mean squared error, so it
# is in the same units as the target. It is also reported as a percentage of
# the mean predicted value.
rmse = np.sqrt(mean_squared_error(y_test, pred))
print(f'Mean error: {rmse:3.3} ({rmse/np.mean(pred)*100:3.3}%)')
# Coefficient of determination (R^2). Note: this is computed on the TRAINING
# split, so it describes goodness of fit rather than generalisation.
score = pipeline.score(X_train, y_train)
print('Model determination: ', score)