In [114]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error

In [115]:
url = "https://github.com/dsrscientist/Data-Science-ML-Capstone-Projects/raw/master/avocado.csv.zip"
df = pd.read_csv(url, compression='zip')

In [116]:
df.head()


Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [117]:
df = df.drop('Unnamed: 0', axis = 1)


In [118]:
df.AveragePrice.describe()

count    18249.000000
mean         1.405978
std          0.402677
min          0.440000
25%          1.100000
50%          1.370000
75%          1.660000
max          3.250000
Name: AveragePrice, dtype: float64

In [119]:
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df.drop('Date', axis=1, inplace=True)

In [120]:
region_encoder = LabelEncoder()
df['Region_Code'] = region_encoder.fit_transform(df['region'])


In [121]:
features = ['Total Volume', '4046', '4225', '4770', 'Year', 'Month', 'Region_Code']
target = 'AveragePrice'

In [122]:
X = df[features].values
y = df[target].values

In [123]:
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, df['Region_Code'].values, test_size=0.2, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y, test_size=0.2, random_state=42)


In [124]:
scaler = StandardScaler()
X_train_class = scaler.fit_transform(X_train_class)
X_test_class = scaler.transform(X_test_class)
X_train_reg = scaler.fit_transform(X_train_reg)
X_test_reg = scaler.transform(X_test_reg)

In [125]:
classifier = RandomForestClassifier()
classifier.fit(X_train_class, y_train_class)

In [126]:
regressor = LinearRegression()
regressor.fit(X_train_reg, y_train_reg)


In [127]:
y_pred_class = classifier.predict(X_test_class)
classification_report = classification_report(y_test_class, y_pred_class, target_names=region_encoder.classes_)
print("Classification Report:\n", classification_report)

Classification Report:
                      precision    recall  f1-score   support

             Albany       1.00      1.00      1.00        63
            Atlanta       1.00      1.00      1.00        73
BaltimoreWashington       1.00      1.00      1.00        61
              Boise       1.00      1.00      1.00        72
             Boston       1.00      1.00      1.00        67
   BuffaloRochester       1.00      1.00      1.00        73
         California       1.00      1.00      1.00        62
          Charlotte       1.00      1.00      1.00        76
            Chicago       1.00      0.99      0.99        72
   CincinnatiDayton       1.00      1.00      1.00        65
           Columbus       1.00      1.00      1.00        65
      DallasFtWorth       1.00      0.99      0.99        68
             Denver       0.98      1.00      0.99        65
            Detroit       1.00      1.00      1.00        66
        GrandRapids       1.00      1.00      1.00        71

In [128]:
y_pred_reg = regressor.predict(X_test_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.14576028817379552
