# Python Group Project

## Group Members:
- Khuthadzo Tshikovhele
- Dembe Tsiwana
- Aston Greeves
- Courtney Daza

# Avocado Dataset Analysis

This notebook presents an exploratory analysis of the Avocado dataset.


# Importing Packages
Purpose: Set up the Python environment with necessary libraries and tools.

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Global plot appearance: seaborn's 'darkgrid' style, then matplotlib's 'ggplot' style sheet.
# NOTE(review): both calls modify matplotlib rcParams, so the later 'ggplot' sheet
# presumably overrides any overlapping 'darkgrid' settings — confirm this order is intended.
sns.set_style('darkgrid')
plt.style.use('ggplot')

# Data Loading and Exploration

In [3]:
# Read the avocado CSV into a DataFrame.
# `url` holds a bare file name (no scheme or directory), so pandas resolves it
# relative to the notebook's working directory.
url = "Avocado_HassAvocadoBoard_20152023v1.0.1.csv"
avocado_data = pd.read_csv(filepath_or_buffer=url)

In [4]:
# Display the first 100 rows of the dataset (pandas truncates the rendered table)
avocado_data.head(100)

Unnamed: 0,Date,AveragePrice,TotalVolume,plu4046,plu4225,plu4770,TotalBags,SmallBags,LargeBags,XLargeBags,type,region
0,2015-01-04,1.22,40873.28,2819.50,28287.42,49.90,9716.46,9186.93,529.53,0.0,conventional,Albany
1,2015-01-04,1.79,1373.95,57.42,153.88,0.00,1162.65,1162.65,0.00,0.0,organic,Albany
2,2015-01-04,1.00,435021.49,364302.39,23821.16,82.15,46815.79,16707.15,30108.64,0.0,conventional,Atlanta
3,2015-01-04,1.76,3846.69,1500.15,938.35,0.00,1408.19,1071.35,336.84,0.0,organic,Atlanta
4,2015-01-04,1.08,788025.06,53987.31,552906.04,39995.03,141136.68,137146.07,3990.61,0.0,conventional,BaltimoreWashington
...,...,...,...,...,...,...,...,...,...,...,...,...
95,2015-01-04,1.30,5782.70,723.29,4221.15,0.00,838.26,223.33,614.93,0.0,organic,Spokane
96,2015-01-04,1.15,198735.26,125713.89,34555.73,53.54,38412.10,38400.99,11.11,0.0,conventional,StLouis
97,2015-01-04,1.80,3597.07,1552.48,1521.26,0.00,523.33,523.33,0.00,0.0,organic,StLouis
98,2015-01-04,1.33,41143.51,2506.61,20905.01,16.68,17715.21,17715.21,0.00,0.0,conventional,Syracuse


In [5]:
# Print column names, dtypes, non-null counts, and memory usage
avocado_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53415 entries, 0 to 53414
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          53415 non-null  object 
 1   AveragePrice  53415 non-null  float64
 2   TotalVolume   53415 non-null  float64
 3   plu4046       53415 non-null  float64
 4   plu4225       53415 non-null  float64
 5   plu4770       53415 non-null  float64
 6   TotalBags     53415 non-null  float64
 7   SmallBags     41025 non-null  float64
 8   LargeBags     41025 non-null  float64
 9   XLargeBags    41025 non-null  float64
 10  type          53415 non-null  object 
 11  region        53415 non-null  object 
dtypes: float64(9), object(3)
memory usage: 4.9+ MB


In [6]:
# Count missing values per column
avocado_data.isna().sum()

Date                0
AveragePrice        0
TotalVolume         0
plu4046             0
plu4225             0
plu4770             0
TotalBags           0
SmallBags       12390
LargeBags       12390
XLargeBags      12390
type                0
region              0
dtype: int64

In [7]:
# Flag every row that exactly repeats an earlier row, then total the flags
duplicate_mask = avocado_data.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"Number of duplicated rows: {duplicate_count}")

Number of duplicated rows: 0


In [8]:
# Descriptive statistics for the numeric columns, transposed so that
# each column of the dataset becomes one row of the summary
avocado_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AveragePrice,53415.0,1.42891,0.393116,0.44,1.119091,1.4,1.69,3.44083
TotalVolume,53415.0,869447.441374,3545274.0,84.56,16264.655,120352.46,454238.015,61034460.0
plu4046,53415.0,298270.749448,1307669.0,0.0,694.725,14580.58,128792.38,25447200.0
plu4225,53415.0,222217.037654,955462.4,0.0,2120.8,17516.63,93515.6,20470570.0
plu4770,53415.0,20531.954686,104097.7,0.0,0.0,90.05,3599.735,2860025.0
TotalBags,53415.0,217508.289491,867694.7,0.0,7846.52,36953.1,111014.61,16298300.0
SmallBags,41025.0,103922.170677,569260.8,0.0,0.0,694.58,37952.98,12567160.0
LargeBags,41025.0,23313.164066,149662.2,0.0,0.0,0.0,2814.92,4324231.0
XLargeBags,41025.0,2731.811796,22589.1,0.0,0.0,0.0,0.0,679586.8


In [8]:
# Summary statistics
# NOTE(review): this cell repeats the preceding summary-statistics cell (same code,
# same execution count) — it looks redundant and is a candidate for removal.
avocado_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AveragePrice,53415.0,1.42891,0.393116,0.44,1.119091,1.4,1.69,3.44083
TotalVolume,53415.0,869447.441374,3545274.0,84.56,16264.655,120352.46,454238.015,61034460.0
plu4046,53415.0,298270.749448,1307669.0,0.0,694.725,14580.58,128792.38,25447200.0
plu4225,53415.0,222217.037654,955462.4,0.0,2120.8,17516.63,93515.6,20470570.0
plu4770,53415.0,20531.954686,104097.7,0.0,0.0,90.05,3599.735,2860025.0
TotalBags,53415.0,217508.289491,867694.7,0.0,7846.52,36953.1,111014.61,16298300.0
SmallBags,41025.0,103922.170677,569260.8,0.0,0.0,694.58,37952.98,12567160.0
LargeBags,41025.0,23313.164066,149662.2,0.0,0.0,0.0,2814.92,4324231.0
XLargeBags,41025.0,2731.811796,22589.1,0.0,0.0,0.0,0.0,679586.8


In [9]:
# Build the transposed summary table, then attach display styling to it
summary_stats = avocado_data.describe().T

# CSS rules: header cells are 12pt and centred, data cells are 10pt
header_rule = {'selector': 'th', 'props': [('font-size', '12pt'), ('text-align', 'center')]}
cell_rule = {'selector': 'td', 'props': [('font-size', '10pt')]}

styled_stats = (
    summary_stats.style
    .set_table_styles([header_rule, cell_rule])
    .set_caption("Summary Statistics")
)

# Render the styled table as the cell output
styled_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AveragePrice,53415.0,1.42891,0.393116,0.44,1.119091,1.4,1.69,3.44083
TotalVolume,53415.0,869447.441374,3545273.998345,84.56,16264.655,120352.46,454238.015,61034457.1
plu4046,53415.0,298270.749448,1307669.329406,0.0,694.725,14580.58,128792.38,25447201.87
plu4225,53415.0,222217.037654,955462.406892,0.0,2120.8,17516.63,93515.6,20470572.61
plu4770,53415.0,20531.954686,104097.691562,0.0,0.0,90.05,3599.735,2860025.19
TotalBags,53415.0,217508.289491,867694.737352,0.0,7846.52,36953.1,111014.61,16298296.29
SmallBags,41025.0,103922.170677,569260.825954,0.0,0.0,694.58,37952.98,12567155.58
LargeBags,41025.0,23313.164066,149662.244458,0.0,0.0,0.0,2814.92,4324231.19
XLargeBags,41025.0,2731.811796,22589.096454,0.0,0.0,0.0,0.0,679586.8
