# About

The dataset contains the following columns:

- **Date**: The date of the trading session, typically formatted as YYYY-MM-DD.

- **Open**: The price at which Coca-Cola's stock began trading at the start of the session.

- **High**: The highest price that Coca-Cola's stock reached during the trading session.

- **Low**: The lowest price that Coca-Cola's stock reached during the trading session.

- **Close**: The price at which Coca-Cola's stock closed at the end of the trading session.

- **Adj Close**: The adjusted close price, which accounts for corporate actions such as dividends and stock splits. This value provides a more accurate reflection of the stock's value.

- **Volume**: The total number of shares traded during the trading session, indicating the level of market activity.


# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence all warnings from here on. Note that this also hides deprecation
# notices emitted by pandas / seaborn in the cells below.
warnings.filterwarnings('ignore')

# Overview

In [2]:
# Load the dataset directly from the raw CSV hosted on GitHub.
df = pd.read_csv('https://raw.githubusercontent.com/CelioMaciel179/PDS002_ANALISE_CocaCola/main/datasets/Coca%20Cola.csv')

In [3]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-01-02,46.939999,47.220001,46.560001,46.93,39.828789,11603700
1,2019-01-03,46.82,47.369999,46.529999,46.639999,39.582672,14714400
2,2019-01-04,46.75,47.57,46.639999,47.57,40.371952,13013700
3,2019-01-07,47.57,47.75,46.900002,46.950001,39.845768,13135500
4,2019-01-08,47.25,47.57,47.040001,47.48,40.295567,15420700


In [4]:
# (number of rows, number of columns)
df.shape

(1258, 7)

In [5]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1258 non-null   object 
 1   Open       1258 non-null   float64
 2   High       1258 non-null   float64
 3   Low        1258 non-null   float64
 4   Close      1258 non-null   float64
 5   Adj Close  1258 non-null   float64
 6   Volume     1258 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 68.9+ KB


# Preparation

In [6]:
# Number of missing values in each column
df.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [7]:
# Count fully duplicated rows
df.duplicated().sum()

0

In [8]:
# Convert the 'Date' column (loaded as object/str) to datetime, overwriting it
# in place so the time-based plots and resampling further down can use it.
df['Date'] = pd.to_datetime(df['Date'])

In [9]:
# Coca-Cola brand colours, keyed by name so individual plots can look them up
# (e.g. coca_cola_palette['red']).
coca_cola_palette = {
    'red': '#DA291C',
    'white': '#FFFFFF',
    'black': '#000000'
}
# Hand seaborn the hex values rather than the dict itself: iterating a dict
# yields its keys, so seaborn would otherwise see the strings 'red', 'white'
# and 'black' instead of the brand hex codes defined above.
sns.set_palette(list(coca_cola_palette.values()))

# Statistical summary

In [10]:
# Count, mean, min/max, quartiles and standard deviation for each column.
df.describe()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
count,1258,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0
mean,2021-06-30 18:48:38.918918912,55.284014,55.685572,54.846208,55.266399,50.890645,15094190.0
min,2019-01-02 00:00:00,38.759998,38.889999,36.27,37.560001,33.175083,3265500.0
25%,2020-04-01 06:00:00,50.975001,51.315,50.470001,50.8625,45.064361,10944720.0
50%,2021-06-30 12:00:00,55.004999,55.365,54.754999,55.025,50.465061,13660250.0
75%,2022-09-28 18:00:00,60.200001,60.630001,59.7475,60.2275,57.792684,17170580.0
max,2023-12-29 00:00:00,67.0,67.199997,65.720001,66.209999,62.319378,67845700.0
std,,5.686688,5.681123,5.703617,5.709911,7.065449,6928657.0


# Distribution Analysis

In [11]:
# Sturges' rule for the histogram bin count: k = ceil(log2(n) + 1),
# where n is the number of observations (rows).
n = len(df)
bins = int(np.ceil(np.log2(n) + 1))

In [None]:
# Column -> panel title, listed in the order the panels fill the 3x2 grid
# (left to right, top to bottom).
hist_titles = {
    'Open': 'Distribution of Opening Prices',
    'High': 'Distribution of Highest Prices',
    'Low': 'Distribution of Lowest Prices',
    'Close': 'Distribution of Closing Prices',
    'Adj Close': 'Distribution of Adjusted Closing Prices',
    'Volume': 'Distribution of Trading Volumes',
}

fig, axs = plt.subplots(3, 2, figsize=(14, 14))

# One histogram with a KDE overlay per column, all sharing the Sturges bin count.
for ax, (column, title) in zip(axs.flat, hist_titles.items()):
    sns.histplot(df[column], bins=bins, kde=True, ax=ax, color=coca_cola_palette['red'])
    ax.set_title(title)

plt.tight_layout()
plt.show()

# Relationship Analysis

In [None]:
columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

# Scatter plot for every pair of columns, with KDE curves on the diagonal.
sns.pairplot(
    df[columns],
    diag_kind='kde',
    plot_kws=dict(alpha=0.5),
)

# y > 1 places the figure title just above the top edge of the grid.
plt.suptitle('Pairplot of Stock Prices', y=1.02)

plt.show()

# Correlation Analysis

In [None]:
# Correlate only the price/volume columns. Selecting them explicitly keeps the
# matrix independent of how the installed pandas version treats the datetime
# 'Date' column, and of helper columns other cells add to df
# (e.g. 'Rolling Mean'), so the result no longer depends on execution order.
corr_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
correlation_matrix = df[corr_columns].corr()

# Annotated heatmap of the pairwise correlation coefficients.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Matriz de Correlação')
plt.show()

# Outlier Analysis

In [None]:
columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']

# One violin per price column; the inner lines mark the quartiles.
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(data=df[columns], inner='quartile', palette='muted', ax=ax)

ax.set_title('Violin Plot of Stock Prices')
ax.set_xlabel('Variables')
ax.set_ylabel('Price')
plt.show()

In [None]:
columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']

# Side-by-side boxplots of the price columns
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=df[columns], palette='muted', ax=ax)

# Title and axis labels
ax.set(title='Boxplot of Stock Prices', xlabel='Variables', ylabel='Price')
plt.show()

In [None]:
# Boxplot of the 'Volume' column
plt.figure(figsize=(10, 6))
# A single box has no hue variable for a palette to map onto, and seaborn
# deprecates passing `palette` without `hue`. Use the first 'muted' colour
# through `color` instead, which keeps the same appearance.
sns.boxplot(x=df['Volume'], color=sns.color_palette('muted')[0])

# Title and axis label
plt.title('Boxplot of Trading Volume')
plt.xlabel('Volume')
plt.show()

# Temporal Analysis

In [None]:
# Mean of 'Close' over a trailing window of 30 rows; stored on df as a new column.
df['Rolling Mean'] = df['Close'].rolling(window=30).mean()

fig, ax = plt.subplots(figsize=(14, 7))

# Raw closing price and its smoothed counterpart on the same axes.
sns.lineplot(data=df, x='Date', y='Close', color='blue', label='Closing Price', ax=ax)
sns.lineplot(data=df, x='Date', y='Rolling Mean', color='red', label='30-Day Rolling Mean', ax=ax)

ax.set_title('Closing Price and 30-Day Rolling Mean')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()

plt.show()

In [None]:
# Average every column per calendar month, indexed by date.
# NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favour of 'ME';
# confirm the target pandas version before switching.
df_monthly = df.set_index('Date').resample('M').mean()

fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(
    x=df_monthly.index,
    y=df_monthly['Adj Close'],
    color='green',
    label='Monthly Average Adjusted Close Price',
    ax=ax,
)

ax.set(
    title='Monthly Average of Adjusted Close Price',
    xlabel='Date',
    ylabel='Average Adjusted Close Price',
)
ax.legend()
plt.show()

In [None]:
# Traded volume per session across the whole period
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(data=df, x='Date', y='Volume', color='orange', label='Volume', ax=ax)

# Title and axis labels
ax.set(title='Trading Volume Over Time', xlabel='Date', ylabel='Volume')
ax.legend()
plt.show()
