# Project 

## Part 1

First of all, we install the **yfinance** library and import all the libraries we need.

In [None]:
!pip install yfinance

In [None]:
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt

**1.** Taking **Gold** & **Oil** data from Yahoo Finance.

In [None]:
# Yahoo Finance tickers: symbol1 is downloaded into `oil`, symbol2 into `gold`.
symbol1, symbol2 = "CL=F", "GC=F"

# The download window is the 15 years leading up to the end date.
end_date = "2022-12-31"
start_date = pd.Timestamp(end_date) - pd.DateOffset(years=15)

date_window = {"start": start_date, "end": end_date}
oil = yf.download(symbol1, **date_window)
gold = yf.download(symbol2, **date_window)

# Keep a CSV copy of each raw download.
for raw_frame, csv_name in ((oil, "oil_data.csv"), (gold, "gold_data.csv")):
    raw_frame.to_csv(csv_name)

# Show both raw frames.
print("Oil data:")
print(oil)
print("\n")
print("gold data:")
print(gold)

**2.** Start **sampling** & **matching** data.

* We should find **missing data**.

In [None]:
# Count the missing (NaN) entries in every column of each raw frame.
oil_missing_values = oil.isnull().sum()
gold_missing_values = gold.isnull().sum()

# Oil counts first, then gold counts.
print(oil_missing_values, gold_missing_values, sep="\n")

* Now I want to **resample** the data using the `resample` method. As you can see, we don't have missing values, but to analyse the data more accurately I want to resample it.

In [None]:
# Down-sample both daily frames to weekly means and put them on one shared
# weekly index that spans start_date..end_date.
# NOTE(review): reindexing keeps only the labels present in common_index, so a
# weekly bin labelled after end_date is not carried over -- confirm that is intended.
common_index = pd.date_range(start=start_date, end=end_date, freq="W")

gold_weekly_data = gold.resample("W").mean()
oil_weekly_data = oil.resample("W").mean()

gold_aligned_data = gold_weekly_data.reindex(common_index)
oil_aligned_data = oil_weekly_data.reindex(common_index)

# Per-column NaN counts after alignment.
oil_aligned_missing_values = oil_aligned_data.isnull().sum()
gold_aligned_missing_values = gold_aligned_data.isnull().sum()

print(oil_aligned_missing_values, gold_aligned_missing_values, sep="\n")

In [None]:
# Persist the weekly, aligned frames as CSV files.
for aligned_frame, csv_target in (
    (oil_aligned_data, "oil_aligned_data.csv"),
    (gold_aligned_data, "gold_aligned_data.csv"),
):
    aligned_frame.to_csv(csv_target)

# Show both aligned frames.
print("Oil aligned data: ")
print(oil_aligned_data)
print("\n")
print("Gold aligned data: ")
print(gold_aligned_data)

* Now I want to **normalize** the data. As you can see, I use **Z-score normalization** to normalize the data.

In [None]:
# Z-score helper used for both the gold and the oil data.
def normalization(df):
    """Return ``df`` with its mean subtracted and divided by its standard deviation.

    ``df.mean()`` and ``df.std()`` are computed from the input itself. The
    input is not modified; a new object is returned.
    """
    centered = df.sub(df.mean())
    return centered.div(df.std())


# Apply the z-score helper to each aligned frame.
gold_normalized_data = normalization(gold_aligned_data)
oil_normalized_data = normalization(oil_aligned_data)

# Summarise the normalized columns instead of printing every row.
oil_normalized_summary = oil_normalized_data.describe()
gold_normalized_summary = gold_normalized_data.describe()

print("Oil normalized data: ")
print(oil_normalized_summary)
print("\n")
print("Gold normalized data: ")
print(gold_normalized_summary)

* Now I want to convert the data to **stationary series**. There are different ways to do that, but I chose the difference transformation because I want to remove **trends** and other **patterns** that vary over time. Below this cell I plotted every column of the stationary series to show the effect of converting the original data to stationary series.

In [None]:
# First-difference each aligned frame, then drop the rows that contain NaN.
oil_stationary_series = oil_aligned_data.diff().dropna()
gold_stationary_series = gold_aligned_data.diff().dropna()

# Summary statistics, one column at a time, for oil and then gold.
print("Oil:")
for column in oil_stationary_series.columns:
    print(oil_stationary_series[column].describe())

print("\n")
print("Gold:")
for column in gold_stationary_series.columns:
    print(gold_stationary_series[column].describe())

# One figure per differenced column: all oil columns first, then all gold columns.
for asset, differenced in (("oil", oil_stationary_series), ("gold", gold_stationary_series)):
    for column in differenced.columns:
        plt.style.use("dark_background")
        plt.figure(figsize=(16, 9), dpi=500)
        plt.title(f"{column} of {asset}", fontsize=20, fontweight="bold")
        plt.xlabel("Date", fontsize=12)
        plt.ylabel("Price", fontsize=12)
        plt.grid(color="white", linestyle="--", linewidth=0.5)
        # Series.plot draws onto the figure opened above.
        differenced[column].plot(linewidth=1)

3. **EDA**

* For the first part, we should plot all our data for visualization.

In [None]:
# One figure per column of the aligned data: all oil columns first, then gold.
for asset, aligned in (("oil", oil_aligned_data), ("gold", gold_aligned_data)):
    for column in aligned.columns:
        plt.style.use("dark_background")
        plt.figure(figsize=(16, 9), dpi=500)
        plt.title(f"{column} of {asset}", fontsize=20, fontweight="bold")
        plt.xlabel("Date", fontsize=12)
        plt.ylabel("Price", fontsize=12)
        plt.grid(color="white", linestyle="--", linewidth=0.5)
        # Series.plot draws onto the figure opened above.
        aligned[column].plot(linewidth=1)

* For the second part we use **descriptive statistics** to understand the distribution of the data. As you can see, I use the `pivot_table` method to tabulate each descriptive statistic by year.

In [None]:
# Aggregations to report, given by the names passed to pivot_table's ``aggfunc``.
Descriptive_Statistics = ["mean", "median", "sum", "max", "min", "std", "var"]

print("Descriptive Statistics of oil")
print("\n")

# Every column is aggregated per calendar year of the index.
oil_value_columns = oil_aligned_data.columns
oil_years = oil_aligned_data.index.year
for statistic in Descriptive_Statistics:
    oil_pivot_table = oil_aligned_data.pivot_table(
        values=oil_value_columns,
        index=oil_years,
        aggfunc=statistic,
    )
    print(f"{statistic}:")
    print(oil_pivot_table)
    print("\n")

In [None]:
# Aggregations to report, given by the names passed to pivot_table's ``aggfunc``.
Descriptive_Statistics = ["mean", "median", "sum", "max", "min", "std", "var"]

print("Descriptive Statistics of gold")
print("\n")

# Every column is aggregated per calendar year of the index.
gold_value_columns = gold_aligned_data.columns
gold_years = gold_aligned_data.index.year
for statistic in Descriptive_Statistics:
    gold_pivot_table = gold_aligned_data.pivot_table(
        values=gold_value_columns,
        index=gold_years,
        aggfunc=statistic,
    )
    print(f"{statistic}:")
    print(gold_pivot_table)
    print("\n")

* **Correlation & Causality**. For this part we should find the correlation and causality between two variables.

In [None]:
# Correlation between every pair of distinct columns of the aligned oil data.
# Pairing each column only with the columns that come after it reports every
# unordered pair exactly once, without keeping a list of already-seen columns.
oil_columns = list(oil_aligned_data.columns)
for position, column1 in enumerate(oil_columns):
    for column2 in oil_columns[position + 1:]:
        correlation = oil_aligned_data[column1].corr(oil_aligned_data[column2])
        print(f"Correlation between {column1} and {column2}:", correlation)

In [None]:
# Correlation between every pair of distinct columns of the aligned gold data.
# Pairing each column only with the columns that come after it reports every
# unordered pair exactly once, without keeping a list of already-seen columns.
gold_columns = list(gold_aligned_data.columns)
for position, column1 in enumerate(gold_columns):
    for column2 in gold_columns[position + 1:]:
        correlation = gold_aligned_data[column1].corr(gold_aligned_data[column2])
        print(f"Correlation between {column1} and {column2}:", correlation)

4. Trend **analysis & seasonality**

* In this part I want to calculate the **30-day moving average** of the **aligned data**. Since I resampled the data from daily to weekly frequency, I calculate a 4-week moving average instead. After that I plot it together with the aligned data to find **trends** and any **patterns**, if they exist.

In [None]:
# Moving-average helper applied to every column of a frame.
# The default window of 4 rows corresponds to 4 weeks on the weekly data.
def MV_calculater(df, window=4):
    """Return the rolling mean of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are averaged; it is not modified.
    window : int, default 4
        Number of consecutive rows in each average.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``. The first
        ``window - 1`` rows are NaN because the window is not yet full.
    """
    return df.rolling(window=window).mean()


# Smooth each aligned frame, then drop the rows that contain NaN.
oil_smoothed = MV_calculater(oil_aligned_data)
gold_smoothed = MV_calculater(gold_aligned_data)
oil_mv = oil_smoothed.dropna()
gold_mv = gold_smoothed.dropna()

# Show both moving-average frames.
print("Oil moving average 4-weeks:")
print(oil_mv)
print("\n")
print("Gold moving average 4-weeks:")
print(gold_mv)

In [None]:
# Overlay each aligned column with its moving average, one figure per column:
# all oil columns first, then all gold columns.
for asset, weekly, smoothed in (
    ("oil", oil_aligned_data, oil_mv),
    ("gold", gold_aligned_data, gold_mv),
):
    for column in weekly.columns:
        plt.style.use("dark_background")
        plt.figure(figsize=(16, 9), dpi=500)
        plt.title(f"{column} of {asset}", fontsize=20, fontweight="bold")
        plt.xlabel("Date", fontsize=12)
        plt.ylabel("Price", fontsize=12)
        plt.grid(color="white", linestyle="--", linewidth=0.5)
        # Both Series.plot calls draw onto the figure opened above.
        weekly[column].plot(linewidth=1, label="Original")
        smoothed[column].plot(
            linewidth=1,
            label="4-weeks moving average",
            linestyle="--",
            color="red",
        )
        plt.legend()

* With the code below I want to show the direction of the **trend** for each column.

In [None]:
# Label each column's trend by comparing the last two moving-average values.
for position, (heading, averages) in enumerate((("Oil:", oil_mv), ("Gold:", gold_mv))):
    if position:
        # Separator between the oil report and the gold report.
        print("\n")
    print(heading)
    for column in averages.columns:
        latest = averages[column].iloc[-1]
        previous = averages[column].iloc[-2]
        if latest < previous:
            trend = "Downward"
        elif latest > previous:
            trend = "Upward"
        else:
            trend = "Flat"
        print(f"The trend direction is {trend} for {column}.")

* In this part I want to do a **seasonality analysis** on the **Close** column of each dataframe.

In [None]:
# Extract the month and the day of the month from the index.
# NOTE(review): this adds two columns to `oil` in place, so re-running an
# earlier cell that aggregates every column of `oil` (e.g. the weekly resample)
# after this one will also aggregate "Month" and "Day".
oil["Month"] = oil.index.month
oil["Day"] = oil.index.day

# Calculate the average closing price for every (month, day) combination.
average_price = oil.groupby(["Month", "Day"])["Close"].mean()

# Reshape for plotting: move the Month level into the columns so that rows are
# days of the month (x-axis) and every month becomes one line. This matches the
# "Day of the Month" x-label and the "Month" legend title below.
average_price = average_price.unstack(level="Month")

# Plot the seasonality pattern. The axes are created explicitly and passed to
# DataFrame.plot, which would otherwise open a second figure and leave the
# first one empty.
fig, ax = plt.subplots(figsize=(16, 9), dpi=500)
average_price.plot(ax=ax)
ax.set_title("Seasonality Analysis of Oil Close Prices")
ax.set_xlabel("Day of the Month")
ax.set_ylabel("Average Close Price")
ax.legend(title="Month")
plt.show()


In [None]:
# Extract the month and the day of the month from the index.
# NOTE(review): this adds two columns to `gold` in place, so re-running an
# earlier cell that aggregates every column of `gold` (e.g. the weekly resample)
# after this one will also aggregate "Month" and "Day".
gold["Month"] = gold.index.month
gold["Day"] = gold.index.day

# Calculate the average closing price for every (month, day) combination.
average_price = gold.groupby(["Month", "Day"])["Close"].mean()

# Reshape for plotting: move the Month level into the columns so that rows are
# days of the month (x-axis) and every month becomes one line. This matches the
# "Day of the Month" x-label and the "Month" legend title below.
average_price = average_price.unstack(level="Month")

# Plot the seasonality pattern. The axes are created explicitly and passed to
# DataFrame.plot, which would otherwise open a second figure and leave the
# first one empty.
fig, ax = plt.subplots(figsize=(16, 9), dpi=500)
average_price.plot(ax=ax)
ax.set_title("Seasonality Analysis of Gold Close Prices")
ax.set_xlabel("Day of the Month")
ax.set_ylabel("Average Close Price")
ax.legend(title="Month")
plt.show()
