In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
import missingno as msno



from statsmodels.tsa.seasonal import seasonal_decompose
import warnings
# Silence every warning category so library chatter does not clutter cell output.
warnings.filterwarnings(action='ignore')

# Notebook-wide plotting defaults: ggplot theme, seaborn "deep" palette, wide figures.
plt.style.use('ggplot')
sns.set_palette(palette="deep")
plt.rcParams.update({'figure.figsize': (14, 7)})


In [None]:
# Fetch the AAPL daily history and keep only the first 10,000 rows.
data = yf.download("AAPL", start="1980-12-12", end="2024-12-31").iloc[:10000]

# Quick look at the column labels and the earliest rows.
for preview in (data.columns, data.head()):
    print(preview)


In [None]:
# Structural overview of the frame: dimensions, column labels and dtypes.
print("Shape:", data.shape)
print("Columns:", data.columns)
print("Data Types:\n", data.dtypes)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
data.info()


In [None]:
# Count the NaN entries in each column.
missing_per_column = data.isna().sum()
print("Missing values:\n", missing_per_column)




In [None]:
# Descriptive statistics, then the per-column skewness and kurtosis.
print(data.describe())
for label, method in (("Skewness", "skew"), ("Kurtosis", "kurtosis")):
    print(f"{label}:\n", getattr(data, method)())


In [None]:
# One 50-bin histogram per column.
data.hist(bins=50, figsize=(15, 10))
plt.tight_layout()
plt.show()

# Kernel density estimate of the closing price.
kde_ax = sns.kdeplot(data['Close'], fill=True)
kde_ax.set_title("Close Price Distribution")
plt.show()


In [None]:
# Pairwise correlation of all columns, rendered as an annotated heatmap.
corr = data.corr()
heatmap_ax = sns.heatmap(corr, annot=True, cmap="Greens")
heatmap_ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Tag each row with the calendar year of its index, then average Close per year.
data['Year'] = data.index.year
yearly_avg = data.groupby('Year')['Close'].mean()

# Draw through the Axes returned by pandas rather than the pyplot state machine.
year_ax = yearly_avg.plot()
year_ax.set(title="Yearly Average Close Price", xlabel="Year", ylabel="Average Close")
year_ax.grid(True)
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Flag Volume outliers with the 1.5 * IQR rule.
#
# squeeze() collapses a one-column frame to a Series. When yfinance returns
# (field, ticker) columns, data['Volume'] is a DataFrame; the mask would then
# be two-dimensional and `data[~outliers]` in the following cells would mask
# values element-wise instead of dropping rows. With a Series the mask is 1-D,
# so row filtering works for either column layout.
daily_volume = data[['Volume']].squeeze()

Q1 = daily_volume.quantile(0.25)
Q3 = daily_volume.quantile(0.75)
IQR = Q3 - Q1
outliers = (daily_volume < (Q1 - 1.5 * IQR)) | (daily_volume > (Q3 + 1.5 * IQR))
print(f"Outliers in Volume: {outliers.sum()}")

# Boxplot for Volume
plt.figure(figsize=(10, 6))
sns.boxplot(daily_volume)
plt.title('Volume Outlier Detection')
plt.show()


In [None]:
# Volume distribution with every row included.
fig_before, ax_before = plt.subplots(figsize=(10, 6))
sns.histplot(data['Volume'], bins=50, kde=True, ax=ax_before)
ax_before.set_title('Distribution of Volume Before Handling Outliers')
plt.show()

# Same view after dropping the rows flagged by the outlier mask
# (rows are removed here, not capped).
data_no_outliers = data[~outliers]
fig_after, ax_after = plt.subplots(figsize=(10, 6))
sns.histplot(data_no_outliers['Volume'], bins=50, kde=True, ax=ax_after)
ax_after.set_title('Distribution of Volume After Handling Outliers')
plt.show()


In [None]:
# Central tendency of Volume on the full frame ...
before_mean, before_median = data['Volume'].mean(), data['Volume'].median()
print(f"Before handling outliers: Mean = {before_mean}, Median = {before_median}")

# ... and on the frame with the flagged rows removed.
data_no_outliers = data[~outliers]
after_mean, after_median = (
    data_no_outliers['Volume'].mean(),
    data_no_outliers['Volume'].median(),
)
print(f"After handling outliers: Mean = {after_mean}, Median = {after_median}")


In [None]:
# Skewness of Volume before and after removing the flagged rows.
before_skewness, after_skewness = (
    frame['Volume'].skew() for frame in (data, data_no_outliers)
)
print(f"Before handling outliers: Skewness = {before_skewness}")
print(f"After handling outliers: Skewness = {after_skewness}")


In [None]:
# Kurtosis of Volume before and after removing the flagged rows.
volume_before = data['Volume']
volume_after = data_no_outliers['Volume']
before_kurtosis = volume_before.kurtosis()
after_kurtosis = volume_after.kurtosis()
print(f"Before handling outliers: Kurtosis = {before_kurtosis}")
print(f"After handling outliers: Kurtosis = {after_kurtosis}")



In [None]:
# Side-by-side histogram and boxplot of Volume once the flagged rows are gone.
outlier_fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 6))

sns.histplot(data_no_outliers['Volume'], kde=True, ax=hist_ax)
hist_ax.set_title('Volume Distribution After Removing Outliers')

sns.boxplot(data_no_outliers['Volume'], ax=box_ax)
box_ax.set_title('Volume Boxplot After Removing Outliers')

plt.show()

In [None]:
from scipy.stats import skew, kurtosis

# Reload AAPL from 2010 onwards; this rebinds the notebook-level `data`.
data = yf.download("AAPL", start="2010-01-01", end="2024-12-31")

# squeeze() turns the one-column selection into a Series.
volume = data[['Volume']].squeeze()

# Cap Volume at its 1st and 99th percentiles instead of dropping rows.
lower_cap = volume.quantile(0.01)
upper_cap = volume.quantile(0.99)
volume_capped = volume.clip(lower=lower_cap, upper=upper_cap)

# Report the same four statistics for the raw and the capped series.
for heading, sample in (
    ("Before handling outliers:", volume),
    ("\nAfter handling outliers:", volume_capped),
):
    print(heading)
    print("Mean =", sample.mean())
    print("Median =", sample.median())
    print("Skewness =", skew(sample))
    print("Kurtosis =", kurtosis(sample))


## Feature Engineering

In [None]:
# Reload the AAPL history (first 10,000 rows) and derive three indicators:
# daily return, 7-day rolling volatility and a 14-day RSI.
data = yf.download("AAPL", start="1980-12-12", end="2024-12-31")
data = data.iloc[:10000]

data['Daily_Return'] = data['Close'].pct_change()
data['Volatility'] = data['Daily_Return'].rolling(window=7).std()

# RSI from simple rolling means of gains and losses.
# clip() keeps the leading NaN produced by diff(), so the first RSI value is
# only emitted once 14 real price changes are available. Replacing that NaN
# with 0 would slip a fake "no change" day into the first window.
delta = data['Close'].diff()
gain = delta.clip(lower=0).rolling(window=14).mean()
loss = (-delta).clip(lower=0).rolling(window=14).mean()

rs = gain / loss
# A window without losses gives rs = inf and therefore RSI = 100.
data['RSI'] = 100 - (100 / (1 + rs))

fig, ax = plt.subplots(3, 1, figsize=(10, 12))

# Panel 1: daily returns.
ax[0].plot(data.index, data['Daily_Return'], label='Daily Return', color='blue')
ax[0].set_title('Daily Return')
ax[0].set_xlabel('Date')
ax[0].set_ylabel('Return')
ax[0].legend()

# Panel 2: rolling volatility.
ax[1].plot(data.index, data['Volatility'], label='Volatility (7-day rolling)', color='red')
ax[1].set_title('Volatility')
ax[1].set_xlabel('Date')
ax[1].set_ylabel('Volatility')
ax[1].legend()

# Panel 3: RSI with the 70 / 30 reference lines.
ax[2].plot(data.index, data['RSI'], label='RSI (14-day)', color='green')
ax[2].axhline(70, color='red', linestyle='--', label='Overbought (70)')
ax[2].axhline(30, color='blue', linestyle='--', label='Oversold (30)')
ax[2].set_title('Relative Strength Index (RSI)')
ax[2].set_xlabel('Date')
ax[2].set_ylabel('RSI')
ax[2].legend()

plt.tight_layout()
plt.show()


In [None]:
# Reload the full AAPL history and make sure the index is a DatetimeIndex.
data = yf.download("AAPL", start="1980-12-12", end="2024-12-31")
data.index = pd.to_datetime(data.index)

# Calendar features read straight off the index.
for feature, attribute in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('DayOfWeek', 'dayofweek'),
    ('Quarter', 'quarter'),
):
    data[feature] = getattr(data.index, attribute)

sns.set(style="whitegrid")

# One count plot per calendar feature: (column, figure size, title, rotate x ticks).
countplot_specs = [
    ('Year', (14, 5), 'Number of Trading Days per Year', True),
    ('Month', (10, 4), 'Number of Trading Days per Month', False),
    ('Day', (12, 4), 'Number of Trading Days per Calendar Day', False),
    ('DayOfWeek', (8, 4), 'Number of Trading Days per Day of Week (0=Mon, 6=Sun)', False),
    ('Quarter', (8, 4), 'Number of Trading Days per Quarter', False),
]
for column, size, title, rotate_ticks in countplot_specs:
    plt.figure(figsize=size)
    sns.countplot(x=column, data=data)
    plt.title(title)
    if rotate_ticks:
        plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()


In [None]:
# Show the column labels of `data`; the next cell addresses columns by tuple
# keys such as ('Close', 'AAPL').
print(data.columns)


In [None]:
# Tuple keys for the (field, ticker) column layout used in this cell.
close_col = ('Close', 'AAPL')
pct_col = ('Pct_Change_', '')
prev_col = ('Prev_Close_', '')
diff_col = ('Close_Change_', '')

# Force Close to numeric (unparseable values become NaN), then derive the
# day-over-day percentage change, the previous close and the absolute change.
data[close_col] = pd.to_numeric(data[close_col], errors='coerce')
data[pct_col] = data[close_col].pct_change()
data[prev_col] = data[close_col].shift(1)
data[diff_col] = data[close_col] - data[prev_col]

# Drop rows where any derived column is undefined (at least the first row).
data.dropna(subset=[pct_col, prev_col, diff_col], inplace=True)

# Daily percentage change together with a 5-day mean of the absolute change.
fig_change, ax_change = plt.subplots(figsize=(14, 6))
ax_change.plot(data.index, data[pct_col], label='Daily % Change')
ax_change.plot(
    data.index,
    data[diff_col].rolling(window=5).mean(),
    label='5-day Avg Close Change',
    color='orange',
)
ax_change.set(title='AAPL Daily % Change and Close Price Change', xlabel='Date', ylabel='Change')
ax_change.legend()
ax_change.grid(True)
fig_change.tight_layout()
plt.show()


# **Ridge**

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Build the model inputs from the closing price inside this cell.
# `data` as left by the preceding cells holds only price, calendar and
# price-change columns, so selecting the four indicators directly would raise
# KeyError on a fresh Restart & Run All.
close = data['Close'].squeeze()  # 1-D for flat or (field, ticker) column layouts
daily_return = close.pct_change()

# 14-day RSI from simple rolling means; clip() keeps the leading NaN of diff()
# so the first value is based on 14 real price changes.
delta = close.diff()
avg_gain = delta.clip(lower=0).rolling(window=14).mean()
avg_loss = (-delta).clip(lower=0).rolling(window=14).mean()

ridge_frame = pd.DataFrame({
    '50_Day_MA': close.rolling(window=50).mean(),
    '200_Day_MA': close.rolling(window=200).mean(),
    'RSI': 100 - (100 / (1 + avg_gain / avg_loss)),
    'Volatility': daily_return.rolling(window=7).std(),
    'Close': close,
}).dropna()  # removes the warm-up rows of the rolling windows

X = ridge_frame[['50_Day_MA', '200_Day_MA', 'RSI', 'Volatility']]
y = ridge_frame['Close']

# NOTE(review): this is a shuffled split on time-ordered rows, so test days are
# interleaved with training days. The Random Forest section uses shuffle=False;
# confirm which protocol is intended before comparing models.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse it for the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5-fold grid search over the Ridge penalty, scored by negative MSE.
ridge = Ridge()
param_grid = {'alpha': [0.1, 1, 10, 100, 1000]}
grid = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train_scaled, y_train)

best_ridge = grid.best_estimator_
y_pred_ridge = best_ridge.predict(X_test_scaled)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print(f" Ridge Regression - Best Alpha: {grid.best_params_['alpha']}")
print(f"R²: {r2_ridge:.4f}, RMSE: {np.sqrt(mse_ridge):.4f}")

# Degree-2 polynomial expansion of the scaled features, fitted on the training
# rows and applied unchanged to the test rows.
poly = PolynomialFeatures(degree=2)
X_train_poly, X_test_poly = (
    poly.fit_transform(X_train_scaled),
    poly.transform(X_test_scaled),
)

# Ordinary least squares on the expanded features.
poly_model = LinearRegression().fit(X_train_poly, y_train)
y_pred_poly = poly_model.predict(X_test_poly)

mse_poly, r2_poly = (
    mean_squared_error(y_test, y_pred_poly),
    r2_score(y_test, y_pred_poly),
)
print(" Polynomial Linear Regression (Degree=2)")
print(f"R²: {r2_poly:.4f}, RMSE: {np.sqrt(mse_poly):.4f}")

from sklearn.model_selection import cross_val_score

# Manual sweep over the Ridge penalty.
# Each alpha is scored by its mean 5-fold cross-validated R² on the TRAINING
# rows. Picking alpha by test-set R² would tune the model on the data later
# used to report its performance, which biases the final score upwards.
alphas = [0.1, 1, 10, 100, 1000, 10000]
best_alpha = None
best_r2 = -np.inf

print(" Manual Alpha Tuning:")
for alpha in alphas:
    model = Ridge(alpha=alpha)
    r2 = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2').mean()
    print(f"Alpha {alpha:<6} --> R²: {r2:.4f}")
    if r2 > best_r2:
        best_r2 = r2
        best_alpha = alpha

# Refit on the full training set with the selected alpha; the test set is
# touched exactly once, for the final report.
final_ridge = Ridge(alpha=best_alpha)
final_ridge.fit(X_train_scaled, y_train)
y_final_pred = final_ridge.predict(X_test_scaled)
final_mse = mean_squared_error(y_test, y_final_pred)

print(f" Final Ridge Model (Alpha={best_alpha})")
print(f"R²: {r2_score(y_test, y_final_pred):.4f}, RMSE: {np.sqrt(final_mse):.4f}")


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the grid-searched Ridge model on both splits to compare fit quality.
y_train_pred_ridge = best_ridge.predict(X_train_scaled)
y_test_pred_ridge = best_ridge.predict(X_test_scaled)

train_mse_ridge = mean_squared_error(y_train, y_train_pred_ridge)
test_mse_ridge = mean_squared_error(y_test, y_test_pred_ridge)
train_r2_ridge = r2_score(y_train, y_train_pred_ridge)
test_r2_ridge = r2_score(y_test, y_test_pred_ridge)

for heading, r2_value, mse_value in (
    ("Ridge Model - Training Set", train_r2_ridge, train_mse_ridge),
    ("Ridge Model - Test Set", test_r2_ridge, test_mse_ridge),
):
    print(heading)
    print(f"R²: {r2_value:.4f}, RMSE: {np.sqrt(mse_value):.4f}")





In [None]:
# Predictions of the polynomial model on both splits.
y_train_pred_poly = poly_model.predict(X_train_poly)
y_test_pred_poly = poly_model.predict(X_test_poly)

# Error and goodness-of-fit per split; the test figures are printed in the next cell.
train_mse_poly = mean_squared_error(y_train, y_train_pred_poly)
test_mse_poly = mean_squared_error(y_test, y_test_pred_poly)
train_r2_poly = r2_score(y_train, y_train_pred_poly)
test_r2_poly = r2_score(y_test, y_test_pred_poly)

print("Polynomial Linear Regression - Training Set")
print(f"R²: {train_r2_poly:.4f}, RMSE: {np.sqrt(train_mse_poly):.4f}")



In [None]:
# Test-split metrics of the polynomial model computed in the previous cell.
poly_test_rmse = np.sqrt(test_mse_poly)
print("Polynomial Linear Regression - Test Set")
print(f"R²: {test_r2_poly:.4f}, RMSE: {poly_test_rmse:.4f}")

# **XGBRegressor**

In [None]:
import yfinance as yf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

def load_data(ticker="AAPL", start="1980-12-15", end="2024-12-31"):
    """Download daily prices and attach the indicators used as model features.

    Parameters
    ----------
    ticker : str, default "AAPL"
        Symbol passed to ``yf.download``.
    start, end : str
        Date bounds passed to ``yf.download``. The defaults reproduce the
        range this notebook was written for.

    Returns
    -------
    pandas.DataFrame
        The downloaded frame plus ``Daily_Return``, ``50_Day_MA``,
        ``200_Day_MA``, ``RSI`` and ``Volatility``, with every row that
        contains a NaN removed (this drops the rolling-window warm-up rows).
    """
    data = yf.download(ticker, start=start, end=end)
    data['Daily_Return'] = data['Close'].pct_change()
    data['50_Day_MA'] = data['Close'].rolling(window=50).mean()
    data['200_Day_MA'] = data['Close'].rolling(window=200).mean()
    # compute_rsi is defined further down in this cell; it only has to exist
    # by the time load_data() is called.
    data['RSI'] = compute_rsi(data['Close'])
    data['Volatility'] = data['Daily_Return'].rolling(window=7).std()
    return data.dropna()

def compute_rsi(series, window=14):
    """Relative Strength Index based on simple rolling means of gains and losses.

    Parameters
    ----------
    series : pandas.Series or pandas.DataFrame
        Price observations in chronological order.
    window : int, default 14
        Number of price changes averaged for each RSI value.

    Returns
    -------
    Same type and index as ``series``.
        RSI on the 0-100 scale. The first ``window`` entries are NaN because
        ``window`` price changes need ``window + 1`` prices. A window without
        any loss yields 100; a window without any price movement yields NaN
        (0 / 0).
    """
    delta = series.diff()
    # clip() preserves the leading NaN from diff(). Mapping that NaN to 0 would
    # add a fake "no change" observation to the first window and emit an RSI
    # value one row too early.
    gain = delta.clip(lower=0).rolling(window=window).mean()
    loss = (-delta).clip(lower=0).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

# Fresh feature frame from the helper defined above.
data = load_data()

feature_columns = ['50_Day_MA', '200_Day_MA', 'RSI', 'Volatility']
X, y = data[feature_columns], data['Close']

# NOTE(review): shuffled split on time-ordered rows; the Random Forest section
# uses shuffle=False. Confirm the intended evaluation protocol.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale with statistics from the training rows only.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gradient-boosted trees with row and column subsampling.
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train_scaled, y_train)

y_pred = xgb_model.predict(X_test_scaled)

# Test-set metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
for metric_name, metric_value in (("MSE", mse), ("R²", r2), ("RMSE", np.sqrt(mse))):
    print(f"XGBoost - {metric_name}:", metric_value)


In [None]:
# Training-set metrics for the XGBoost model, to compare against the test scores.
y_train_pred = xgb_model.predict(X_train_scaled)
train_mse_xgb = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse_xgb)
train_r2 = r2_score(y_train, y_train_pred)

for metric_name, metric_value in (("R²", train_r2), ("RMSE", train_rmse)):
    print(f"Train {metric_name}:", metric_value)


# **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np


rf_feature_columns = ['50_Day_MA', '200_Day_MA', 'RSI', 'Volatility']
X, y = data[rf_feature_columns], data['Close']

# shuffle=False keeps the row order, so the final 20% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Scale with statistics from the training rows only.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Shallow forest with minimum split / leaf sizes.
rf_params = dict(
    n_estimators=100,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train_scaled, y_train)

y_pred_rf = rf_model.predict(X_test_scaled)
y_train_pred_rf = rf_model.predict(X_train_scaled)

# Metrics on the held-out rows ...
test_mse = mean_squared_error(y_test, y_pred_rf)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_pred_rf)

# ... and on the rows the forest was fitted on.
train_mse = mean_squared_error(y_train, y_train_pred_rf)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred_rf)

for heading, r2_value, rmse_value in (
    (" Random Forest - Test Set", test_r2, test_rmse),
    (" Random Forest - Train Set", train_r2, train_rmse),
):
    print(heading)
    print("R²:", r2_value)
    print("RMSE:", rmse_value)


# **SVM**


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Inputs and target for the SVR section.
features = ['50_Day_MA', '200_Day_MA', 'RSI', 'Volatility']
target = 'Close'
X, y = data[features], data[target]

# NOTE(review): shuffled split on time-ordered rows; the Random Forest section
# uses shuffle=False. Confirm the intended evaluation protocol.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the min/max from the training rows, then apply it to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# RBF-kernel support vector regressor; the fitted estimator is the cell output.
svr_params = {'kernel': 'rbf', 'C': 100, 'epsilon': 0.1}
svm_model = SVR(**svr_params)
svm_model.fit(X_train_scaled, y_train)


In [None]:
# NOTE(review): this cell builds and fits the same SVR as the cell directly
# above it; it looks like an accidental duplicate. Confirm and drop one.
svm_model = SVR(kernel='rbf', C=100, epsilon=0.1).fit(X_train_scaled, y_train)
svm_model


In [None]:
# SVR predictions on both splits.
y_pred_test, y_pred_train = (
    svm_model.predict(X_test_scaled),
    svm_model.predict(X_train_scaled),
)

# RMSE is taken as the square root of the mean squared error.
test_r2, test_rmse = (
    r2_score(y_test, y_pred_test),
    mean_squared_error(y_test, y_pred_test) ** 0.5,
)
train_r2, train_rmse = (
    r2_score(y_train, y_pred_train),
    mean_squared_error(y_train, y_pred_train) ** 0.5,
)

# Display results
for heading, r2_value, rmse_value in (
    ("SVR - Test Set", test_r2, test_rmse),
    (" SVR - Train Set", train_r2, train_rmse),
):
    print(heading)
    print(f"R²: {r2_value}")
    print(f"RMSE: {rmse_value}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Flatten both vectors to 1-D so they can be plotted against each other.
y_true = np.ravel(y_test)
y_pred = np.ravel(y_pred_test)

# Predicted vs. actual prices; the dashed diagonal marks perfect predictions.
diag_lo, diag_hi = min(y_true), max(y_true)
scatter_fig, scatter_ax = plt.subplots(figsize=(7, 7))
scatter_ax.scatter(y_true, y_pred, alpha=0.6, color='dodgerblue')
scatter_ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'r--')
scatter_ax.set(
    title='SVR: Actual vs Predicted (Scatter Plot)',
    xlabel='Actual Close Prices',
    ylabel='Predicted Close Prices',
)
scatter_ax.grid(True)
scatter_fig.tight_layout()
plt.show()


In [None]:
# Residual = actual minus predicted for every test sample.
residuals = y_true - y_pred

# Residuals in test-set order around a zero reference line. The title and the
# x-axis label are set so the figure can be read on its own.
plt.figure(figsize=(10, 5))
plt.scatter(range(len(residuals)), residuals, alpha=0.6)
plt.hlines(0, xmin=0, xmax=len(residuals), colors='red', linestyles='--')
plt.title('SVR: Residuals (Actual - Predicted)')
plt.xlabel('Test Sample Index')
plt.ylabel('Residuals')
plt.grid(True)
plt.tight_layout()
plt.show()
