In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file
# found, so the available datasets are visible in the cell output.
input_root = '/kaggle/input'
for folder, _subfolders, file_names in os.walk(input_root):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw price history CSV from the Kaggle input dataset.
csv_path = '/kaggle/input/nvidia2/NVDIA.csv'
data = pd.read_csv(csv_path)

In [None]:
# Convert the 'Date' column from YYYY-MM-DD strings to datetime64 values so
# that date comparisons, sorting and the .dt accessor work downstream.
date_strings = data['Date']
data['Date'] = pd.to_datetime(date_strings, format='%Y-%m-%d')


In [None]:
# Keep only the rows dated from 2018-01-01 through 2022-12-31 (both bounds
# inclusive, which is the default for Series.between).
in_study_window = data['Date'].between('2018-01-01', '2022-12-31')
filtered_data = data[in_study_window]

In [None]:
# Sort data by Date so every shift/rolling operation below runs in
# chronological order.
filtered_data = filtered_data.sort_values('Date')

# The model's target is the same-row 'Close'. Any feature computed from that
# same-row close leaks the answer into the inputs (for example
# Lag_1 * (1 + same-day Return_1D) reproduces 'Close' exactly). All
# price-derived features are therefore built from the previous row's close,
# i.e. only from prices that were already known before the day being predicted.
prev_close = filtered_data['Close'].shift(1)

# Feature Engineering
# Moving averages over the 10 and 50 closes that precede the current row
filtered_data['MA_10'] = prev_close.rolling(window=10).mean()
filtered_data['MA_50'] = prev_close.rolling(window=50).mean()

# Lag features: the close observed `lag` rows earlier
for lag in (1, 2, 3, 5, 10, 20):
    filtered_data[f'Lag_{lag}'] = filtered_data['Close'].shift(lag)

# Calendar features taken from the row's own date (known in advance, no leak)
filtered_data['DayOfWeek'] = filtered_data['Date'].dt.dayofweek
filtered_data['Month'] = filtered_data['Date'].dt.month
filtered_data['DayOfYear'] = filtered_data['Date'].dt.dayofyear

# Return / volatility features, all ending at the previous row's close
filtered_data['Return_1D'] = prev_close.pct_change()
filtered_data['Return_5D'] = prev_close.pct_change(5)
filtered_data['Return_10D'] = prev_close.pct_change(10)

# Rolling means of the (already lagged) one-row return
filtered_data['MA_Return_5D'] = filtered_data['Return_1D'].rolling(window=5).mean()
filtered_data['MA_Return_10D'] = filtered_data['Return_1D'].rolling(window=10).mean()
filtered_data['MA_Return_20D'] = filtered_data['Return_1D'].rolling(window=20).mean()




In [None]:
# Remove every row that has a missing value in any column. The shift and
# rolling features leave NaN in the leading rows, where not enough history
# exists yet.
filtered_data = filtered_data.dropna(axis='index', how='any')


In [None]:
# Columns used as model inputs; the target is the same-row closing price.
feature_columns = [
    'Lag_1', 'Lag_2', 'Lag_3', 'Lag_5', 'Lag_10', 'Lag_20',
    'MA_10', 'MA_50',
    'DayOfWeek', 'Month', 'DayOfYear',
    'Return_1D', 'Return_5D', 'Return_10D',
    'MA_Return_5D', 'MA_Return_10D', 'MA_Return_20D',
]
X = filtered_data[feature_columns]
y = filtered_data['Close']

# Scale the features
# The scaler's mean and variance are learned from the training rows only, so
# statistics of the held-out rows never influence preprocessing.
# train_test_split(test_size=0.2, shuffle=False) keeps the first
# n - ceil(0.2 * n) rows for training; the same count is computed here.
# Keep this fraction in sync with the test_size passed to train_test_split.
n_train_rows = len(X) - int(np.ceil(0.2 * len(X)))
scaler = StandardScaler()
scaler.fit(X.iloc[:n_train_rows])

# Apply the training-set scaling to every row; the later split slices this array.
X_scaled = scaler.transform(X)

In [None]:
# Chronological hold-out: with shuffle=False the final 20% of rows become the
# test set and everything before them is the training set.
split_parts = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Candidate hyper-parameters; the grid search tries every combination.
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Time-ordered cross-validation: each fold validates on rows that come after
# the rows it trained on.
tscv = TimeSeriesSplit(n_splits=5)

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
)

# Tune on the last 500 training data points only.
recent_window = slice(-500, None)
X_train_recent = X_train[recent_window]
y_train_recent = y_train[recent_window]
grid_search.fit(X_train_recent, y_train_recent)

# Best model: GridSearchCV refits the winning configuration by default, here
# on the same 500-row window it was tuned on.
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

In [None]:
# Error of the test-set predictions: root mean squared error and mean
# absolute error, both in the same units as the closing price.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f'RMSE: {rmse}', f'MAE: {mae}', sep='\n')


In [None]:
# Plot actual vs predicted closing prices over the test period.
# The test rows are the final len(y_test) rows of filtered_data, so their
# dates are taken from the tail of the 'Date' column.
test_dates = filtered_data['Date'].iloc[-len(y_test):]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(test_dates, y_test, label='Actual Close Price', color='blue')
ax.plot(test_dates, y_pred, label='Predicted Close Price', color='orange')
ax.set_title('Actual vs Predicted Nvidia Stock Price')
ax.set_xlabel('Date')
ax.set_ylabel('Stock Price')
ax.legend()
plt.show()

In [None]:
# Feature importance graph: one horizontal bar per input column, sized by the
# fitted forest's feature_importances_ value for that column.
importance = best_model.feature_importances_
features = X.columns

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=importance, y=features, ax=ax)
ax.set_title('Feature Importance')
plt.show()