In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
import xgboost as xgb
import warnings 
warnings.filterwarnings("ignore")


import os
import random
import tensorflow as tf

# Single source of truth for the random seed, so every library below (and any
# later cell that needs a seed) agrees on the same value.
SEED = 42

# 1. Python's built-in `random` module
random.seed(SEED)

# 2. NumPy's global random state
np.random.seed(SEED)

# 3. TensorFlow's global seed
tf.random.set_seed(SEED)

In [3]:
# Load the raw price history.
# NOTE(review): absolute, machine-specific path — consider a project-relative
# path so the notebook runs on other machines.
try:
    raw_df = pd.read_csv(r'D:\Cloudly IO\Tesla-Forcasting-Project\data\raw\Tasla_Stock_Updated_V2.csv')
except FileNotFoundError:
    # Keep the friendly hint, but re-raise: swallowing the error would leave
    # `raw_df` undefined and make later cells fail with an unrelated NameError.
    print("Error: 'Tasla_Stock_Updated_V2.csv' not found.")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise

# Independent snapshot of the data as loaded. A plain `back_up = raw_df` would
# only create a second name for the same object, not a backup.
back_up = raw_df.copy()


In [4]:
# Display the loaded frame; the rendered table is truncated to the first and last rows.
raw_df

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume
0,0,2015-01-02,14.858000,14.883333,14.217333,14.620667,71466000
1,1,2015-01-05,14.303333,14.433333,13.810667,14.006000,80527500
2,2,2015-01-06,14.004000,14.280000,13.614000,14.085333,93928500
3,3,2015-01-07,14.223333,14.318667,13.985333,14.063333,44526000
4,4,2015-01-08,14.187333,14.253333,14.000667,14.041333,51637500
...,...,...,...,...,...,...,...
2269,2269,2024-01-09,238.110001,238.960007,232.039993,234.960007,96705700
2270,2270,2024-01-10,235.100006,235.500000,231.289993,233.940002,91628500
2271,2271,2024-01-11,230.570007,230.929993,225.369995,227.220001,105873600
2272,2272,2024-01-12,220.080002,225.339996,217.149994,218.889999,122889000


In [5]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
raw_df.info() # alternatively, raw_df.dtypes lists only the dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2274 entries, 0 to 2273
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2274 non-null   int64  
 1   Date        2274 non-null   object 
 2   Open        2274 non-null   float64
 3   High        2274 non-null   float64
 4   Low         2274 non-null   float64
 5   Close       2274 non-null   float64
 6   Volume      2274 non-null   int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 124.5+ KB


In [6]:
# Number of missing values in each column
print(raw_df.isna().sum())

Unnamed: 0    0
Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
dtype: int64


In [7]:
# Rows that are exact repeats of an earlier row
num_duplicates = raw_df.duplicated().sum()

# Column names grouped by dtype family: object (text) vs. numeric
categorical_cols = list(raw_df.select_dtypes(include='object').columns)
numerical_cols = list(raw_df.select_dtypes(include='number').columns)

print(f"\n🔹 Number of duplicate rows: {num_duplicates}")
print("\n🔹 Categorical columns:", categorical_cols)
print("🔹 Numerical columns:", numerical_cols)


🔹 Number of duplicate rows: 0

🔹 Categorical columns: ['Date']
🔹 Numerical columns: ['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Volume']


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
raw_df.describe()

Unnamed: 0.1,Unnamed: 0,Open,High,Low,Close,Volume
count,2274.0,2274.0,2274.0,2274.0,2274.0,2274.0
mean,1136.5,103.49373,105.771617,101.036327,103.461794,114208900.0
std,656.591578,111.136174,113.606095,108.399966,111.032019,76884300.0
min,0.0,9.488,10.331333,9.403333,9.578,10620000.0
25%,568.25,16.669833,16.933332,16.468833,16.693666,66377620.0
50%,1136.5,23.221334,23.554,22.842334,23.197333,93598550.0
75%,1704.75,215.272503,219.629173,208.983334,215.227497,133821400.0
max,2273.0,411.470001,414.496674,405.666656,409.970001,914082000.0


In [9]:
def clean_data(ddf):
    """Return a cleaned, date-indexed copy of a raw price table.

    Steps, in order:
      1. drop every column whose name starts with ``Unnamed``;
      2. parse the ``Date`` column with ``pd.to_datetime``;
      3. make ``Date`` the index;
      4. drop rows that contain any missing value.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    ddf : pandas.DataFrame
        Frame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by ``Date`` with no ``Unnamed*`` columns and no
        rows containing NaN.

    Raises
    ------
    KeyError
        If ``ddf`` has no ``Date`` column.
    """
    # Explicit copy: assigning into the result of a .loc column selection is a
    # chained assignment and raises SettingWithCopyWarning, which this notebook
    # hides through warnings.filterwarnings("ignore").
    keep_mask = ~ddf.columns.str.contains('^Unnamed')
    cleaned = ddf.loc[:, keep_mask].copy()

    # Convert 'Date' to datetime and use it as the index
    cleaned['Date'] = pd.to_datetime(cleaned['Date'])
    cleaned = cleaned.set_index('Date')

    # Handle missing values if any
    return cleaned.dropna()


In [10]:
# Cleaned, Date-indexed version of the raw data.
clean_df = clean_data(raw_df)

In [11]:
def feature_engineering(ddf):
    """Return a copy of ``ddf`` with price-derived features and a target.

    Columns added, all computed from ``Close``:
      * ``Daily_Return`` - percentage change from the previous row;
      * ``MA5`` / ``MA10`` - rolling means over 5 and 10 rows;
      * ``Volatility``   - rolling standard deviation over 5 rows;
      * ``Target``       - the next row's ``Close`` (``shift(-1)``).

    Rows containing any NaN are then dropped. This removes the leading rows
    where the rolling windows are not yet full and the last row, which has
    no following close to use as a target.

    The input frame is not modified; callers must use the returned frame.

    Parameters
    ----------
    ddf : pandas.DataFrame
        Frame with a numeric ``Close`` column, ordered in time.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns plus the five columns above.

    Raises
    ------
    KeyError
        If ``ddf`` has no ``Close`` column.
    """
    # Work on a copy so a caller passing a slice of a larger frame neither
    # triggers chained-assignment warnings nor has its data changed in place.
    features = ddf.copy()
    close = features['Close']

    # Daily returns
    features['Daily_Return'] = close.pct_change()

    # Moving averages
    features['MA5'] = close.rolling(window=5).mean()
    features['MA10'] = close.rolling(window=10).mean()

    # Volatility (rolling standard deviation)
    features['Volatility'] = close.rolling(window=5).std()

    # Prediction target: the following row's close
    features['Target'] = close.shift(-1)

    # Drop rows with NaNs (window warm-up rows and the final, target-less row)
    return features.dropna()


In [12]:
# Positional 80/20 split without shuffling: the first 80% of rows become the
# training set and the remaining rows the test set (assuming the rows are in
# date order, the test period follows the training period).
TRAIN_FRACTION = 0.8
split_idx = int(len(clean_df) * TRAIN_FRACTION)

# .copy() detaches each part from clean_df, so feature engineering can never
# write into (or warn about writing into) a slice of the cleaned frame.
train_df = clean_df.iloc[:split_idx].copy()
test_df = clean_df.iloc[split_idx:].copy()

# Features are computed separately per split, so rolling windows and the
# shifted target never reach across the train/test boundary.
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

#Saving train and test data
# NOTE(review): absolute, machine-specific paths — consider project-relative ones.
train_path = r'D:\Cloudly IO\Tesla-Forcasting-Project\data\preprocessed\Tasla_Stock_Updated_train.csv'
test_path = r'D:\Cloudly IO\Tesla-Forcasting-Project\data\preprocessed\Tasla_Stock_Updated_test.csv'

# Create the output folder if it is missing; to_csv does not create directories.
for out_path in (train_path, test_path):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

train_df.to_csv(train_path)
test_df.to_csv(test_path)
