In [5]:
import numpy as np
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ta

In [23]:
# Load the dataset
def get_X_and_y(include_date=False,filename="nabil.csv"):
    #################################
    ################################33
    #X will have features
                        #[['Open', 'High', 'Low', 'Volume', 'MA_10', 
                        #'Bollinger_Middle', 'Bollinger_Upper', 'Bollinger_Lower', 'MACD', 'Signal_Line', 'ADX']]
    ##############################
    data = pd.read_csv('nabil.csv')

    # Convert 'Date' column to datetime
    data['Date'] = pd.to_datetime(data['Date'])
    
    # Sort the DataFrame by 'Date' in ascending order
    data.sort_values(by='Date', ascending=True, inplace=True)
    
    # Set the 'Date' column as the index with integer values
    data.set_index(data.index, inplace=True)
    
    
    ##data cleaning
    
    data['Percent Change'] = pd.to_numeric(data['Percent Change'].str.replace('%', '', regex=False), errors='coerce')
    #Find rows with NaN values in 'Percent Change' column
    rows_with_nan = data[data['Percent Change'].isna()]
    
    # Calculate and replace NaN values with the formula=%change=(old_closng-new_closing/old_closing) mentioned
    first=False
    for index, row in rows_with_nan.iterrows():
        if  first is False:
            first=True
            continue
        prev_row = data.loc[index - 1]
        data.at[index, 'Percent Change'] = (row['Close'] - prev_row['Close']) / row['Close']

    
    # Clean 'Volume' column (remove commas and convert to float)
    data['Volume'] = data['Volume'].str.replace(',', '').astype(float)
    #1        #tomorrow going to raise?
    #2        # nuber of holidays before today price(heiristics, sunday is usually more raise due to saturday)
    #3        #technical indicators=[moving average, RSI, MACD, bollinger band, average directional index(ADX)]
    
    # Shift the 'Close' column one row down to get tomorrow's price
    data['Next_Day_Close'] = data['Close'].shift(-1)
    
    # Create a new column 'Is_Tomorrow_High' bon
    data['Is_Tomorrow_High'] = (data['Next_Day_Close'] > data['Close']).astype(int)
    
    # Drop the 'Next_Day_Close' column if you no longer need it
    data.drop('Next_Day_Close', axis=1, inplace=True)
    
    # Calculate the number of days between current row and previous row's date, excluding weekends
    data['Days_Before_Holidays'] = (data['Date'] - data['Date'].shift(1)).dt.days-1
    
    # Fill potential NaN values in the first row with 0
    data['Days_Before_Holidays'].fillna(0, inplace=True)
    
    # Calculate Moving Averages (e.g., 10-day and 50-day)
    data['MA_10'] = ta.trend.sma_indicator(data['Close'], window=10)
    
    # Calculate Bollinger Bands
    bollinger = ta.volatility.BollingerBands(data['Close'])
    data['Bollinger_Middle'] = bollinger.bollinger_mavg()
    data['Bollinger_Upper'] = bollinger.bollinger_hband()
    data['Bollinger_Lower'] = bollinger.bollinger_lband()
    
    # Calculate MACD
    macd = ta.trend.MACD(data['Close'])
    data['MACD'] = macd.macd()
    data['Signal_Line'] = macd.macd_signal()
    
    # Calculate Average Directional Index (ADX)
    data['ADX'] = ta.trend.ADXIndicator(data['High'], data['Low'], data['Close']).adx()
    data['ADX']=data['ADX'].replace(0, np.nan)
    # Find the first non-NaN value in each column
    first_non_nan = data.ffill().iloc[-1]
    
    # Replace NaN values in columns 0 to i-1 with the first non-NaN value in the corresponding column
    for col in range(data.shape[1]):
        data.iloc[:, col] = data.iloc[:, col].fillna(first_non_nan[col])
    data.isnull().sum()
    
    from sklearn.model_selection import train_test_split
    # Define the columns for X (features) and y (target)
    if include_date:
        X_columns = ['Date','Open', 'High', 'Low', 'Volume', 'MA_10', 'Bollinger_Middle', 'Bollinger_Upper', 'Bollinger_Lower', 'MACD', 'Signal_Line', 'ADX']
    else:
        X_columns = ['Open', 'High', 'Low', 'Volume', 'MA_10', 'Bollinger_Middle', 'Bollinger_Upper', 'Bollinger_Lower', 'MACD', 'Signal_Line', 'ADX']
    
    y_column = 'Close'
    
    # Create X (features) and y (target)
    X = data[X_columns]
    y = data[y_column]
    
    return X,y
    


In [24]:
X,y=get_X_and_y(include_date=True)
X.head(40)

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


Unnamed: 0,Date,Open,High,Low,Volume,MA_10,Bollinger_Middle,Bollinger_Upper,Bollinger_Lower,MACD,Signal_Line,ADX
2205,2014-01-01,2210.0,2180.0,2142.0,433.0,591.93,598.055,613.631197,582.478803,-4.659392,-3.511786,13.689786
2204,2014-01-02,2151.0,2150.0,2090.0,10110.0,591.93,598.055,613.631197,582.478803,-4.659392,-3.511786,13.689786
2203,2014-01-05,2125.0,2125.0,2100.0,5113.0,591.93,598.055,613.631197,582.478803,-4.659392,-3.511786,13.689786
2202,2014-01-06,2125.0,2118.0,2095.0,3381.0,591.93,598.055,613.631197,582.478803,-4.659392,-3.511786,13.689786
2201,2014-01-07,2110.0,2135.0,2070.0,4731.0,591.93,598.055,613.631197,582.478803,-4.659392,-3.511786,13.689786
2200,2014-01-08,2126.0,2180.0,2100.0,2502.0,591.93,598.055,613.631197,582.478803,-4.659392,-3.511786,13.689786
2199,2014-01-09,2168.0,2172.0,2125.0,1115.0,591.93,598.055,613.631197,582.478803,-4.659392,-3.511786,13.689786
2198,2014-01-12,2172.0,2225.0,2105.0,1788.0,591.93,598.055,613.631197,582.478803,-4.659392,-3.511786,13.689786
2197,2014-01-13,2225.0,2260.0,2225.0,1385.0,591.93,598.055,613.631197,582.478803,-4.659392,-3.511786,13.689786
2196,2014-01-14,2250.0,2250.0,2220.0,287.0,2167.2,598.055,613.631197,582.478803,-4.659392,-3.511786,13.689786
