<a href="https://colab.research.google.com/github/Deyonrose/S5_PredictiveAnalysis/blob/main/2348513_DEYON_PA_TTIMESERIES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

In [10]:
# Read the price history with 'Date' parsed as datetimes, then make it the
# index and order the rows by that index.
df = (
    pd.read_csv('TATAMOTORS.csv', parse_dates=['Date'])
    .set_index('Date')
    .sort_index()
)


In [3]:
# Preview the first rows of the loaded frame.
# NOTE(review): the saved output below still lists 'Date' as an ordinary column,
# although the load cell moves it into the index; together with the out-of-order
# execution counts this suggests the output is stale. Re-run after the load cell.
df.head()

Unnamed: 0,Date,Symbol,Series,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
0,2000-01-03,TELCO,EQ,201.6,207.4,217.25,207.4,217.0,216.75,214.28,676126,14487750000000.0,,,
1,2000-01-04,TELCO,EQ,216.75,217.0,219.0,206.0,211.9,208.2,209.5,679215,14229620000000.0,,,
2,2000-01-05,TELCO,EQ,208.2,194.0,217.8,194.0,213.1,213.25,210.33,1120951,23576840000000.0,,,
3,2000-01-06,TELCO,EQ,213.25,215.0,229.9,215.0,222.0,222.1,225.29,1968998,44359320000000.0,,,
4,2000-01-07,TELCO,EQ,222.1,224.0,239.9,223.1,239.9,239.9,236.32,2199431,51976360000000.0,,,


In [4]:
# Summarise column dtypes and non-null counts.
# NOTE(review): the saved output below reports a RangeIndex and an object-typed
# 'Date' column, which does not match the load cell (parse_dates + set_index);
# presumably it predates that cell. Re-run to refresh.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5306 entries, 0 to 5305
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                5306 non-null   object 
 1   Symbol              5306 non-null   object 
 2   Series              5306 non-null   object 
 3   Prev Close          5306 non-null   float64
 4   Open                5306 non-null   float64
 5   High                5306 non-null   float64
 6   Low                 5306 non-null   float64
 7   Last                5306 non-null   float64
 8   Close               5306 non-null   float64
 9   VWAP                5306 non-null   float64
 10  Volume              5306 non-null   int64  
 11  Turnover            5306 non-null   float64
 12  Trades              2456 non-null   float64
 13  Deliverable Volume  4792 non-null   float64
 14  %Deliverble         4792 non-null   float64
dtypes: float64(11), int64(1), object(3)
memory usage: 621.9

In [5]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
Date,0
Symbol,0
Series,0
Prev Close,0
Open,0
High,0
Low,0
Last,0
Close,0
VWAP,0


In [8]:
# Define a custom function for each preprocessing step
def smooth_close(series, window=7):
    """Return the trailing rolling mean of ``series``.

    Parameters
    ----------
    series : pandas.Series
        Values to smooth.
    window : int, default 7
        Number of consecutive observations averaged for each output point.

    Returns
    -------
    pandas.Series
        Same index as ``series``; the first ``window - 1`` entries are NaN
        because a full window is not yet available there.
    """
    trailing_window = series.rolling(window=window)
    return trailing_window.mean()

def detect_outliers(series):
    """Replace IQR outliers in ``series`` with NaN.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to screen.

    Returns
    -------
    numpy.ndarray
        Array of the same length as ``series`` with outliers set to NaN and
        all other values kept. Note that the index of ``series`` is not
        carried over, because the result comes from ``np.where``.
    """
    q1, q3 = (series.quantile(q) for q in (0.25, 0.75))
    spread = q3 - q1
    fence_low = q1 - 1.5 * spread
    fence_high = q3 + 1.5 * spread
    is_outlier = (series < fence_low) | (series > fence_high)
    return np.where(is_outlier, np.nan, series)

def fill_missing_values(series, method='ffill'):
    """Fill missing values in ``series`` by propagating neighbouring observations.

    ``Series.fillna(method=...)`` is deprecated in pandas (it emits a
    FutureWarning and is removed in later releases), so this dispatches to the
    dedicated ``ffill`` / ``bfill`` methods instead.

    Parameters
    ----------
    series : pandas.Series
        Values that may contain NaN.
    method : {'ffill', 'pad', 'bfill', 'backfill'}, default 'ffill'
        'ffill'/'pad' carry the last valid observation forward;
        'bfill'/'backfill' use the next valid observation.

    Returns
    -------
    pandas.Series
        A new series; ``series`` itself is not modified. Leading NaNs remain
        after a forward fill (and trailing NaNs after a backward fill) because
        there is no observation to propagate.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method in ('ffill', 'pad'):
        return series.ffill()
    if method in ('bfill', 'backfill'):
        return series.bfill()
    raise ValueError(
        f"Invalid fill method {method!r}; expected 'ffill', 'pad', 'bfill' or 'backfill'."
    )

# Define the pipeline steps.
# Every Pipeline step must be a transformer/estimator object; a bare
# ('column', lambda) tuple is not one and is rejected when the pipeline is
# validated. Each feature step is therefore wrapped in a FunctionTransformer
# that returns a copy of the incoming frame with one derived column added, so
# the steps can be chained on the same DataFrame.
def _add_column(name, compute):
    """Return a transformer that adds column ``name`` = ``compute(frame)`` to a copy of the frame."""
    return FunctionTransformer(lambda frame: frame.assign(**{name: compute(frame)}))


preprocessing_pipeline = Pipeline([
    # Smooth 'Close' column
    ('smooth_close', _add_column('Close_smooth', lambda frame: smooth_close(frame['Close']))),
    # Remove outliers in 'Volume'
    ('outlier_removal_volume', _add_column('Volume_clean', lambda frame: detect_outliers(frame['Volume']))),
    # Fill missing 'Trades' column
    ('fill_missing_trades', _add_column(
        'Trades_filled', lambda frame: fill_missing_values(frame['Trades'], method='ffill'))),
    # StandardScaler only accepts numeric input, so drop non-numeric columns
    # (e.g. 'Symbol', 'Series') before scaling.
    ('select_numeric', FunctionTransformer(lambda frame: frame.select_dtypes(include='number'))),
    # Scale the data (all remaining numeric columns)
    ('scaling', StandardScaler())
])

# Derive the cleaned feature columns directly with the helper functions.
df['Close_smooth'] = smooth_close(df['Close'])            # rolling mean, default 7-row window
df['Volume_clean'] = detect_outliers(df['Volume'])        # IQR outliers become NaN
df['Trades_filled'] = fill_missing_values(df['Trades'])   # default forward fill

# Z-score normalise the price and volume features, overwriting them in place.
scaled_columns = ['Open', 'High', 'Low', 'Close_smooth', 'Volume_clean']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[scaled_columns])
df[scaled_columns] = scaled_values

# Show the first ten rows of the processed features.
preview_columns = scaled_columns + ['Trades_filled']
print(df[preview_columns].head(10))

                Open      High       Low  Close_smooth  Volume_clean  \
Date                                                                   
2000-01-03 -0.742845 -0.721580 -0.726781           NaN     -0.984748   
2000-01-04 -0.707673 -0.715263 -0.732005           NaN     -0.983988   
2000-01-05 -0.791940 -0.719595 -0.776780           NaN     -0.875203   
2000-01-06 -0.715000 -0.675911 -0.698423           NaN     -0.666357   
2000-01-07 -0.682026 -0.639809 -0.668199           NaN     -0.609608   
2000-01-10 -0.553793 -0.570493 -0.575290           NaN     -0.727543   
2000-01-11 -0.550129 -0.567244 -0.612043     -0.666879     -0.853786   
2000-01-12 -0.623405 -0.617787 -0.646185     -0.654541     -0.920115   
2000-01-13 -0.612414 -0.621397 -0.636483     -0.640233     -1.011929   
2000-01-14 -0.641908 -0.628618 -0.631259     -0.626687     -0.898900   

            Trades_filled  
Date                       
2000-01-03            NaN  
2000-01-04            NaN  
2000-01-05            N

  return series.fillna(method=method)
