# NUMPY PROJECT

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

# Data Collection (from CSV)
def get_stock_data(file_path):
    """Load price history from a CSV file into a date-indexed DataFrame.

    The 'Date' column becomes the index and is parsed as dates, with
    ambiguous values read day-first. The first rows are printed so the
    load can be inspected.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        The loaded DataFrame, indexed by 'Date'.
    """
    read_options = {'index_col': 'Date', 'parse_dates': True, 'dayfirst': True}
    frame = pd.read_csv(file_path, **read_options)
    print("Data after loading from CSV:\n", frame.head())
    return frame

# Data Preprocessing
def preprocess_data(data):
    """Drop incomplete rows and min-max scale every column to [0, 1].

    Args:
        data: DataFrame of numeric columns (one row per date).

    Returns:
        A ``(scaled_data, scaler)`` tuple. ``scaled_data`` has the same
        columns and index as the cleaned input; ``scaler`` is the fitted
        ``MinMaxScaler``, needed later to map values back to price units.

    Raises:
        ValueError: If no rows remain after dropping missing values.
    """
    # Handle missing values
    data = data.dropna()
    print("Data after dropping NA:\n", data.head())

    if data.empty:
        # Fail with a clear message instead of an opaque error from the scaler.
        raise ValueError("No rows left to scale after dropping missing values.")

    # Normalize data. The scaler is fitted on every row passed in, so the
    # column minima/maxima come from the whole dataset.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_values = scaler.fit_transform(data)

    # Keep the original index: rebuilding the frame without it would replace
    # the dates with 0..n-1 and lose the link between rows and trading days.
    scaled_data = pd.DataFrame(scaled_values, columns=data.columns, index=data.index)
    print("Data after normalization:\n", scaled_data.head())

    return scaled_data, scaler

# Feature Engineering
def create_features(data):
    """Add 10- and 50-row moving averages of 'Close' and drop incomplete rows.

    The input frame is left untouched; a new frame is returned.

    Args:
        data: DataFrame containing a 'Close' column.

    Returns:
        A new DataFrame with added 'MA10' and 'MA50' columns, restricted to
        rows where every column is non-NaN. Because MA50 needs 50 rows of
        history, the result is empty when ``data`` has fewer than 50 rows.
    """
    close = data['Close']
    # assign() builds a new frame, so the caller's DataFrame is not mutated
    # (and no chained-assignment warning is raised when `data` is a slice).
    enriched = data.assign(
        MA10=close.rolling(window=10).mean(),
        MA50=close.rolling(window=50).mean(),
    )
    enriched = enriched.dropna()
    print("Data after feature engineering:\n", enriched.head())
    return enriched

# Model Selection and Training
def train_models(X_train, y_train, random_state=None):
    """Fit a linear model, a decision tree, and an MLP on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        random_state: Optional seed forwarded to the decision tree and the
            neural network so repeated runs give the same models. The
            default ``None`` keeps the original unseeded behaviour.

    Returns:
        Dict mapping a display name to the fitted estimator.
    """
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        "Neural Network": MLPRegressor(
            hidden_layer_sizes=(50, 50, 50),
            max_iter=500,
            random_state=random_state,
        ),
    }

    # fit() trains each estimator in place, so the dict already holds the
    # fitted objects; no reassignment is needed.
    for model in models.values():
        model.fit(X_train, y_train)

    return models

# Model Evaluation
def evaluate_models(models, X_test, y_test):
    """Score each fitted model on the test set.

    Args:
        models: Dict mapping a model name to a fitted estimator.
        X_test: Test feature matrix.
        y_test: True target values for ``X_test``.

    Returns:
        Dict mapping each model name to its mean squared error on the test set.
    """
    return {
        label: mean_squared_error(y_test, estimator.predict(X_test))
        for label, estimator in models.items()
    }

# Prediction and Visualization
def predict_and_visualize(models, X_test, y_test, scaler, original_data):
    """Plot each model's test-set predictions against the actual closing prices.

    Args:
        models: Dict mapping a model name to a fitted estimator.
        X_test: Scaled test feature matrix.
        y_test: Scaled test targets (unused; kept for interface compatibility).
        scaler: Fitted scaler used to map predictions back to price units.
            Assumed to have been fitted on the same columns, in the same
            order, as ``original_data``.
        original_data: Unscaled DataFrame with a 'Close' column.
    """
    # Predictions are scaled 'Close' values, so they must be un-scaled with
    # the 'Close' column's min/max. Putting them in the last column would
    # apply the parameters of whichever column happens to be last instead.
    close_idx = original_data.columns.get_loc('Close')
    n_columns = original_data.shape[1]

    plt.figure(figsize=(14, 7))
    for name, model in models.items():
        scaled_predictions = np.asarray(model.predict(X_test)).ravel()

        # inverse_transform needs a full-width matrix; the other columns are
        # zero-filled placeholders and discarded afterwards.
        padded = np.zeros((scaled_predictions.shape[0], n_columns))
        padded[:, close_idx] = scaled_predictions
        predictions = scaler.inverse_transform(padded)[:, close_idx]

        # The test set is the tail of the series, so align with the last dates.
        plt.plot(original_data.index[-len(predictions):], predictions, label=f'{name} Predictions')

    plt.plot(original_data['Close'], label='Actual Prices')
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.legend()
    plt.title('Stock Price Prediction')
    plt.show()

# Future Prediction
def future_prediction(models, latest_data, scaler, future_periods=7):
    """Predict the closing price for the days following the last known date.

    The feature row is built the same way main() builds the training
    features: scale the raw columns, compute MA10/MA50 on the scaled 'Close',
    then drop 'Close'. The most recent feature row is reused for every
    future date (a naive persistence forecast), because no future
    Open/High/Low/Volume values exist.

    Args:
        models: Dict mapping a model name to a fitted estimator.
        latest_data: Unscaled, date-indexed DataFrame with the columns the
            scaler was fitted on (including 'Close').
        scaler: Fitted scaler exposing ``transform`` and ``inverse_transform``.
        future_periods: Number of calendar days to forecast.

    Returns:
        ``(future_dates, future_predictions)`` where ``future_predictions``
        maps each model name to an array of prices, one per future date.

    Raises:
        ValueError: If fewer than 50 complete rows are available, since the
            50-row moving average cannot be computed.
    """
    # Generate future dates for prediction (calendar days after the last row).
    future_dates = pd.date_range(start=latest_data.index[-1], periods=future_periods + 1, freq='D')[1:]

    clean = latest_data.dropna()
    if len(clean) < 50:
        raise ValueError(
            "At least 50 complete rows are required to compute the MA50 feature."
        )

    # Scale all raw columns first so the moving averages are in the same
    # (scaled) units the models were trained on.
    scaled = pd.DataFrame(scaler.transform(clean), columns=clean.columns, index=clean.index)
    scaled['MA10'] = scaled['Close'].rolling(window=10).mean()
    scaled['MA50'] = scaled['Close'].rolling(window=50).mean()

    # Latest feature row, in training column order (everything except 'Close').
    last_row = scaled.drop(columns=['Close']).iloc[-1]
    future_features = pd.DataFrame(
        np.tile(last_row.to_numpy(dtype=float), (len(future_dates), 1)),
        columns=last_row.index,
        index=future_dates,
    )

    # Predictions are scaled 'Close' values: un-scale them via the 'Close'
    # column, padding the other columns with zeros that are discarded.
    close_idx = clean.columns.get_loc('Close')
    n_columns = clean.shape[1]

    future_predictions = {}
    for name, model in models.items():
        scaled_predictions = np.asarray(model.predict(future_features)).ravel()
        padded = np.zeros((scaled_predictions.shape[0], n_columns))
        padded[:, close_idx] = scaled_predictions
        future_predictions[name] = scaler.inverse_transform(padded)[:, close_idx]

    return future_dates, future_predictions

# Main function
def main(file_path=r'C:\Users\Abi Karimireddy\Downloads\AAPL(2).csv', future_periods=7):
    """Run the whole pipeline: load, preprocess, build features, train, evaluate, plot, forecast.

    Args:
        file_path: CSV file to load. Defaults to the path this script was
            originally written against; pass your own path to use other data.
        future_periods: Number of days to forecast past the last date in the data.
    """
    # Data Collection
    data = get_stock_data(file_path)

    # Data Preprocessing
    preprocessed_data, scaler = preprocess_data(data)

    # Feature Engineering
    feature_data = create_features(preprocessed_data)

    # Preparing training and testing data: predict 'Close' from all other columns.
    X = feature_data.drop(['Close'], axis=1)
    y = feature_data['Close']

    # Feature engineering drops every row without a full moving-average
    # window, so a short dataset can leave nothing to train on.
    if X.empty or y.empty:
        print("No data available after preprocessing. Please check your dataset and preprocessing steps.")
        return

    # shuffle=False keeps chronological order: train on the past, test on the most recent rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Model Selection and Training
    models = train_models(X_train, y_train)

    # Model Evaluation
    evaluation = evaluate_models(models, X_test, y_test)
    for name, mse in evaluation.items():
        print(f'{name}: Mean Squared Error = {mse}')

    # Prediction and Visualization (the unscaled data supplies dates and actual prices)
    predict_and_visualize(models, X_test, y_test, scaler, data)

    # Future Prediction
    future_dates, future_predictions = future_prediction(models, data, scaler, future_periods=future_periods)
    print("\nFuture Predictions:")
    for name, predictions in future_predictions.items():
        print(f'{name} Predictions for the next {future_periods} days:', predictions)

    print("\nFuture Dates for Prediction:", future_dates)

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()



Data after loading from CSV:
                   Open        High         Low       Close   Adj Close  \
Date                                                                     
2024-05-15  187.910004  190.649994  187.369995  189.720001  189.720001   
2024-05-16  190.470001  191.100006  189.660004  189.839996  189.839996   
2024-05-17  189.509995  190.809998  189.179993  189.869995  189.869995   
2024-05-20  189.330002  191.919998  189.009995  191.039993  191.039993   
2024-05-21  191.089996  192.729996  190.919998  192.350006  192.350006   

              Volume  
Date                  
2024-05-15  70400000  
2024-05-16  52845200  
2024-05-17  41282900  
2024-05-20  44361300  
2024-05-21  42309400  
Data after dropping NA:
                   Open        High         Low       Close   Adj Close  \
Date                                                                     
2024-05-15  187.910004  190.649994  187.369995  189.720001  189.720001   
2024-05-16  190.470001  191.100006  189.660

# Process Explanation

## Data Loading and Preprocessing

### Data Loading
- The data is loaded from a CSV file using pandas' `read_csv()` function.
- The `index_col='Date'` parameter specifies that the 'Date' column should be used as the index.
- The `parse_dates=True` parameter ensures that dates are parsed as datetime objects.
- The `dayfirst=True` parameter specifies that the day comes before the month in the date format.

### Data Preprocessing
- Missing values are handled by dropping rows with NaN values using the `dropna()` function.
- The data is normalized using Min-Max scaling to scale all features to a range between 0 and 1.
- The MinMaxScaler from sklearn.preprocessing is used for normalization.

## Feature Engineering
- Two moving averages (MA10 and MA50) are calculated as additional features.
- Moving averages are calculated using the `rolling()` function followed by `mean()`.
- Rows with NaN values resulting from the feature calculation are dropped.

## Model Selection and Training

### Models Used
- Three regression models are selected: Linear Regression, Decision Tree Regressor, and MLP Regressor (Neural Network).

### Training
- The selected models are trained using the training data.
- Training is done using the `fit()` function.

## Model Evaluation

### Evaluation Metric
- Mean Squared Error (MSE) is used as the evaluation metric to assess the performance of each model.

### Evaluation Process
- The trained models are evaluated using the test data.
- Mean Squared Error (MSE) is calculated using the `mean_squared_error()` function from sklearn.metrics.

## Prediction and Visualization

### Prediction
- The trained models are used to make predictions on the test data.
- Predictions are made using the `predict()` function for each model.

### Visualization
- Predicted stock prices are plotted along with actual prices for visualization.
- Matplotlib is used for data visualization.

# Result Analysis

## Data after loading from CSV:
| Date       | Open       | High       | Low        | Close      | Adj Close  | Volume    |
|------------|------------|------------|------------|------------|------------|-----------|
| 2024-05-15 | 187.910004 | 190.649994 | 187.369995 | 189.720001 | 189.720001 | 70400000  |
| 2024-05-16 | 190.470001 | 191.100006 | 189.660004 | 189.839996 | 189.839996 | 52845200  |
| 2024-05-17 | 189.509995 | 190.809998 | 189.179993 | 189.869995 | 189.869995 | 41282900  |
| 2024-05-20 | 189.330002 | 191.919998 | 189.009995 | 191.039993 | 191.039993 | 44361300  |
| 2024-05-21 | 191.089996 | 192.729996 | 190.919998 | 192.350006 | 192.350006 | 42309400  |

## Data after dropping NA:
| Date       | Open       | High       | Low        | Close      | Adj Close  | Volume    |
|------------|------------|------------|------------|------------|------------|-----------|
| 2024-05-15 | 187.910004 | 190.649994 | 187.369995 | 189.720001 | 189.720001 | 70400000  |
| 2024-05-16 | 190.470001 | 191.100006 | 189.660004 | 189.839996 | 189.839996 | 52845200  |
| 2024-05-17 | 189.509995 | 190.809998 | 189.179993 | 189.869995 | 189.869995 | 41282900  |
| 2024-05-20 | 189.330002 | 191.919998 | 189.009995 | 191.039993 | 191.039993 | 44361300  |
| 2024-05-21 | 191.089996 | 192.729996 | 190.919998 | 192.350006 | 192.350006 | 42309400  |

## Data after normalization:
| Index | Open       | High       | Low        | Close      | Adj Close  | Volume    |
|-------|------------|------------|------------|------------|------------|-----------|
| 0     | 0.000000   | 0.031246   | 0.172492   | 0.519195   | 0.519195   | 1.000000  |
| 1     | 0.587155   | 0.232144   | 0.706295   | 0.541132   | 0.541132   | 0.508977  |
| 2     | 0.366970   | 0.102677   | 0.594404   | 0.546616   | 0.546616   | 0.185570  |
| 3     | 0.325688   | 0.598211   | 0.554777   | 0.760510   | 0.760510   | 0.271675  |
| 4     | 0.729356   | 0.959817   | 1.000000   | 1.000000   | 1.000000   | 0.214282  |

## Data after feature engineering:
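```
Empty DataFrame
Columns: [Open, High, Low, Close, Adj Close, Volume, MA10, MA50]
Index: []
No data available after preprocessing. Please check your dataset and preprocessing steps.
```

The result is empty because the dataset contains fewer than 50 rows: the 50-row moving average (MA50) is NaN for every row, so the final `dropna()` removes all of them. The run therefore stops before training, which is why the sections below contain placeholders instead of measured values.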

## Model Performance:
- The Mean Squared Error (MSE) is calculated for each model on the test data.
- Lower MSE values indicate better performance of the model.

### Linear Regression:
- Mean Squared Error = [MSE value]

### Decision Tree Regressor:
- Mean Squared Error = [MSE value]

### MLP Regressor (Neural Network):
- Mean Squared Error = [MSE value]

## Visualization:
- Actual stock prices and predicted prices are plotted to visually assess the accuracy of the models.

## Future Prediction:
- Future stock prices are predicted for the next 7 days using the trained models.
- Future dates for prediction are [Future Dates].
- Predictions for each model:

### Linear Regression:
- Predictions for the next 7 days: [Predicted Prices]

### Decision Tree Regressor:
- Predictions for the next 7 days: [Predicted Prices]

### MLP Regressor (Neural Network):
- Predictions for the next 7 days: [Predicted Prices]
