In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt

def tsa_analysis(path="Enter file path over here"):
    """Run an interactive ARIMA/SARIMA workflow on a monthly time series.

    The workflow is:

    1. Load a CSV, ask how the dates are stored, and build a month-end
       indexed frame (gaps are forward-filled).
    2. First-difference the target and run an Augmented Dickey-Fuller test.
       If the differenced series is not stationary, try log, square-root and
       cube-root transformations of it in that order.
    3. Plot ACF/PACF for the selected series and for the 12-step seasonal
       difference so the user can choose the model orders.
    4. Ask for ``(p, d, q)`` and ``(P, D, Q, s)``, fit ARIMA and SARIMA on
       the first 70% of the data, forecast the remaining 30%, plot the
       forecasts and print both RMSE values.

    Args:
        path: Location of the CSV file to analyse. The default is the
            original placeholder text and must be replaced (or overridden
            by the caller) with a real path.

    Returns:
        None. Results are printed and shown as matplotlib figures.

    Raises:
        FileNotFoundError: If ``path`` does not point to a readable file.
        KeyError: If a column name typed by the user is not in the CSV.
    """
    significance = 0.05  # ADF p-value threshold used throughout

    def process_data():
        """Load the CSV and return ``(month-end indexed frame, target name)``."""
        # Tip: for a quick demo, replace the read_csv call with
        # sns.load_dataset('flights').
        df = pd.read_csv(path)
        df.dropna(inplace=True)
        print(df.head(4))

        opt = 0
        while opt not in (1, 2, 3):
            try:
                opt = int(input('Enter 1 for date in one column and 2 for date as month and year in different columns and 3 for month-year: '))
            except ValueError:
                opt = 0
            # Report out-of-range numbers too, not only non-numeric input.
            if opt not in (1, 2, 3):
                print("Invalid input. Please enter 1 or 2 or 3.")

        if opt == 1:
            index_col = input('Enter Dates column name: ')
            target = input('Enter target variable: ')
            dates = pd.to_datetime(df[index_col])
        elif opt == 2:
            col_name_month = input('Enter Months column name: ')
            col_name_year = input('Enter Years column name: ')
            target = input('Enter target variable: ')
            index_col = 'yearMonth'
            day_month_year = (
                '01-' + df[col_name_month].astype(str)
                + '-' + df[col_name_year].astype(str)
            )
            # The string is built day-first; without dayfirst=True a numeric
            # month ("01-5-1949") is parsed as January 5th instead of May 1st.
            dates = pd.to_datetime(day_month_year, dayfirst=True)
        else:
            col_name_month = input('Enter Months_Year column name: ')
            target = input('Enter target variable: ')
            index_col = 'yearMonth'
            dates = pd.to_datetime(df[col_name_month])

        df[index_col] = dates
        df.set_index(index_col, inplace=True)
        # Resampler.pad() was removed in pandas 2.0; ffill() is the same
        # operation. NOTE(review): pandas >= 2.2 prefers the 'ME' alias over
        # 'M' -- switch once older pandas no longer needs to be supported.
        return df.resample('M').ffill(), target

    def apply_transformation(df, target, transform_func, transform_name):
        """Store ``transform_func(df[target])`` in ``df[transform_name]``.

        Returns the frame and the ADF p-value of the new column.
        """
        df[transform_name] = transform_func(df[target])
        p_value = test_stationarity(df, transform_name, transform_name)
        return df, p_value

    def log_transform(data):
        """Return the element-wise natural logarithm of ``data``."""
        return np.log(data)

    def sqrt_transform(data):
        """Return the element-wise square root of ``data``."""
        return np.sqrt(data)

    def cbrt_transform(data):
        """Return the element-wise cube root of ``data``."""
        return np.cbrt(data)

    def test_stationarity(df, var, transform_name):
        """Print the ADF test for ``df[var]`` and plot it with rolling stats.

        Writes 12-period rolling mean/std into ``df['rollMean']`` and
        ``df['rollStd']`` (overwriting earlier values) and plots them next to
        the ``transform_name`` column.

        Returns:
            The ADF p-value, so callers do not have to rerun the test.
        """
        df['rollMean'] = df[var].rolling(window=12).mean()
        df['rollStd'] = df[var].rolling(window=12).std()

        dftest = adfuller(df[var], autolag='AIC')
        dfoutput = pd.Series(
            dftest[0:4],
            index=['Test Statistics', 'p-value', '#lags used', 'number of observations used'],
        )
        for key, value in dftest[4].items():
            dfoutput[f'critical value ({key})'] = value
        print(dfoutput)

        plt.figure(figsize=(5, 4))
        for column in (transform_name, 'rollMean', 'rollStd'):
            sns.lineplot(data=df, x=df.index, y=column)
        plt.show()
        return dftest[1]

    def create_shifted_diff(df, col_name):
        """Return a new frame with 'shift' (lag 1) and 'shiftDiff' columns.

        Rows containing NaN are dropped; the input frame is left untouched.
        """
        shifted = df[col_name].shift()
        return df.assign(shift=shifted, shiftDiff=df[col_name] - shifted).dropna()

    def create_shifted_diff12(df, col_name):
        """Return a new frame with 'shift12', the 12-step seasonal difference.

        Rows containing NaN are dropped; the input frame is left untouched.
        """
        return df.assign(shift12=df[col_name].diff(12)).dropna()

    def plot_acf_pacf(data, lags=20):
        """Plot ACF and PACF of ``data`` side by side."""
        # PACF is only defined for lags below half the sample size, so cap
        # the request instead of failing on short series.
        lags = max(1, min(lags, len(data) // 2 - 1))
        fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
        plot_acf(data, lags=lags, ax=ax[0], title='ACF')
        plot_pacf(data, lags=lags, ax=ax[1], title='PACF')
        plt.show()

    def arima_model(train, test, order):
        """Fit ARIMA on ``train`` and forecast over the span of ``test``."""
        fitted = ARIMA(train, order=order).fit()
        return fitted.predict(start=test.index[0], end=test.index[-1])

    def sarima_model(train, test, order, seasonal_order):
        """Fit SARIMA on ``train`` and forecast over the span of ``test``."""
        fitted = SARIMAX(train, order=order, seasonal_order=seasonal_order).fit()
        return fitted.predict(start=test.index[0], end=test.index[-1])

    def read_int_tuple(prompt, size):
        """Prompt until the user enters exactly ``size`` comma-separated ints."""
        while True:
            raw = input(prompt)
            try:
                values = tuple(int(part) for part in raw.split(','))
            except ValueError:
                values = ()
            if len(values) == size:
                return values
            print(f"Please enter exactly {size} comma-separated integers.")

    # Handle missing values before creating the 'shiftDiff' variable
    df, target = process_data()
    df.dropna(subset=[target], inplace=True)

    transformed_df = create_shifted_diff(df, target)
    p_value = test_stationarity(transformed_df, 'shiftDiff', 'shiftDiff')
    stationary_col = 'shiftDiff'

    # Apply transformations based on the stationarity of the shifted difference
    if p_value >= significance:
        # log/sqrt need positive input: replace non-positive differences
        # with a small positive value (kept from the original workflow).
        transformed_df['shiftDiff'] = transformed_df['shiftDiff'].clip(lower=0.001)

        candidates = (
            (log_transform, 'log'),
            (sqrt_transform, 'sqrt'),
            (cbrt_transform, 'cbrt'),
        )
        for transform_func, transform_name in candidates:
            transformed_df, p_value = apply_transformation(
                transformed_df, 'shiftDiff', transform_func, transform_name
            )
            if p_value < significance:
                # Remember which column passed so the ACF/PACF plots use it.
                stationary_col = transform_name
                break
        else:
            print("No transformation made the differenced series stationary; "
                  "using 'shiftDiff' for the ACF/PACF plots.")

    # The stationarity report for the selected column was already shown
    # above, so only the ACF/PACF plots remain.
    plot_acf_pacf(transformed_df[stationary_col].dropna())

    transformed_df = create_shifted_diff12(df, target)
    plot_acf_pacf(transformed_df['shift12'].dropna())

    pdq = read_int_tuple("Enter pdq values (comma-separated): ", 3)
    PDQS = read_int_tuple("Enter PDQS values (comma-separated): ", 4)

    # ARIMA and SARIMA models: chronological 70/30 split of the raw target
    split_at = round(len(transformed_df) * 70 / 100)
    series = transformed_df[[target]]
    train = series.iloc[:split_at]
    test = series.iloc[split_at:]

    prediction_arima = arima_model(train, test, pdq)
    prediction_sarima = sarima_model(train, test, pdq, PDQS)

    # Visualize predictions and compare them to the original data
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=train, x=train.index, y=target, label='Train Data')
    sns.lineplot(data=test, x=test.index, y=target, label='Test Data')
    sns.lineplot(data=test, x=test.index, y=prediction_arima, label='ARIMA Prediction')
    sns.lineplot(data=test, x=test.index, y=prediction_sarima, label='SARIMA Prediction')
    plt.legend()
    plt.show()

    # Calculate RMSE for both ARIMA and SARIMA predictions
    rmse_arima = np.sqrt(mean_squared_error(test[target], prediction_arima))
    rmse_sarima = np.sqrt(mean_squared_error(test[target], prediction_sarima))
    print(f"RMSE for ARIMA: {rmse_arima}")
    print(f"RMSE for SARIMA: {rmse_sarima}")

# Run the interactive analysis only when executed as a script or notebook
# cell, not as a side effect of importing this module.
if __name__ == "__main__":
    tsa_analysis()