In [150]:
import pandas as pd

In [151]:
ireland = pd.read_csv('Ireland.csv')
poland = pd.read_csv ('Poland.csv')



In [152]:
print(poland.head(30))

   year  january  february  march  april    may   june   july  august  \
0  2014    23.08     22.17  23.45  22.57  23.54  21.70  21.73   21.75   
1  2015    22.49     21.57  24.29  23.60  24.77  23.28  23.02   22.82   
2  2016    23.36     23.41  24.72  24.55  25.44  24.38  25.43   24.24   
3  2017    25.71     23.55  26.48  25.11  26.82  25.80  25.29   24.95   
4  2018    25.06     23.15  24.39  25.26  26.76  25.97  26.38   26.31   
5  2019    26.51     25.20  28.15  26.52  28.46  27.35  28.10   28.30   
6  2020    28.53     27.54  17.75   6.10   9.80  14.55  18.89   20.14   
7  2021    13.59     14.25  15.23  13.97  18.93  21.10  23.52   24.91   

   september  october  november  december  
0      22.42    23.89     22.41     20.36  
1      22.22    25.38     24.53     22.32  
2      23.32    25.68     24.61     23.40  
3      24.14    26.55     24.88     24.28  
4      25.91    28.47     27.22     25.41  
5      28.82    31.19     29.39     27.89  
6      21.61    18.54     12.42   

In [153]:
# Setting 'year' as index
poland.set_index('year', inplace=True)
## By setting the 'year' column as the index, it enhances the clarity and usability of my dataset.
# This ensures that each row is uniquely identified by the year, facilitating easy access and retrieval of data.

In [154]:
years_to_remove = [2014, 2015, 2016, 2017]
poland.drop(years_to_remove, inplace=True)
# By removing unnecessary years, I am cleaning my data and making sure this data will give me same years as my other country data

In [228]:
print(ireland.head(5))

  STATISTIC Statistic Label  TLIST(A1)  Year C01885V02316     Month    UNIT  \
0  TOA11C01        Red line       2018  2018           01   January  Number   
1  TOA11C01        Red line       2018  2018           02  February  Number   
2  TOA11C01        Red line       2018  2018           03     March  Number   
3  TOA11C01        Red line       2018  2018           04     April  Number   
4  TOA11C01        Red line       2018  2018           05       May  Number   

     VALUE  
0  1664495  
1  1657562  
2  1661098  
3  1794424  
4  1934553  


In [156]:
# Filtering the dataset to drop rows with 'Red line' or 'Green line' in the 'Statistic Label' column
ireland_filtered = ireland.drop(ireland[ireland['Statistic Label'].isin(['Red line', 'Green line'])].index)
# Justification:
# By removing rows with 'Red line' or 'Green line' in the 'Statistic Label' column,
# I ensure that my dataset contains only relevant information for my analysis.
# This helps in focusing on the data that is pertinent to my research objectives
# and improves the accuracy and reliability of my analysis results.

In [157]:
ireland_filtered.set_index('Year', inplace=True)


In [158]:
# Creating a pivot table with 'Month' as columns and 'VALUE' as values
ireland_pivot = ireland.pivot(columns='Month', values='VALUE')
# Justification:
# By creating a pivot table with 'Month' as columns and 'VALUE' as values,
# I restructure my dataset to better understand the variation of values across different months.
# This facilitates easier comparison and analysis of data trends over time,
# enabling me to gain insights into seasonal patterns and fluctuations in the dataset.

In [159]:
# Changing column names to lowercase
ireland_pivot.columns = ireland_pivot.columns.str.lower()
# Justification:
# By converting column names to lowercase, I ensure consistency in the naming convention
# and improve the readability of my dataset. This makes it easier to work with the data
# and avoids potential errors or confusion that may arise from inconsistent naming conventions.

In [160]:
ireland_pivot.columns = [
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december', 'all_months'
]


In [161]:
# Listing of columns to drop
columns_to_drop = ['STATISTIC', 'Statistic Label', 'TLIST(A1)', 'C01885V02316', 'UNIT']

# Dropping the columns
ireland_filtered.drop(columns_to_drop, axis=1, inplace=True)
# Justification:
# The listed columns contain irrelevant or redundant information for my analysis.
# By dropping these columns, I streamline my dataset and focus on the essential
# variables that are pertinent to my research objectives. This improves the clarity
# and efficiency of my analysis by reducing noise and unnecessary complexity in the dataset.

In [162]:
ireland_filtered = ireland_filtered[ireland_filtered['Month'] != 'All months']


In [163]:
print(ireland_filtered.head(25))

          Month    VALUE
Year                    
2018    January  3218649
2018   February  3118289
2018      March  3097570
2018      April  3350056
2018        May  3622338
2018       June  3378751
2018       July  3490513
2018     August  3363274
2018  September  3487373
2018    October  3919977
2018   November  4017567
2018   December  3772612
2019    January  3658484
2019   February  3727609
2019      March  4069124
2019      April  3826763
2019        May  4003472
2019       June  3750724
2019       July  4130610
2019     August  3781395
2019  September  4166802
2019    October  4502319
2019   November  4496717
2019   December  4233212
2020    January  3799176


In [164]:
ireland_pivot = ireland_filtered.pivot_table(index='Year', columns='Month', values='VALUE', aggfunc='first')


In [181]:
# Dropping the year 2022 from ireland_pivot dataset
ireland_pivot.drop(2022, inplace=True)


In [192]:
# Defining the correct order of months
months_order = ['January', 'February', 'March', 'April', 'May', 'June',
                'July', 'August', 'September', 'October', 'November', 'December']

# Reindexing the ireland_pivot DataFrame to match the correct order of months
ireland_pivot = ireland_pivot.reindex(months_order, axis=1)



In [193]:
print(ireland_pivot.head())

      January  February    March    April      May     June     July   August  \
Year                                                                            
2018  3218649   3118289  3097570  3350056  3622338  3378751  3490513  3363274   
2019  3658484   3727609  4069124  3826763  4003472  3750724  4130610  3781395   
2020  3799176   3770492  1950743   228633   368914   762376  1293479  1469350   
2021   628704    643480   836934   940426  1366567  1712148  1821536  1883500   

      September  October  November  December  
Year                                          
2018    3487373  3919977   4017567   3772612  
2019    4166802  4502319   4496717   4233212  
2020    1534789  1325561   1024219   1648324  
2021    2222412  2525575   2646334   2253703  


In [175]:
print(poland.head())

      january  february    march    april      may     june     july   august  \
year                                                                            
2018  2506000   2315000  2439000  2526000  2676000  2597000  2638000  2631000   
2019  2651000   2520000  2815000  2652000  2846000  2735000  2810000  2830000   
2020  2853000   2754000  1775000   610000   980000  1455000  1889000  2014000   
2021  1359000   1425000  1523000  1397000  1893000  2110000  2352000  2491000   

      september  october  november  december  
year                                          
2018    2591000  2847000   2722000   2541000  
2019    2882000  3119000   2939000   2789000  
2020    2161000  1854000   1242000   1353000  
2021    2489000  2697000   2511000   2258000  


In [167]:
ireland_pivot.columns.name = None
poland.columns.name = None


In [172]:
 poland = poland * 100000
# using just once  to make correct amount

In [229]:
poland = poland.astype(int)

SyntaxError: invalid syntax (<ipython-input-229-93cfa3ff815e>, line 1)

In [188]:
pip install plotly




In [178]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [186]:

# Data processing
ireland_processed_pandas = ireland_pivot.mean(axis=1)
ireland_processed_numpy = np.mean(ireland_pivot.values, axis=1)

poland_processed_pandas = poland.mean(axis=1)
poland_processed_numpy = np.mean(poland.values, axis=1)

# Data aggregation
ireland_aggregated_pandas = ireland_pivot.sum()
ireland_aggregated_numpy = np.sum(ireland_pivot.values, axis=0)

poland_aggregated_pandas = poland.sum()
poland_aggregated_numpy = np.sum(poland.values, axis=0)

# Print results
print("Ireland Dataset - Data Processing:")
print("Using Pandas:", ireland_processed_pandas)
print("Using NumPy:", ireland_processed_numpy)
print("\nPoland Dataset - Data Processing:")
print("Using Pandas:", poland_processed_pandas)
print("Using NumPy:", poland_processed_numpy)
print("\nIreland Dataset - Data Aggregation:")
print("Using Pandas:", ireland_aggregated_pandas)
print("Using NumPy:", ireland_aggregated_numpy)
print("\nPoland Dataset - Data Aggregation:")
print("Using Pandas:", poland_aggregated_pandas)
print("Using NumPy:", poland_aggregated_numpy)

# Justifications:
# Data processing:
# - The mean function is applied along the rows of the Ireland pivot table using both Pandas and NumPy to calculate
#   the average value for each row, representing data processing steps.
# - Using Pandas provides a convenient and high-level interface for data manipulation, making it easy to compute
#   row-wise averages.
# - NumPy is used for its efficient array operations, providing faster computation compared to Pandas for large datasets
#   due to its low-level implementation.
# - This helps to summarize the monthly data into a single value, potentially representing an average monthly value
#   for each observation.

# Data aggregation:
# - The sum function is applied to the Ireland pivot table to calculate the total value for each column (i.e., each month).
# - Using Pandas allows for a straightforward and intuitive way to perform column-wise aggregation operations.
# - NumPy is leveraged for its efficient array-based computations, offering optimized performance for numerical operations.
# - This helps to aggregate the monthly data into yearly totals, providing a broader perspective on the data.
# - Similar operations are performed for the Poland dataset.

Ireland Dataset - Data Processing:
Using Pandas: Year
2018    3.486414e+06
2019    4.028936e+06
2020    1.598005e+06
2021    1.623443e+06
dtype: float64
Using NumPy: [3486414.08333333 4028935.91666667 1598004.66666667 1623443.25      ]

Poland Dataset - Data Processing:
Using Pandas: year
2018    2.585750e+06
2019    2.799000e+06
2020    1.745000e+06
2021    2.042083e+06
dtype: float64
Using NumPy: [2585750.         2799000.         1745000.         2042083.33333333]

Ireland Dataset - Data Aggregation:
Using Pandas: April         8345878
August       10497519
December     11907851
February     11259870
January      11305013
July         10736138
June          9603999
March         9954371
May           9361291
November     12184837
October      12273432
September    11411376
dtype: int64
Using NumPy: [ 8345878 10497519 11907851 11259870 11305013 10736138  9603999  9954371
  9361291 12184837 12273432 11411376]

Poland Dataset - Data Aggregation:
Using Pandas: january       9369000
febr

In [194]:
import plotly.express as px


# Visualizing Ireland dataset with correct order of months
fig_ireland = px.bar(ireland_pivot.reset_index(), x='Year', y=ireland_pivot.columns,
                     title='Monthly Values in Ireland (2018-2021)',
                     labels={'value': 'Value', 'variable': 'Month'},
                     barmode='group', category_orders={'variable': months_order})
fig_ireland.show()


# Visualizing Poland dataset
fig_poland = px.bar(poland.reset_index(), x='year', y=poland.columns,
                    title='Monthly Values in Poland (2018-2021)',
                    labels={'value': 'Value', 'variable': 'Month'},
                    barmode='group')
fig_poland.show()

# Justifications:
# Plotly Express (px): Offers a high-level interface for creating interactive visualizations, making it
# easy to generate complex plots with minimal code and effort.


In [230]:
# Descriptive statistics for Ireland dataset
ireland_stats = ireland_pivot.describe()
print("Descriptive Statistics for Ireland Dataset:")
print(ireland_stats)

# Descriptive statistics for Poland dataset
poland_stats = poland.describe()
print("\nDescriptive Statistics for Poland Dataset:")
print(poland_stats)

# Justifications:
# - Using the describe() function provides summary statistics for each numerical column in the Ireland dataset.
# - This includes count, mean, standard deviation, minimum, quartiles, and maximum values, offering insights
#   into the central tendency, dispersion, and distribution of the data.
# - Understanding the descriptive statistics helps in identifying outliers, assessing data quality, and gaining
#   initial insights into the characteristics of the dataset.
# - Similarly, descriptive statistics are calculated for the Poland dataset to provide summary measures of
#   central tendency and variability.
# - This facilitates comparison between the Ireland and Poland datasets, helping to identify similarities
#   and differences in their distributions and characteristics.
# - It also assists in understanding the variability and range of values in the dataset, aiding in subsequent
#   analysis and interpretation of the data.

Descriptive Statistics for Ireland Dataset:
            January      February         March         April           May  \
count  4.000000e+00  4.000000e+00  4.000000e+00  4.000000e+00  4.000000e+00   
mean   2.826253e+06  2.814968e+06  2.488593e+06  2.086470e+06  2.340323e+06   
std    1.485753e+06  1.477983e+06  1.400737e+06  1.769204e+06  1.755399e+06   
min    6.287040e+05  6.434800e+05  8.369340e+05  2.286330e+05  3.689140e+05   
25%    2.571163e+06  2.499587e+06  1.672291e+06  7.624778e+05  1.117154e+06   
50%    3.438566e+06  3.422949e+06  2.524156e+06  2.145241e+06  2.494452e+06   
75%    3.693657e+06  3.738330e+06  3.340458e+06  3.469233e+06  3.717622e+06   
max    3.799176e+06  3.770492e+06  4.069124e+06  3.826763e+06  4.003472e+06   

               June          July        August     September       October  \
count  4.000000e+00  4.000000e+00  4.000000e+00  4.000000e+00  4.000000e+00   
mean   2.401000e+06  2.684034e+06  2.624380e+06  2.852844e+06  3.068358e+06   
std    

In [203]:
# Adding prefixes to the columns of each dataset for clarity
ireland_prefixed = ireland_pivot.add_prefix('ireland_')
poland_prefixed = poland.add_prefix('poland_')

# Merging the datasets based on the 'Year' column
merged = ireland_prefixed.merge(poland_prefixed, left_index=True, right_index=True)

# Displaying the merged dataset
print(merged)

# Justifications:
# - Adding prefixes to the columns of each dataset helps in distinguishing between variables
#   from different datasets, enhancing clarity and avoiding potential naming conflicts.
# - This ensures that columns from the Ireland dataset are easily identifiable and separate
#   from those of the Poland dataset, making it clear which dataset each variable belongs to.
# - Merging the datasets based on the 'Year' column combines the information from both datasets
#   into a single dataset, facilitating comparative analysis between Ireland and Poland.
# - The 'Year' column serves as a key to align the data from both datasets, allowing for
#   meaningful comparisons and insights into trends and patterns over time.


      ireland_January  ireland_February  ireland_March  ireland_April  \
Year                                                                    
2018          3218649           3118289        3097570        3350056   
2019          3658484           3727609        4069124        3826763   
2020          3799176           3770492        1950743         228633   
2021           628704            643480         836934         940426   

      ireland_May  ireland_June  ireland_July  ireland_August  \
Year                                                            
2018      3622338       3378751       3490513         3363274   
2019      4003472       3750724       4130610         3781395   
2020       368914        762376       1293479         1469350   
2021      1366567       1712148       1821536         1883500   

      ireland_September  ireland_October  ...  poland_march  poland_april  \
Year                                      ...                               
2018            

In [210]:
# Changing column names to lowercase for consistency
merged.columns = merged.columns.str.lower()
# Justification:
# - Changing column names to lowercase ensures consistency in the naming convention across all columns,
#   improving readability and maintainability of the dataset.
# - Consistent naming conventions make it easier to access and manipulate columns programmatically,
#   reducing the likelihood of errors and enhancing the efficiency of data analysis and visualization.

Columns ireland_january or poland_january not found in the merged dataset.
Columns ireland_february or poland_february not found in the merged dataset.
Columns ireland_march or poland_march not found in the merged dataset.
Columns ireland_april or poland_april not found in the merged dataset.
Columns ireland_may or poland_may not found in the merged dataset.
Columns ireland_june or poland_june not found in the merged dataset.
Columns ireland_july or poland_july not found in the merged dataset.
Columns ireland_august or poland_august not found in the merged dataset.
Columns ireland_september or poland_september not found in the merged dataset.
Columns ireland_october or poland_october not found in the merged dataset.
Columns ireland_november or poland_november not found in the merged dataset.
Columns ireland_december or poland_december not found in the merged dataset.


In [215]:
from scipy.stats import ttest_ind
# Justifications:
# - The t-test is a statistical test used to determine if there is a significant difference
#   between the means of two groups, in this case, the monthly values for Ireland and Poland.
# - Importing ttest_ind from scipy.stats enables us to perform an independent t-test between
#   the monthly values of Ireland and Poland for each month.
# - This helps in assessing whether there are statistically significant differences in the
#   monthly values between the two countries, providing insights into potential variations
#   in transportation trends or patterns.

# Defining the months
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
# Justifications:
# - Defining the months creates a list of month names, which will be used to access the
#   corresponding columns in the merged dataset for each month.
# - This ensures consistency in accessing the monthly data and facilitates looping through
#   each month for the t-test analysis.
# Performing t-test for each month
for month in months:
    ireland_column = f'ireland_{month.lower()}'
    poland_column = f'poland_{month.lower()}'
    t_statistic, p_value = ttest_ind(merged[ireland_column], merged[poland_column])
    print(f"T-test result for {month}:")
    print(f"T-statistic: {t_statistic}, p-value: {p_value}")
    if p_value < 0.05:
        print("Significant difference between Ireland and Poland for this month.")
    else:
        print("No significant difference between Ireland and Poland for this month.")
    print()
# Justifications:
# - The loop iterates through each month and conducts an independent t-test to compare
#   the monthly values between Ireland and Poland.
# - Printing the t-statistic and p-value for each test allows for the assessment of
#   statistical significance and interpretation of the results.
# - The conditional statements determine whether there is a significant difference
#   between Ireland and Poland for each month based on the p-value threshold of 0.05.


T-test result for January:
T-statistic: 0.5938142421129462, p-value: 0.5743255731846981
No significant difference between Ireland and Poland for this month.

T-test result for February:
T-statistic: 0.7071477954774688, p-value: 0.5059976989545489
No significant difference between Ireland and Poland for this month.

T-test result for March:
T-statistic: 0.4608438793469617, p-value: 0.6611420900299617
No significant difference between Ireland and Poland for this month.

T-test result for April:
T-statistic: 0.28757544976866406, p-value: 0.783345986746984
No significant difference between Ireland and Poland for this month.

T-test result for May:
T-statistic: 0.24752493020740815, p-value: 0.8127547297374184
No significant difference between Ireland and Poland for this month.

T-test result for June:
T-statistic: 0.23238508046931017, p-value: 0.823962663379068
No significant difference between Ireland and Poland for this month.

T-test result for July:
T-statistic: 0.3731308576874657, p-va

In [231]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Justifications:
# - Importing KMeans from sklearn.cluster allows us to perform KMeans clustering,
#   an unsupervised learning algorithm used for partitioning data into clusters.
# - Importing StandardScaler from sklearn.preprocessing enables us to standardize
#   the features by removing the mean and scaling to unit variance, which is essential
#   for KMeans clustering as it is sensitive to the scale of the features.

# Extracting the time series data from the merged dataframe
time_series_data = merged.values

# Justifications:
# - Extracting the time series data from the merged dataframe prepares the data
#   for clustering, ensuring that only the numerical values are used for analysis.

# Standardize the time series data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(time_series_data)

# Justifications:
# - Standardizing the time series data ensures that each feature (column) has a
#   mean of 0 and a standard deviation of 1, which is a requirement for many
#   machine learning algorithms, including KMeans clustering.

# Define the number of clusters
num_clusters = 3  # You can adjust this based on your requirements

# Justifications:
# - Defining the number of clusters specifies the desired number of groups
#   into which the data should be partitioned, allowing for flexibility
#   in adjusting the granularity of the clustering based on the analysis objectives.

# Perform KMeans clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(scaled_data)

# Justifications:
# - Performing KMeans clustering assigns each data point to one of the specified
#   clusters based on their similarity, with the centroids representing the
#   center of each cluster.

# Assign the cluster labels to the merged dataframe
merged['Cluster'] = clusters

# Justifications:
# - Assigning the cluster labels to the merged dataframe allows for the visualization
#   and interpretation of the clustering results alongside the original data.

# Check the cluster assignments
print(merged.head())


      ireland_january  ireland_february  ireland_march  ireland_april  \
Year                                                                    
2018          3218649           3118289        3097570        3350056   
2019          3658484           3727609        4069124        3826763   
2020          3799176           3770492        1950743         228633   
2021           628704            643480         836934         940426   

      ireland_may  ireland_june  ireland_july  ireland_august  \
Year                                                            
2018      3622338       3378751       3490513         3363274   
2019      4003472       3750724       4130610         3781395   
2020       368914        762376       1293479         1469350   
2021      1366567       1712148       1821536         1883500   

      ireland_september  ireland_october  ...  poland_april  poland_may  \
Year                                      ...                             
2018            3487





In [220]:
# Compute cluster centroids
cluster_centroids = merged.groupby('Cluster').mean()

# Justifications:
# - Computing cluster centroids calculates the mean value of each feature
#   (column) within each cluster, representing the central tendency of the
#   data points belonging to that cluster.
# - Grouping the data by cluster and computing the mean allows for the identification
#   of characteristic patterns or behaviors associated with each cluster.

# Display the cluster centroids
print("Cluster Centroids:")
print(cluster_centroids)


Cluster Centroids:
         ireland_january  ireland_february  ireland_march  ireland_april  \
Cluster                                                                    
0              3799176.0         3770492.0      1950743.0       228633.0   
1              3438566.5         3422949.0      3583347.0      3588409.5   
2               628704.0          643480.0       836934.0       940426.0   

         ireland_may  ireland_june  ireland_july  ireland_august  \
Cluster                                                            
0           368914.0      762376.0     1293479.0       1469350.0   
1          3812905.0     3564737.5     3810561.5       3572334.5   
2          1366567.0     1712148.0     1821536.0       1883500.0   

         ireland_september  ireland_october  ...  poland_march  poland_april  \
Cluster                                      ...                               
0                1534789.0        1325561.0  ...     1775000.0      610000.0   
1                38

In [222]:
import plotly.express as px

# Justifications:
# - Importing Plotly Express (px) allows us to create interactive and
#   visually appealing plots with minimal code, facilitating the visualization
#   of complex data patterns and trends.

# Reshape the DataFrame for plotting
cluster_centroids_melted = cluster_centroids.reset_index().melt(id_vars='Cluster', var_name='Month', value_name='Passenger Amount')

# Justifications:
# - Reshaping the DataFrame for plotting transforms the data into a format
#   suitable for visualization, where each row represents a combination of
#   cluster, month, and passenger amount.

# Plot the clustered time series data
fig = px.line(cluster_centroids_melted, x='Month', y='Passenger Amount', color='Cluster', title='Clustered Time Series Data',
              labels={'Passenger Amount': 'Passenger Amount', 'Month': 'Month', 'Cluster': 'Cluster'})
fig.update_xaxes(tickangle=45)
fig.show()

# Justifications:
# - Plotting the clustered time series data using a line plot allows for the
#   visualization of trends and patterns across different clusters over time.
# - Customizing the plot by specifying labels and adjusting the x-axis tick
#   angles enhances readability and interpretability of the visualization.
# - Interactive features provided by Plotly Express enable exploration and
#   analysis of the clustered data directly within the plot.


In [232]:
# Justifications:
# - In this code snippet, we demonstrate various techniques for time series analysis, including calculating
#   the slope, rate of change, moving average, and exponential moving average (EMA) of a time series data.

# Example time series data
# Replace this with your actual time series data
time_series_data = pd.Series([10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

# Calculate slope using linear regression
slope = (time_series_data.iloc[-1] - time_series_data.iloc[0]) / len(time_series_data)

# Justifications:
# - Calculating the slope using linear regression provides an indication of the trend or directionality
#   of the time series data over the specified period.
# - It measures the average rate of change of the time series data over time, helping to identify whether
#   the data is increasing, decreasing, or remaining constant.

# Calculate rate of change
rate_of_change = (time_series_data.diff() / time_series_data.shift(1)) * 100

# Justifications:
# - Computing the rate of change quantifies the percentage change between consecutive data points in the
#   time series, providing insights into the volatility or stability of the data.
# - It helps in understanding the magnitude and direction of fluctuations in the time series data over time.

# Calculate moving average
window_size = 3
moving_avg = time_series_data.rolling(window=window_size).mean()

# Justifications:
# - Calculating the moving average smoothens the time series data by averaging values over a specified
#   window size, thereby reducing noise and highlighting underlying trends or patterns.
# - It helps in identifying long-term trends or cycles in the data, making it easier to discern patterns
#   and anomalies.

# Calculate exponential moving average (EMA)
ema = time_series_data.ewm(span=window_size, adjust=False).mean()

# Justifications:
# - Computing the exponential moving average (EMA) assigns exponentially decreasing weights to previous
#   data points, giving more importance to recent observations while still considering historical data.
# - EMA is particularly useful for capturing short-term trends or responding quickly to changes in the data,
#   making it suitable for forecasting or detecting abrupt shifts in the time series.

# Print the results
print("Slope:", slope)
print("Rate of Change:")
print(rate_of_change)
print("Moving Average (window size={}):".format(window_size))
print(moving_avg)
print("Exponential Moving Average (window size={}):".format(window_size))
print(ema)


Slope: 9.0
Rate of Change:
0           NaN
1    100.000000
2     50.000000
3     33.333333
4     25.000000
5     20.000000
6     16.666667
7     14.285714
8     12.500000
9     11.111111
dtype: float64
Moving Average (window size=3):
0     NaN
1     NaN
2    20.0
3    30.0
4    40.0
5    50.0
6    60.0
7    70.0
8    80.0
9    90.0
dtype: float64
Exponential Moving Average (window size=3):
0    10.000000
1    15.000000
2    22.500000
3    31.250000
4    40.625000
5    50.312500
6    60.156250
7    70.078125
8    80.039062
9    90.019531
dtype: float64


In [233]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Justifications:
# - Importing necessary modules from scikit-learn enables us to perform machine learning tasks,
#   such as splitting the data, training a linear regression model, and evaluating model performance.

# Extracting the years from the index
years = merged.index.values.reshape(-1, 1)

# Justifications:
# - Extracting the years from the index prepares the independent variable (features) for model training,
#   allowing us to use the time information for predicting passenger amounts.

# Reshaping the passenger data to have each month as a separate feature
passenger_data = merged.drop(columns=['Cluster']).values

# Justifications:
# - Reshaping the passenger data to have each month as a separate feature prepares the dependent variable
#   (target) for model training, ensuring that the data is in the appropriate format for regression analysis.

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(years, passenger_data, test_size=0.2, random_state=42)

# Justifications:
# - Splitting the data into training and testing sets enables us to assess the performance of the model
#   on unseen data, helping to evaluate its generalization ability and potential for making accurate predictions.

# Initialize and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Justifications:
# - Initializing and fitting a linear regression model allows us to learn the relationship between
#   the independent variable (years) and the dependent variable (passenger amounts), enabling us to
#   make predictions based on this relationship.

# Predict passenger amounts for the test set
y_pred = model.predict(X_test)

# Justifications:
# - Making predictions for the test set using the trained model allows us to assess its performance
#   in estimating passenger amounts for unseen data.

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Justifications:
# - Calculating the Mean Squared Error (MSE) provides a measure of the average squared difference
#   between the predicted and actual passenger amounts in the test set.
# - MSE quantifies the performance of the regression model, with lower values indicating better
#   predictive accuracy and higher values suggesting greater discrepancy between predicted and
#   actual values.


Mean Squared Error: 1120507923422.285


In [234]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR


In [236]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Define functions for model training and evaluation
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    try:
        # Train the model
        model.fit(X_train, y_train)

        # Predict passenger amounts for the test set
        y_pred = model.predict(X_test)

        # Calculate Mean Squared Error
        mse = mean_squared_error(y_test, y_pred)
        print("Mean Squared Error:", mse)
    except Exception as e:
        print("Error occurred:", e)

# Justification:
# - The train_and_evaluate_model function is defined to encapsulate the process of training
#   and evaluating regression models. It takes a model instance and the training and testing
#   data as input, fits the model, predicts target values for the test set, calculates
#   Mean Squared Error (MSE) between the predicted and actual values, and prints the result.
# - Error handling is incorporated within the function to catch any exceptions that may occur
#   during model training or evaluation, ensuring that the code can handle unforeseen issues
#   gracefully and continue execution.

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(years, passenger_data, test_size=0.2, random_state=42)

# Justification:
# - The train_test_split function divides the data into training and testing sets, allowing
#   for model training on one subset and evaluation on another. This helps assess the model's
#   generalization performance on unseen data.

# Initialize regression models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "SVR": SVR()
}

# Justification:
# - Multiple regression models, including Linear Regression, Ridge Regression, Lasso Regression,
#   and Support Vector Regression (SVR), are initialized to compare their performance in predicting
#   passenger amounts. This enables exploration of different modeling approaches to find the most
#   suitable one for the given dataset.

# Train and evaluate each model
for name, model in models.items():
    print("Training and evaluating", name)
    train_and_evaluate_model(model, X_train, X_test, y_train, y_test)

# Justification:
# - The loop iterates over each regression model, calling the train_and_evaluate_model function
#   for each model to train, evaluate, and print the MSE. This allows for a comparative analysis
#   of the models' performance in predicting passenger amounts, aiding in model selection and
#   decision-making.


Training and evaluating Linear Regression
Mean Squared Error: 1120507923422.285
Training and evaluating Ridge Regression
Mean Squared Error: 1245769379234.805
Training and evaluating Lasso Regression
Mean Squared Error: 1120508728852.5547
Training and evaluating SVR
Error occurred: y should be a 1d array, got an array of shape (3, 24) instead.


In [237]:
https://github.com/FarhadKhankishiyev068/2023068_BIGData_CA2/blob/main/RepeatProgramming.ipynb

SyntaxError: invalid decimal literal (<ipython-input-237-c74682e2a5c7>, line 1)