In [None]:
# For data manipulation
import pandas as pd

# for scientific computation
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm

# for diagramming 
import matplotlib.pyplot as plt
import seaborn as sns

# For serialization and deserialization of data from/to file
import pickle

## Stage 2: Data Preparation
### 1. Collect and load relevant data from various sources

In [None]:
# Load the US inflation dataset (Andreas data) from the local exam data folder
inflation_csv = "./data/data-exam/US_inflation_rates.csv"
df_inflation = pd.read_csv(inflation_csv)

In [None]:
# Preview the first rows of the freshly loaded frame
df_inflation.head()

In [None]:
# Dimensions of the frame as (rows, columns)
df_inflation.shape

In [None]:
# Column labels of the frame as a plain Python list
df_inflation.columns.tolist()

In [None]:
# Display the frame itself.
# NOTE(review): head()/shape are already shown in the preceding cells; consider
# whether this full display is still needed.
df_inflation

In [None]:
# Print the column dtypes and non-null counts
df_inflation.info()

### 2. Clean and integrate the collected data in appropriate data structures.
Apply any transformations needed for the integration and the operations - ETL (Extract Transform Load) or (Extract Load Transform).

In [None]:
# Number of missing entries per column
df_inflation.isna().sum()

In [None]:
# Visual check for missing cells: any missing entry shows up as a contrasting stripe
missing_mask = df_inflation.isnull()
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='viridis')

### 3. Explore the data by applying measures from statistics to discover its basic features.
Create charts and diagrams to visualize the features for better understanding and support of further decisions.

In [None]:
# Descriptive statistics for the frame's columns
df_inflation.describe()

In [None]:
# Show 5 randomly chosen rows; the fixed seed keeps the displayed sample
# identical across Restart & Run All
df_inflation.sample(5, random_state=123)

In [None]:
# Parse the 'date' column as datetimes (a no-op if it already is datetime)
parsed_dates = pd.to_datetime(df_inflation['date'])
df_inflation['date'] = parsed_dates

# Derive the calendar year of every observation as its own column
df_inflation['year'] = parsed_dates.dt.year

In [None]:
# Scatter plot of every observation in the US Inflation dataset over time.
plt.xlabel('year')
plt.ylabel('value')
plt.scatter(df_inflation.date, df_inflation.value, color='red')
# All rows are plotted, so the title makes no claim about sub-sampling.
plt.title('US Inflation Rate Over Time')
# The x-axis holds datetimes: matplotlib's default date locator already picks
# sensible year ticks. (An integer MaxNLocator would operate on the underlying
# day numbers and place ticks on arbitrary dates.)
plt.xticks(rotation=45)  # Rotate x-axis labels for readability
plt.show()


In [None]:
# Preview the frame again, now including the derived 'year' column
df_inflation.head()

In [None]:
# NOTE(review): this whole cell is a disabled experiment (every statement is
# commented out); nothing here runs.
# Filter the DataFrame to include only the first 12 years of data
#start_year = min(df_inflation.date.dt.year)
#end_year = start_year + 12
#filtered_df = df_inflation[(df_inflation.date.dt.year >= start_year) & (df_inflation.date.dt.year <= end_year)]

# Create the scatter plot with the filtered data
#plt.xlabel('Year')
#plt.ylabel('Inflation Value')
#plt.scatter(filtered_df.date, filtered_df.value, color='red')
#plt.title('US Inflation Rate Over the First 12 Years (1947-1960)')
#plt.xticks(rotation=45)  # Rotate x-axis labels for readability
#plt.gca().xaxis.set_major_locator(plt.MaxNLocator(integer=True))  # Show only integer values on the x-axis
#plt.show()

In [None]:
# Density-normalised histogram of the years with a KDE curve overlaid.
# histplot replaces the deprecated sns.distplot; stat='density' corresponds to
# the old norm_hist=True and kde=True keeps the density curve distplot drew.
sns.histplot(df_inflation['year'], label='year', stat='density', kde=True)

In [None]:
# Density-normalised histogram of the 'value' column with a KDE curve overlaid.
# histplot replaces the deprecated sns.distplot; stat='density' corresponds to
# the old norm_hist=True and kde=True keeps the density curve distplot drew.
sns.histplot(df_inflation['value'], label='value', stat='density', kde=True)

### 4. Apply the necessary pre-processing to prepare the data for machine learning analysis, ensuring that the data is: 
#### a. meaningful – describes relevant and correctly measured features and observations

#### b. sufficient – describes various cases and feature occurrences, decided by testing
#### c. shaped – presented in a structure, appropriate for processing by machine learning algorithms
#### d. cleaned – repaired from missing values and outliers
- DONE
#### e. scaled – transform data distributions in comparable scales, when necessary
- DONE
#### f. engineered – analyse all features and select the most informative for further processing

## Stage 3: Solution Prototype

Objective: Using data and analysis for building predictive models

Extend the data analysis by implementing machine learning and deep learning methods and algorithms.


1. Select relevant methods that could solve the problem. Train, test and validate data models by use of supervised and unsupervised methods, neural networks or graphs.

### Train the data model

#### Split the Data in Dependent y and Independent X Data Sets

In [None]:
# Independent variable: the year, shaped as a single-column 2-D array
X = df_inflation['year'].to_numpy().reshape(-1, 1)

In [None]:
# Dependent variable: the inflation value, shaped as a single-column 2-D array
y = df_inflation['value'].to_numpy().reshape(-1, 1)

In [None]:
# Scatter of the target against the feature before any modelling
fig, ax = plt.subplots()
ax.scatter(X, y, color='red')
ax.set_xlabel('year')
ax.set_ylabel('value')
plt.show()

In [None]:
# Preview the source frame that X and y were taken from
df_inflation.head()

#### Split the Data in Training and Testing Sets

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Report the shape of each subset, one per line
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

In [None]:
# Inspect the held-out target values
y_test

In [None]:
# Ordinary least-squares estimator with default settings
myreg = linear_model.LinearRegression()

In [None]:
# Fit on the training split; fit() returns the estimator itself, so the
# fitted model is what this cell displays
myreg.fit(X_train, y_train)

In [None]:
# Slope (a) and intercept (b) of the fitted line
a, b = myreg.coef_, myreg.intercept_

In [None]:
# Show the fitted slope
a

In [None]:
# Show the fitted intercept
b

In [None]:
# Predict the target for the held-out years and display the predictions
y_predicted = myreg.predict(X_test)
y_predicted

In [None]:
# Draw all observations together with the fitted line on both splits
fig, ax = plt.subplots()
ax.scatter(X, y, color='green')
ax.plot(X_train, a*X_train + b, color='blue')    # line evaluated on the training years
ax.plot(X_test, y_predicted, color='orange')     # model predictions on the test years
ax.set_title('Linear Regression')
ax.set_xlabel('date by year')
ax.set_ylabel('value')
plt.show()

In [None]:
# Coefficient of determination (R^2) on the held-out test split only.
# Scoring on the full X, y would include the rows the model was fitted on
# and overstate how well it generalises.
R2 = myreg.score(X_test, y_test)
R2

### Polyfit
#### Split the Data in Dependent y and Independent X Data Sets

In [None]:
# Rebind X and y to the 1-D pandas columns (np.polyfit below takes 1-D input);
# this replaces the 2-D arrays used for the sklearn model
X = df_inflation['year']
y = df_inflation['value']

#### Split the Data in Training and Testing Sets

In [None]:
# Split the set into subsets for training (80%) and testing (20%).
# train_test_split is already imported in the notebook's import cell, so it is
# not re-imported here. The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

In [None]:
# Shapes of the four subsets, printed one per line
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

In [None]:
# Inspect the training target values
y_train

In [None]:
# Preview the source frame the split was drawn from
df_inflation.head()

#### Train data model

In [None]:
# Least-squares fit of a degree-1 polynomial (a straight line) to the training split
model = np.polyfit(X_train, y_train, deg=1)

In [None]:
# Show the fitted polynomial coefficients, highest power first
model

In [None]:
# Unpack slope and intercept of the training fit (polyfit returns highest power first)
a, b = model

In [None]:
# Fit a second degree-1 polynomial, this time on the test split.
# NOTE(review): this fits a separate line to the test data rather than
# evaluating `model` on it — confirm that comparing the two fits is the intent.
test = np.polyfit(X_test, y_test, 1)
test

In [None]:
# Unpack slope and intercept of the line fitted to the test split
a1, b1 = test

In [None]:
# Draw all observations with the two polyfit lines on top
fig, ax = plt.subplots()
ax.scatter(X, y, color='green')
ax.plot(X_test, a1*X_test + b1, color='orange')   # line fitted to the test split
ax.plot(X_train, a*X_train + b, color='blue')     # line fitted to the training split
ax.set_title('Linear Regression')
ax.set_xlabel('date')
ax.set_ylabel('value')
plt.show()

### Test the models

#### Test with Known Data

In [None]:
# Ask the sklearn model for its prediction at year 2000.
# NOTE(review): despite its name, date_predicted holds the model output (the
# target fitted from df_inflation['value']), not a date.
date_predicted = myreg.predict([[2000]])
date_predicted

In [None]:
# Same query against the polyfit line: a and b were last assigned from the
# polyfit coefficients, so this evaluates slope*2000 + intercept.
# NOTE(review): as with date_predicted, the result is a predicted value, not a date.
date_predict = a*2000 + b
date_predict

### Predict with unknown Data

In [None]:
# NOTE(review): myreg was fitted with the year as its only feature, so 400 is
# passed to the model as a year here, despite the variable being called `value`
# and the result `date_predicted` — confirm the intended input and names.
value = 400
date_predicted = myreg.predict([[value]])
date_predicted

In [None]:
# Final preview of the frame used throughout the notebook
df_inflation.head()