# This is regression Project on Currency conversion rate

In [44]:
# Importing necessary libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import matplotlib.pyplot as plt

In [45]:
# Loading data

df = pd.read_csv('rand-dollar.csv', index_col=0)
df.head()

Unnamed: 0,ZAR/USD,Value of Exports (USD),Value of Exports (ZAR),Value of Imports (USD),Value of Imports (ZAR),IMF Reserve Position (USD),Foreign Exchange (USD),Claims on Non-residents (USD),Liabilities to Non-residents (USD),Savings Rate,Lending Rate,Government Bonds,"Financial Market Prices, Equities Index",Consumer Price Index
2008M01,7.01,5611.9,39356.82,8105.11,56841.94,1.92,29526.78,51547.61,37752.29,4.53,14.5,8.36,312.97,85.48
2008M02,7.66,6126.16,46946.33,8159.43,62527.78,2.01,29943.04,59702.31,45927.67,5.61,14.5,8.69,335.09,85.91
2008M03,7.97,6417.48,51150.89,8010.87,63851.12,2.05,30246.54,56448.18,42565.17,5.62,14.5,9.16,348.84,87.04
2008M04,7.79,7215.71,56174.34,9112.97,70944.49,2.03,30399.21,55534.75,43342.7,5.8,15.0,9.15,358.47,87.61
2008M05,7.62,7379.67,56240.45,9090.14,69275.97,2.1,30467.81,57682.89,46533.13,6.5,15.0,9.51,371.98,88.32


#### We will train the data to predict ZAR/USD.

## 1. Exploratory Data Analysis

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 2008M01 to 2017M12
Data columns (total 14 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   ZAR/USD                                  120 non-null    float64
 1   Value of Exports (USD)                   120 non-null    float64
 2   Value of Exports (ZAR)                   120 non-null    float64
 3   Value of Imports (USD)                   120 non-null    float64
 4   Value of Imports (ZAR)                   120 non-null    float64
 5   IMF Reserve Position (USD)               120 non-null    float64
 6   Foreign Exchange (USD)                   120 non-null    float64
 7   Claims on Non-residents (USD)            120 non-null    float64
 8   Liabilities to Non-residents (USD)       120 non-null    float64
 9   Savings Rate                             120 non-null    float64
 10  Lending Rate                             120 

#### From the above output, we can see that there are no missing value since all columns have the required number of entries with correct data types

In [47]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ZAR/USD,120.0,10.083667,2.651753,6.74,7.66,9.82,12.34,16.37
Value of Exports (USD),120.0,7309.426167,1270.955437,3662.05,6505.7075,7538.735,8074.45,10142.41
Value of Exports (ZAR),120.0,72584.39125,18333.43841,36251.75,59877.94,71426.1,87488.5325,116185.99
Value of Imports (USD),120.0,7847.494083,1261.457881,4725.01,6916.105,7802.22,8922.6675,10589.92
Value of Imports (ZAR),120.0,77969.693,18870.625678,43062.14,61254.9325,80262.36,95364.195,115210.99
IMF Reserve Position (USD),120.0,180.203667,241.856486,1.92,2.3275,65.17,193.7075,669.17
Foreign Exchange (USD),120.0,37743.736167,4044.555006,29526.78,35048.8725,39067.49,41035.3875,42735.0
Claims on Non-residents (USD),120.0,50059.19075,4597.734275,40520.11,47196.515,50127.57,53244.93,59702.31
Liabilities to Non-residents (USD),120.0,39189.093917,4051.000202,31683.49,36310.975,38698.145,41990.3775,48338.35
Savings Rate,120.0,3.63725,1.295459,2.4,2.7,3.08,4.245,7.31


## 2. Data Preprocessing

In [48]:
# Splitting data into features and target

x = df.drop(columns='ZAR/USD', axis = 1)
y = df['ZAR/USD']

In [49]:
# Feature scaling (Standardization)

scaler = StandardScaler()
X = scaler.fit_transform(x)

In [50]:
# Splitting into train and test data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = False)


## 3. Training and evaluating model

In [51]:
# Instantiate model and train model

lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict the target using X_test

y_pred = lr.predict(X_test)

# Evaluation of model using Mean Squared Error

mserror = mean_squared_error(y_test, y_pred)
R2score = r2_score(y_test, y_pred)
mserror, R2score

(1.2766295190540433, -0.4671516405579963)

#### The R-Squared value being negative implies that the model thus linear regression is worse at predicting the target variable than if a constant line y = intercept was used

#### We will then resort to other regression and lookout for the one that will give a better prediction thus a highe positive R-squared and a lower Mean Square Error (MSE).
#### we will consider the following regressions: Decision Tree, Random Forest

### Decision Tree

In [56]:
from sklearn.tree import DecisionTreeRegressor

dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train, y_train)

# Predict and Evaluate the Model
y_pred = dt_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Decision Tree Mean Squared Error: {mse}')
print(f'Decision Tree R^2 Score: {r2}')

Decision Tree Mean Squared Error: 1.2273291666666672
Decision Tree R^2 Score: -0.4104937834383977


#### The Decision Tree also gives a negative R-Squared meaning it is not a good model for predicting the target

### Random Forest

In [57]:
from sklearn.ensemble import RandomForestRegressor

# Train the Random Forest Model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Predict and Evaluate the Model
y_pred_rf = rf_regressor.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f'Random Forest Mean Squared Error: {mse_rf}')
print(f'Random Forest R^2 Score: {r2_rf}')

Random Forest Mean Squared Error: 0.6393976387500092
Random Forest R^2 Score: 0.2651796933568531


#### With a Mean Squared Error of 0.64 and a positive R-Squared of 0.27, Random forest came up as the best model to train the dataset. 
#### We will conduct a hyperparameter tuning

### Hyperparameter Tuning

In [63]:
from sklearn.model_selection import GridSearchCV

# Define the model
rf = RandomForestRegressor(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200,300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_leaf': [None, 1, 2, 4],
    'max_features': [None, 'auto', 'sqrt', 'log2'] 
}

# Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best Parameters and Model Evaluation
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Random Forest Mean Squared Error: {mse_rf}')
print(f'Random Forest R^2 Score: {r2_rf}')

560 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hemed\.conda\envs\Python\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hemed\.conda\envs\Python\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\hemed\.conda\envs\Python\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\hemed\.conda\envs\Python\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_paramete

Best Parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 300}
Random Forest Mean Squared Error: 0.6750385125463283
Random Forest R^2 Score: 0.22421983328724093


#### The initial Random Forest performed better than the best of the hyperparameter tuning.

## Conclusion: The best model for training this dataset is Random Forest with 100 n_estimators and default parameters.