# Gradient Descent Implementation 

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, accuracy_score,precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


from imblearn.over_sampling._smote.base import SMOTE
import missingno as msno
import os
import tensorflow as tf
import warnings


In [None]:

## Runtime configuration: silence warnings and reduce TensorFlow logging.

warnings.filterwarnings('ignore')  # ignore every Python warning from here on
# NOTE(review): tensorflow is already imported before this line runs; this
# variable is normally read when TensorFlow loads, so setting it here may have
# no effect — confirm, or set it before `import tensorflow`.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.get_logger().setLevel('ERROR')  # TensorFlow's Python logger: ERROR level
tf.config.optimizer.set_jit(False)  # pass False to the optimizer's JIT switch


In [None]:

# Visualization settings: default figure size and seaborn style

plt.rc("figure", figsize=(10, 6))
sns.set_style(style="whitegrid")


In [None]:
#### Pandas display settings

_display_options = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.float_format': lambda x: '%.3f' % x,  # three decimal places
    'display.width': 500,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [None]:
# Load the report CSV into the DataFrame `datam`, which every later cell uses.
# NOTE(review): this is an absolute path on one user's machine; the cell only
# runs where that exact file exists.
datam= pd.read_csv('/Users/erkan/Downloads/feature_engineering/feature_engineering/datasets/report_2018-2019.csv')

In [None]:
# Preview the first rows of the loaded DataFrame.
datam.head()

In [None]:
# Count missing values per column. The DataFrame loaded by this notebook is
# `datam`; no variable called `data` is ever defined.
datam.isnull().sum()

In [None]:
# Count fully duplicated rows. The DataFrame loaded by this notebook is
# `datam`; no variable called `data` is ever defined.
datam.duplicated().sum()

In [None]:
# Split the columns of `datam` into three lists of column names.
# NOTE(review): grab_col_names is neither defined nor imported in the visible
# part of this file — confirm where it comes from before running this cell.
# Presumably the three results are categorical, numeric and
# high-cardinality categorical columns (inferred from the names only) —
# TODO confirm against the helper's definition.
cat_cols, num_cols, cat_but_car = grab_col_names(datam)

In [None]:
# Draw a box plot for every column listed in num_cols, showing each plot
# before the next one is drawn.
for numeric_column in num_cols:
    sns.boxplot(data=datam, x=numeric_column)
    plt.show()

In [None]:
# Frequency of each distinct value in the 'Country or region' column.
datam['Country or region'].value_counts()

In [None]:
# Remove the two columns that are not used as model inputs (modifies datam
# in place), then preview the result.
datam.drop(columns=['Country or region', 'Overall rank'], inplace=True)
datam.head()

In [None]:
# One-hot encode the 'Year' column; drop_first=True omits the indicator for
# the first category so the remaining indicator columns are not redundant.
datam = pd.get_dummies(datam, columns=['Year'], drop_first=True)
datam.head()

In [None]:
# Feature matrix: every remaining column except the target 'Score'.
X = datam.drop('Score', axis=1)

In [None]:
# Target: the 'Score' column (a pandas Series).
y = datam['Score']

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)

In [None]:
# Learn the scaling statistics from the training features only, then apply
# the same fitted transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Design matrix: prepend a column of ones so theta[0] acts as the intercept.
X_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
# Closed-form least-squares solution (normal equation) and its predictions on
# the training data.
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
y_predict = X_b.dot(theta_best)
theta_path_bgd = []  # receives the theta of every step when passed as theta_path
np.random.seed(42)
# One random starting weight per column of X_b (intercept included) rather
# than a hard-coded 8, so the shapes always agree with the design matrix.
theta = np.random.randn(X_b.shape[1], 1)
# y_train originates from datam['Score'], i.e. a pandas Series, which has no
# .reshape method; convert to a NumPy array before making it a column vector.
y_train = np.asarray(y_train).reshape(-1, 1)


def plot_gradient_descent(theta, eta, theta_path=None, n_iterations=1000,
                          n_plotted=10, axis_limits=(0, 2, 0, 15)):
    """Run batch gradient descent and draw the first few fitted predictions.

    The function reads the module-level ``X_b`` (design matrix), ``X_train``
    and ``y_train`` and draws on the current matplotlib axes. The ``theta``
    argument is not modified; each step creates a new array.

    Parameters
    ----------
    theta : starting parameter vector; must be compatible with ``X_b.dot``.
    eta : learning rate (step size) applied to the gradient.
    theta_path : optional list; when given, the theta obtained after every
        step is appended to it.
    n_iterations : number of gradient steps to run (default 1000, the value
        that used to be hard-coded).
    n_plotted : predictions are drawn only for the first ``n_plotted``
        iterations (default 10, as before).
    axis_limits : ``(xmin, xmax, ymin, ymax)`` passed to ``plt.axis``, or
        ``None`` to keep matplotlib's automatic limits. The default keeps
        the previously hard-coded window.
    """
    m = len(X_b)
    plt.plot(X_train, y_train, "b.")
    for iteration in range(n_iterations):
        # Computed once per step and shared by the plot and the gradient.
        predictions = X_b.dot(theta)
        if iteration < n_plotted:
            # The starting predictions are dashed red, later ones solid blue.
            style = "b-" if iteration > 0 else "r--"
            plt.plot(X_train, predictions, style)
        gradients = 2/m * X_b.T.dot(predictions - y_train)
        theta = theta - eta * gradients
        if theta_path is not None:
            theta_path.append(theta)
    plt.xlabel("$x_1$", fontsize=18)
    if axis_limits is not None:
        plt.axis(list(axis_limits))
    plt.title(r"$\eta = {}$".format(eta), fontsize=16)


# Three side-by-side panels, one per learning rate. Only the middle run
# records its theta path, and only the first panel gets a y-axis label.
plt.figure(figsize=(10, 4))
panels = (
    (131, 0.01, None),
    (132, 0.1, theta_path_bgd),
    (133, 1, None),
)
for subplot_code, rate, recorded_path in panels:
    plt.subplot(subplot_code)
    plot_gradient_descent(theta, eta=rate, theta_path=recorded_path)
    if subplot_code == 131:
        plt.ylabel("$y$", rotation=0, fontsize=18)

plt.show()


In [None]:
# Cost function used to monitor gradient descent (half the mean squared error)

def compute_cost(X, y, theta):
    """Return the squared-error cost of ``theta`` on ``(X, y)``.

    The value is ``sum((X.dot(theta) - y) ** 2) / (2 * len(X))``, i.e. half
    of the mean squared error.
    """
    residuals = X.dot(theta) - y
    n_samples = len(X)
    return (1 / (2 * n_samples)) * np.sum(residuals ** 2)

# Batch gradient descent that stops early once the cost stops changing

def gradient_descent(X, y, theta, eta, n_iterations, tol=1e-6):
    """Minimise the squared error of ``X.dot(theta)`` against ``y``.

    Each step moves ``theta`` against the gradient
    ``(2 / len(X)) * X.T.dot(X.dot(theta) - y)`` scaled by ``eta``; the cost
    after the step is obtained from ``compute_cost`` and recorded.

    Returns a tuple ``(theta, cost_history, steps)``: the final parameters,
    the list of costs (one per executed step), and the number of steps that
    were run. The loop ends early as soon as two consecutive costs differ by
    less than ``tol``; otherwise ``steps`` equals ``n_iterations``.
    """
    n_samples = len(X)
    costs = []
    previous_cost = None
    for step in range(1, n_iterations + 1):
        residuals = X.dot(theta) - y
        theta = theta - eta * ((2 / n_samples) * X.T.dot(residuals))

        # Record the cost reached after this update.
        current_cost = compute_cost(X, y, theta)
        costs.append(current_cost)

        # From the second step on, stop when the cost has (nearly) stalled.
        if previous_cost is not None and abs(current_cost - previous_cost) < tol:
            return theta, costs, step
        previous_cost = current_cost

    return theta, costs, n_iterations
# Initialize data
X_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]  # Add the bias term (1's column)
# y_train may still be a pandas Series here (it comes from datam['Score']),
# and a Series has no .reshape; np.asarray accepts both a Series and an array.
y_train = np.asarray(y_train).reshape(-1, 1)  # Ensure y_train is a column vector
theta = np.random.randn(X_b.shape[1], 1)  # Initialize theta randomly

# Run Gradient Descent
eta = 0.01  # Learning rate
n_iterations = 1000
# n_iterations is rebound to the number of steps actually executed, which is
# also the length of cost_history.
theta_best, cost_history, n_iterations = gradient_descent(X_b, y_train, theta, eta, n_iterations)

# Plotting the cost function over iterations
plt.figure(figsize=(10, 6))
plt.plot(np.arange(n_iterations), cost_history, label=f"Learning rate = {eta}")
plt.xlabel("Number of iterations")
plt.ylabel("Cost (MSE)")
plt.title("Convergence of Gradient Descent")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Same experiment with a step size of 0.1, starting from the same theta.
n_iterations = 1000
eta = 0.1  # Learning rate
theta_best, cost_history, n_iterations = gradient_descent(
    X_b, y_train, theta, eta=eta, n_iterations=n_iterations
)

# Cost per executed step (gradient_descent returns one cost per step).
plt.figure(figsize=(10, 6))
plt.plot(np.arange(len(cost_history)), cost_history, label=f"Learning rate = {eta}")
plt.title("Convergence of Gradient Descent")
plt.xlabel("Number of iterations")
plt.ylabel("Cost (MSE)")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Same experiment with a step size of 1, starting from the same theta.
n_iterations = 1000
eta = 1  # Learning rate
theta_best, cost_history, n_iterations = gradient_descent(
    X_b, y_train, theta, eta=eta, n_iterations=n_iterations
)

# Cost per executed step (gradient_descent returns one cost per step).
plt.figure(figsize=(10, 6))
plt.plot(np.arange(len(cost_history)), cost_history, label=f"Learning rate = {eta}")
plt.title("Convergence of Gradient Descent")
plt.xlabel("Number of iterations")
plt.ylabel("Cost (MSE)")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Initialize data
X_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]  # Add the bias term (1's column)
# y_train may still be a pandas Series here (it comes from datam['Score']),
# and a Series has no .reshape; np.asarray accepts both a Series and an array.
y_train = np.asarray(y_train).reshape(-1, 1)  # Ensure y_train is a column vector

# Learning rates to test
learning_rates = [0.01, 0.1, 1]
n_iterations = 1000
# Passed explicitly so the convergence check below uses the same tolerance
# as gradient_descent itself.
convergence_tol = 1e-6
results = {}

# Run Gradient Descent for each learning rate and track the number of iterations used
for eta in learning_rates:
    theta_initial = np.random.randn(X_b.shape[1], 1)  # Reinitialize theta for each case
    theta_best, cost_history, n_iter = gradient_descent(
        X_b, y_train, theta_initial, eta, n_iterations, tol=convergence_tol
    )
    results[eta] = n_iter

    # gradient_descent returns n_iterations both when the budget ran out and
    # when it converged on the very last step, so decide from the last cost
    # change. A diverged run (inf/nan costs) fails this comparison.
    converged = (
        len(cost_history) > 1
        and abs(cost_history[-1] - cost_history[-2]) < convergence_tol
    )
    if converged:
        print(f"Learning rate {eta} took {n_iter} iterations to converge.")
    else:
        print(f"Learning rate {eta} did not converge within {n_iter} iterations.")

    # Plot the cost function over iterations
    plt.plot(np.arange(len(cost_history)), cost_history, label=f"eta = {eta}")

# Plot the cost function for different learning rates
plt.xlabel("Number of iterations")
plt.ylabel("Cost (MSE)")
plt.title("Convergence of Gradient Descent for Different Learning Rates")
plt.legend()
plt.grid(True)
plt.show()

## A learning rate of 1 looks like it overshoots.
## A learning rate of 0.01 looks like it converges very slowly.
## The best learning rate of the three is 0.1.
