# Deep Learning Homework 2
Covariance, correlation, data handling and gradient descent

In [1]:
# Report the CPU model and the core/thread counts of the machine running the notebook.
# NOTE(review): the recorded output shows the shell could not find `lscpu`,
# so these commands printed no hardware information in that run.
!lscpu | grep 'Model name'
!lscpu | grep 'Core(s) per socket:'
!lscpu | grep 'Thread(s) per core'

zsh:1: command not found: lscpu
zsh:1: command not found: lscpu
zsh:1: command not found: lscpu


## Import modules

In [2]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
#!!pip install miceforest
from miceforest import ImputationKernel

ModuleNotFoundError: No module named 'miceforest'

## Read data from csv file

### Read advertising csv

In [None]:
# Load the Advertising data from the working directory; the bare name on the
# last line makes the notebook display the frame.
df_advertising = pd.read_csv('Advertising.csv')
df_advertising

In [None]:
# Print the column summary (dtype and non-null count per column).
df_advertising.info()

### Read articulos csv

In [None]:
# Load the articles data from the working directory; the bare name on the
# last line makes the notebook display the frame.
df_papers = pd.read_csv('articulos_ml.csv')
df_papers

In [None]:
# Print the column summary (dtype and non-null count per column); the non-null
# counts show which columns have missing values.
df_papers.info()

## Data imputation and cleaning data

### Remove the first column from the Advertising dataset

In [None]:
# Keep every column except the first one.
# NOTE(review): this cell rebinds df_advertising from itself, so it is not
# idempotent: each re-run drops one more leading column.
df_advertising = df_advertising[df_advertising.columns[1::]]
df_advertising

### Fill missing values with Multiple Imputation on Articulos dataset
This is a multivariate imputation technique, meaning that the missing information is filled by taking into consideration the information from the other columns.

In [None]:
# Build the imputation kernel on the columns of df_papers from the third one
# onward; the first two columns are not passed to the imputer.
# random_state is fixed so that repeated runs use the same seed.
mice_kernel = ImputationKernel(data = df_papers[df_papers.columns[2::]],
                               save_all_iterations = True,
                               random_state = 2023)

In [None]:
# Run the MICE procedure; the argument is presumably the number of iterations
# (TODO confirm against the installed miceforest version).
mice_kernel.mice(2)
# Retrieve the dataset from the kernel after imputation and display it.
mice_imputation_df_papers = mice_kernel.complete_data()
mice_imputation_df_papers

## See how data is related

In [None]:
def plotCaracteristicas(data):
    """Scatter-plot every column of ``data`` against its last column.

    One subplot is drawn per column (the last column is also plotted against
    itself), laid out in a grid that is two subplots wide.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last column is used as the y-axis of every subplot.
    """
    col_list = data.columns
    title = "Relation between %s and %s"
    n_grid_cols = 2
    # Keep the 7x2 grid (and the 10x14 figure) for up to 14 columns, and add
    # rows beyond that so plt.subplot is never asked for a slot outside the grid.
    n_grid_rows = max(7, (len(col_list) + n_grid_cols - 1) // n_grid_cols)
    plt.figure(figsize=(10, 2 * n_grid_rows))
    for i, col in enumerate(col_list, start=1):
        plt.subplot(n_grid_rows, n_grid_cols, i)
        plt.plot(data[col], data[col_list[-1]], marker='.', linestyle='none')
        plt.title(title % (col, col_list[-1]))
    # Lay out once, after every subplot and title exists, instead of once per subplot.
    plt.tight_layout()

We expect the strongest correlation to be between "TV" and "Sales".

In [None]:
# Scatter every Advertising column against the last column of the frame.
plotCaracteristicas(df_advertising)

We expect the strongest correlation to be between "Word count" and "# Shares".

In [None]:
# Scatter every column of the imputed articles data against the last column of the frame.
plotCaracteristicas(mice_imputation_df_papers)

## Covariance and Pearson's correlation

### Defining functions for covariance and correlation

In [None]:
def cov(x, y):
  """Return the population covariance of two equally long 1-D sequences.

  Computes ``mean((x - mean(x)) * (y - mean(y)))``, i.e. the sum of the
  products of the deviations divided by ``n`` (not ``n - 1``).

  Parameters
  ----------
  x, y : array-like
      Numeric sequences of the same length.

  Returns
  -------
  numpy.floating
      The covariance of ``x`` and ``y``.

  Raises
  ------
  ValueError
      If the inputs differ in shape or are empty.
  """
  x = np.asarray(x); y = np.asarray(y)
  if x.shape != y.shape:
    raise ValueError("x and y must have the same shape, got %s and %s" % (x.shape, y.shape))
  if x.size == 0:
    raise ValueError("x and y must not be empty")
  # Both means are computed once and the products are formed in a single
  # vectorised pass, instead of re-evaluating np.mean for every element.
  return np.mean((x - np.mean(x)) * (y - np.mean(y)))

In [None]:
def cov_matrix(df):
  """Build the pairwise covariance matrix of the columns of ``df``.

  Entry ``(i, j)`` is ``cov`` applied to the i-th and j-th columns, following
  the order of ``df.columns``. The nested list of entries is returned as a
  NumPy array.
  """
  names = df.columns
  entries = [[cov(df[row_name], df[col_name]) for col_name in names]
             for row_name in names]
  return np.array(entries)

In [None]:
def corr_matrix(df):
  """Build the pairwise Pearson correlation matrix of the columns of ``df``.

  Entry ``(i, j)`` is ``cov(col_i, col_j) / sqrt(var_i * var_j)``, where
  ``var_k = cov(col_k, col_k)``, following the order of ``df.columns``.
  The nested list of entries is returned as a NumPy array.

  A column with zero variance makes the denominator zero, so the entries
  involving it are not meaningful correlations.
  """
  columns = list(df.columns)
  # Every entry needs the variances of its two columns; compute each variance
  # once per column rather than twice per matrix entry.
  variances = [cov(df[name], df[name]) for name in columns]
  M_corr = []
  for i, x in enumerate(columns):
    m_corr = []
    for j, y in enumerate(columns):
      m_corr.append(cov(df[x], df[y])/math.sqrt(variances[i]*variances[j]))
    M_corr.append(m_corr)
  return np.array(M_corr)

## Covariance

### Covariance for Advertising

In [None]:
# Print NumPy arrays with two decimals so the matrix is easier to read.
np.set_printoptions(precision=2)
M_cov_advertising = cov_matrix(df_advertising)

# Show the column labels next to the matrix so its rows/columns can be identified.
df_advertising.columns, M_cov_advertising

### Covariance for Articulos

In [None]:
# Print NumPy arrays with two decimals so the matrix is easier to read.
np.set_printoptions(precision=2)
M_cov_articulos = cov_matrix(mice_imputation_df_papers)
# Show the column labels next to the matrix so its rows/columns can be identified.
mice_imputation_df_papers.columns, M_cov_articulos

In [None]:
# Switch array printing back to 8 digits (NumPy's default precision) for the cells below.
np.set_printoptions(precision=8)

## Pearson's correlation

### Correlation for Advertising
As expected, "TV" and "Sales" turn out to be the most strongly correlated pair.

In [None]:
# Pearson correlation matrix of the Advertising columns, shown with its column labels.
M_corr_advertising = corr_matrix(df_advertising)
df_advertising.columns, M_corr_advertising

In [None]:
# Close-up scatter plot of the Advertising pair singled out above.
x,y = 'TV', 'Sales'
title = "Relation between %s and %s"
fig = plt.figure(figsize=(2, 2))
plt.plot(df_advertising[x],df_advertising[y],marker='.',linestyle='none')
plt.title(title % (x, y))
# tight_layout only accounts for artists that already exist, so it has to run
# after the title is set; otherwise the title is left out of the layout.
plt.tight_layout()

### Correlation for Articulos
Here our hypothesis was wrong: it turns out that "# of comments" and "# Shares" are the most strongly correlated pair.

In [None]:
# Pearson correlation matrix of the imputed articles columns, shown with its column labels.
M_corr_articulos = corr_matrix(mice_imputation_df_papers)
mice_imputation_df_papers.columns, M_corr_articulos

In [None]:
# Close-up scatter plot of the articles pair singled out above.
# Alternative pair to inspect: x,y = 'Word count', '# Shares'
x,y = '# of comments', '# Shares'
title = "Relation between %s and %s"
fig = plt.figure(figsize=(2.5, 2))
plt.plot(mice_imputation_df_papers[x],mice_imputation_df_papers[y],marker='.',linestyle='none')
plt.title(title % (x, y))
# tight_layout only accounts for artists that already exist, so it has to run
# after the title is set; otherwise the title is left out of the layout.
plt.tight_layout()

## Regression by Gradient Descent

### Defining the gradient descent function

In [None]:
def GD(trX, trY, b_0, b_1, learningRate, num_steps):
    """Fit ``y = b_0 + b_1 * x`` by batch gradient descent on the mean squared error.

    Parameters
    ----------
    trX, trY : array-like
        Equally long 1-D numeric sequences (lists, NumPy arrays or pandas
        Series). They are converted to NumPy arrays, so a pandas index is
        ignored and elements are always paired by position.
    b_0, b_1 : float
        Initial intercept and slope.
    learningRate : float
        Step size applied to both gradients.
    num_steps : int
        Last iteration index; the loop runs over indices ``0..num_steps``
        inclusive, i.e. at most ``num_steps + 1`` updates.

    Returns
    -------
    tuple
        ``(b_0, b_1)`` after the last update. The result and the index of the
        last iteration are also printed.

    Raises
    ------
    ValueError
        If ``trX`` and ``trY`` differ in shape or are empty.
    """
    criteria = 1e-8
    x = np.asarray(trX, dtype=float)
    y = np.asarray(trY, dtype=float)
    if x.shape != y.shape:
        raise ValueError("trX and trY must have the same shape, got %s and %s" % (x.shape, y.shape))
    if x.size == 0:
        raise ValueError("trX and trY must not be empty")

    step = 0  # keeps the report below valid even if the loop body never runs
    for step in range(0, num_steps+1):
        # Gradients of mean((y - (b_0 + b_1 * x))**2) with respect to b_0 and b_1.
        residual = y - (b_0 + b_1 * x)
        b_0_gradient = -2.0 * np.mean(residual)
        b_1_gradient = -2.0 * np.mean(residual * x)

        b_0 = b_0 - (learningRate * b_0_gradient)
        b_1 = b_1 - (learningRate * b_1_gradient)
        # Stop once the largest parameter change of this update falls below the threshold.
        if max(abs(learningRate * b_0_gradient), abs(learningRate * b_1_gradient)) < criteria:
            break

    # Results
    print("theta_0 and theta_1 are:", b_0, b_1, "in", step, 'epochs')
    return b_0, b_1

### Defining an auxiliary plot function

In [None]:
def plot(X,Y, theta_0, theta_1):
    """Draw the data points and the fitted line on the current axes.

    The points ``(X, Y)`` are drawn as a scatter plot, the model values
    ``theta_0 + theta_1 * X`` as a red line, and a legend is placed in the
    upper-left corner.
    """
    plt.scatter(X, Y, label='Y - data')
    plt.plot(X, theta_0 + theta_1 * X, 'r', label='h - model line')
    plt.legend(loc="upper left")
### Regression for the Advertising dataset with the most correlated variables

In [None]:
# Fit Sales as a linear function of TV, the pair singled out in the correlation section.
trX, trY = np.array(df_advertising['TV']), np.array(df_advertising['Sales'])
# Initial parameters: maximum epoch index, learning rate, and the starting
# intercept (theta_0) and slope (theta_1) handed to GD.
epochs = 10000
alpha = .000001
theta_0 = 0
theta_1 = 10

# GD returns the fitted parameters; they replace the initial values so the
# plot below draws the fitted line over the data.
theta_0, theta_1 = GD(trX, trY, theta_0, theta_1, alpha, epochs)
plot(trX, trY, theta_0, theta_1)

### Regression for the Articles dataset with the most correlated variables

In [None]:
# Fit '# Shares' as a linear function of '# of comments', the pair singled out
# in the correlation section.
x,y = '# of comments', '# Shares'
# Pass NumPy arrays, as the Advertising regression cell does, so that the
# fit pairs elements by position and does not depend on the DataFrame index.
trX, trY = np.array(mice_imputation_df_papers[x]), np.array(mice_imputation_df_papers[y])
# Initial parameters: maximum epoch index, learning rate, and the starting
# intercept (theta_0) and slope (theta_1) handed to GD.
epochs = 10000
alpha = .000001
theta_0 = 0
theta_1 = 0

theta_0, theta_1 = GD(trX, trY, theta_0, theta_1, alpha, epochs)
plot(trX, trY, theta_0, theta_1)

In [None]:
# Fit '# Shares' as a linear function of 'Word count', the pair originally
# expected to be the most correlated.
x,y = 'Word count', '# Shares'
# Pass NumPy arrays, as the Advertising regression cell does, so that the
# fit pairs elements by position and does not depend on the DataFrame index.
trX, trY = np.array(mice_imputation_df_papers[x]), np.array(mice_imputation_df_papers[y])
# Initial parameters: maximum epoch index, learning rate (smaller than in the
# other regression cells), and the starting intercept and slope handed to GD.
epochs = 10000
alpha = .00000001
theta_0 = 0
theta_1 = 0

theta_0, theta_1 = GD(trX, trY, theta_0, theta_1, alpha, epochs)
plot(trX, trY, theta_0, theta_1)