In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

## TASK1
X = [7, 12, 5, 18, 5, 9, 13, 12, 19, 7, 12, 12, 13, 3, 4, 5, 13, 8, 7, 6] 
X = sorted(X)

print(f'Mean: {np.mean(X)}')
print(f'Sorted list: {X}')
print(f'Mean: {np.median(X)}')
print(f'Standard deviation: {round(np.std(X), 2)}')

# Convert to pandas DataFrame
df = pd.DataFrame(X, columns=['X'])
boxplot = df.boxplot(column=['X'])


# Find outliers by using IQR
# IQR tells how spread the middle values are.
# It can be used to tell when a value is too far from the middle.

# An outlier is a point which falls more than 1.5 times the interquartile range above the third quartile 
# or below the first quartile.

# Steps:
# Arrange the data in increasing order
print(f'Sorted list in increasing order : {X}')
# Calculate first(q1) and third quartile(q3)
q1, q3 = np.percentile(X,[25,75])
print(f'First quartile: {q1}\nThird quartile: {q3}')
# Find interquartile range (q3-q1)
iqr = q3 - q1
print(f'IQR: {iqr}')
# Find lower bound q1*1.5
lower_bound = q1 -(1.5 * iqr) 
# Find upper bound q3*1.5
upper_bound = q3 +(1.5 * iqr) 
print(f'Lower bound: {lower_bound}\nUpper bound: {upper_bound}')
# Anything that lies outside of lower and upper bound is an outlier

Mean: 9.5
Sorted list: [3, 4, 5, 5, 5, 6, 7, 7, 7, 8, 9, 12, 12, 12, 12, 13, 13, 13, 18, 19]
Mean: 8.5
Standard deviation: 4.42
Sorted list in increasing order : [3, 4, 5, 5, 5, 6, 7, 7, 7, 8, 9, 12, 12, 12, 12, 13, 13, 13, 18, 19]
First quartile: 5.75
Third quartile: 12.25
IQR: 6.5
Lower bound: -4.0
Upper bound: 22.0


In [6]:
X = [1, 4, 3, 5]
y = [5,2.75, 3, 2.5]

df = pd.DataFrame(
    {'X': X,
     'y': y}
)

print(df,'\n')

# Calculate the mean of X and y
xmean = np.mean(X)
ymean = np.mean(y)

print(f'The mean of X: {xmean}')
print(f'The mean of y: {ymean}')

# Calculate the terms needed for the numator and denominator of beta
df['xycov'] = (df['X'] - xmean) * (df['y'] - ymean)
df['xvar'] = (df['X'] - xmean)**2
df['yvar'] = (df['y'] - ymean)**2


# Calculate beta and alpha
beta = round(df['xycov'].sum() / df['xvar'].sum(), 2)
alpha = round(ymean - (beta * xmean), 2)
print(f'alpha = {alpha}')
print(f'beta = {beta}')

ypred = alpha + beta * df['X']
print(list(ypred))

   X     y
0  1  5.00
1  4  2.75
2  3  3.00
3  5  2.50 

The mean of X: 3.25
The mean of y: 3.3125
alpha = 5.39
beta = -0.64
[4.75, 2.8299999999999996, 3.4699999999999998, 2.1899999999999995]


In [29]:

r = round(df['xycov'].sum() /  np.sqrt(df['xvar'].sum()* df['yvar'].sum()), 2)
print(f' Correlation coefficient r: {r}')

 Correlation coefficient r: -0.95


In [15]:
# Non Linear regression: Square root curve
df['sqrtx'] = np.sqrt(df['X'])
print(df,'\n')
xmean = np.mean(df['sqrtx'])
ymean = np.mean(y)

print(f'The mean of X: {xmean}')
print(f'The mean of y: {ymean}')


df['xycov'] = (df['sqrtx'] - xmean) * (df['y'] - ymean)
df['xvar'] = (df['sqrtx'] - xmean)**2
df['yvar'] = (df['y'] - ymean)**2


# Calculate beta and alpha
beta = round(df['xycov'].sum() / df['xvar'].sum(), 2)
alpha = round(ymean - (beta * xmean), 2)
print(f'alpha = {alpha}')
print(f'beta = {beta}')

r = round(df['xycov'].sum() /  np.sqrt(df['xvar'].sum()* df['yvar'].sum()), 2)
print(f'Correlation coefficient r: {r}')

   X     y     xycov      xvar      yvar     sqrtx
0  1  5.00 -1.252175  0.550608  2.847656  1.000000
1  4  2.75 -0.145108  0.066549  0.316406  2.000000
2  3  3.00  0.003118  0.000100  0.097656  1.732051
3  5  2.50 -0.401406  0.244074  0.660156  2.236068 

The mean of X: 1.7420296962671666
The mean of y: 3.3125
alpha = 6.94
beta = -2.08
Correlation coefficient r: -0.98
