In [None]:
# pip section

!pip install pandas --quiet

In [None]:
!pip install plotly matplotlib seaborn --quiet

In [None]:
!pip install scikit-learn --quiet

# LINEAR REGRESSION AND STOCHASTIC GRADIENT DESCENT

In [None]:
# imports

from urllib.request import urlretrieve
import pandas as pd
import numpy as np

In [None]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Location of the medical-charges CSV (raw file in the JovianML/opendatasets GitHub repo).
medical_charges_url = 'https://raw.githubusercontent.com/JovianML/opendatasets/master/data/medical-charges.csv'
# Download the CSV into the working directory as 'medical.csv'.
# Note: this re-downloads the file every time the cell is run.
urlretrieve(medical_charges_url, 'medical.csv')

In [None]:
medical_df = pd.read_csv('medical.csv')

In [None]:
# Analysis

medical_df.info()
medical_df.describe()

In [None]:
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [None]:
fig = px.histogram(medical_df, 
                   x='age', 
                   marginal='box', 
                   nbins=47, 
                   title='Distribution of Age')
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
fig = px.histogram(medical_df, 
                   x='bmi', 
                   marginal='box', 
                   color_discrete_sequence=['red'], 
                   title='Distribution of BMI (Body Mass Index)')
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
fig = px.histogram(medical_df, 
                   x='charges', 
                   marginal='box', 
                   color='smoker', 
                   color_discrete_sequence=['green', 'grey'], 
                   title='Annual Medical Charges')
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
charges_sex_fig = px.histogram(medical_df, 
                   x='charges', 
                   marginal='box', 
                   color='sex', 
                   color_discrete_sequence=['blue', 'pink'], 
                   title='Charges wrt Sex')
charges_sex_fig.update_layout(bargap=0.1)
charges_sex_fig.show()

In [None]:
charges_reg_fig = px.histogram(medical_df, 
                   x='charges', 
                   marginal='box', 
                   color='region', 
                   # color_discrete_sequence=['blue', 'pink'], 
                   title='Charges wrt Region')
charges_reg_fig.update_layout(bargap=0.1)
charges_reg_fig.show()

In [None]:
medical_df.smoker.value_counts()

In [None]:
px.histogram(
    medical_df,
    x = 'smoker',
    color = 'sex',
    title = 'Smoker'
)

In [None]:
medical_df.info()

In [None]:
medical_df.children.value_counts()
px.histogram(
    medical_df,
    x = 'charges',
    color = 'children',
    title = 'Charges wrt children'
).update_layout(bargap = 0.1)

In [None]:
fig = px.scatter(medical_df, 
                 x='bmi', 
                 y='charges', 
                 color='smoker', 
                 opacity=0.8, 
                 hover_data=['sex'], 
                 title='BMI vs. Charges')
fig.update_traces(marker_size=5)
fig.show()

In [None]:
# Correlation

medical_df.info()
smoker_values = {"yes": 1, "no": 0}
smoker_numeric = medical_df.smoker.map(smoker_values)


In [None]:
medical_df.charges.corr(medical_df.age)

In [None]:
non_smoker_df = medical_df[medical_df.smoker == "no"]
non_smoker_df

plt.title("Charges vs. Non smokers age")
sns.scatterplot(data = non_smoker_df, x = 'age', y = 'charges', alpha = 0.7, s = 15)

In [None]:
def estimate_charges(age, w, b):
    """Estimate charges from age with the linear model ``w * age + b``.

    Args:
        age: A single age or an array-like of ages (e.g. a pandas Series).
        w: Slope of the line (change in charges per year of age).
        b: Intercept of the line (estimated charges at age 0).

    Returns:
        The estimated charges, with the same shape as ``age``.
    """
    age_term = w * age
    return age_term + b

In [None]:
ages = non_smoker_df.age
estimated_charges = estimate_charges(ages, w = 50, b = 100)

plt.plot(ages, estimated_charges, 'r-o')
plt.xlabel("Age")
plt.ylabel("Charges")

In [None]:
target = non_smoker_df.charges

# plt.plot(ages, estimated_charges, 'r', alpha=0.9)
# plt.scatter(ages, target, s=8,alpha=0.8)
# plt.xlabel('Age')
# plt.ylabel('Charges')
# plt.legend(['Estimate', 'Actual'])

In [None]:
def try_params(w, b):
    """Plot the line ``w * age + b`` over the non-smokers' actual charges.

    Reads ages and charges from the notebook-level ``non_smoker_df`` and
    computes the estimates with ``estimate_charges``.

    Args:
        w: Slope of the candidate line.
        b: Intercept of the candidate line.
    """
    ages = non_smoker_df.age
    charges = non_smoker_df.charges

    est_charges = estimate_charges(ages, w, b)

    plt.plot(ages, est_charges, 'r', alpha=0.9)
    # Scatter the locally selected `charges` rather than the notebook-global
    # `target`, so the function does not rely on another cell having run.
    plt.scatter(ages, charges, s=8, alpha=0.8)
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual'])

try_params(400, 5000)

In [None]:
def rmse(targets, predictions):
    """Return the root mean squared error between targets and predictions.

    The two inputs are subtracted element-wise, so they are expected to have
    matching (or broadcastable) shapes.
    """
    errors = predictions - targets
    mean_squared_error = np.mean(np.square(errors))
    return np.sqrt(mean_squared_error)

def try_params(w, b):
    """Plot the line ``w * age + b`` over the non-smokers' charges and print its RMSE.

    Reads ages and charges from the notebook-level ``non_smoker_df``,
    computes the estimates with ``estimate_charges`` and the loss with ``rmse``.

    Args:
        w: Slope of the candidate line.
        b: Intercept of the candidate line.
    """
    ages = non_smoker_df.age
    charges = non_smoker_df.charges

    est_charges = estimate_charges(ages, w, b)

    plt.plot(ages, est_charges, 'r', alpha=0.9)
    # Scatter the locally selected `charges` rather than the notebook-global
    # `target`, so the plot and the printed loss use the same data.
    plt.scatter(ages, charges, s=8, alpha=0.8)
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual'])

    loss = rmse(charges, est_charges)
    print(f"Root mean squared error is {loss}")

In [None]:
def rmse(targets, predictions):
    """Return the root mean squared error between targets and predictions.

    The two inputs are subtracted element-wise, so they are expected to have
    matching (or broadcastable) shapes.
    """
    errors = predictions - targets
    mean_squared_error = np.mean(np.square(errors))
    return np.sqrt(mean_squared_error)

def try_params(w, b):
    """Plot the line ``w * age + b`` over the non-smokers' charges and print its RMSE.

    Reads ages and charges from the notebook-level ``non_smoker_df``,
    computes the estimates with ``estimate_charges`` and the loss with ``rmse``.

    Args:
        w: Slope of the candidate line.
        b: Intercept of the candidate line.
    """
    ages = non_smoker_df.age
    charges = non_smoker_df.charges

    est_charges = estimate_charges(ages, w, b)

    plt.plot(ages, est_charges, 'r', alpha=0.9)
    # Scatter the locally selected `charges` rather than the notebook-global
    # `target`, so the plot and the printed loss use the same data.
    plt.scatter(ages, charges, s=8, alpha=0.8)
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual'])

    loss = rmse(charges, est_charges)
    print(f"Root mean squared error is {loss}")

In [None]:
try_params(300, 1)

## Scikit Learn 

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
model = LinearRegression()

help(model.fit)

In [None]:
non_smoker_df = medical_df[medical_df.smoker == 'no']

In [None]:
inputs = non_smoker_df[['age']]
targets = non_smoker_df.charges

print(inputs.shape)
print(targets.shape)

In [None]:
model.fit(inputs, targets)
# Predict from a DataFrame carrying the same 'age' column the model was
# fitted on; passing a bare ndarray to a model fitted with feature names
# makes scikit-learn emit a "does not have valid feature names" warning.
model.predict(pd.DataFrame({'age': [23, 56, 19]}))

In [None]:
predictions = model.predict(inputs)

rmse(targets, predictions)

In [None]:
plt.plot(inputs, targets, 'o')

In [None]:
plt.plot(inputs, predictions, 'o')

In [None]:
model.coef_, model.intercept_

In [None]:
try_params(model.coef_, model.intercept_)

### SGD

In [None]:
from sklearn.linear_model import SGDRegressor

In [None]:
sgd_model = SGDRegressor()

sgd_model.fit(inputs, targets)

In [None]:
sgd_model.predict(inputs)
try_params(sgd_model.coef_, sgd_model.intercept_)

### Self

In [None]:
smoker_df = medical_df[medical_df.smoker == 'yes']

inputs = smoker_df[['age']]
targets = smoker_df.charges

inputs.shape, targets.shape

In [None]:
def rmse(targets, predictions):
    """Return the root mean squared error between targets and predictions.

    The two inputs are subtracted element-wise, so they are expected to have
    matching (or broadcastable) shapes.
    """
    errors = predictions - targets
    mean_squared_error = np.mean(np.square(errors))
    return np.sqrt(mean_squared_error)

def visualise_predictions(inputs, targets, predictions, xlabel=None):
    """Print the RMSE of the predictions and plot them against the actual targets.

    Args:
        inputs: Feature values for the x-axis (e.g. a single-column DataFrame).
        targets: Actual charges, drawn as a scatter plot.
        predictions: Predicted charges, drawn as a red line.
        xlabel: Label for the x-axis. When omitted, the capitalised column
            name of a single-column DataFrame is used (so 'age' still gives
            'Age'); otherwise the label falls back to 'Age'.
    """
    if xlabel is None:
        # The label used to be hard-coded to 'Age', which mislabelled plots of
        # other features such as BMI; derive it from the input column instead.
        columns = getattr(inputs, 'columns', None)
        if columns is not None and len(columns) == 1:
            xlabel = str(columns[0]).capitalize()
        else:
            xlabel = 'Age'

    print(f"RMSE: {rmse(targets, predictions)}")
    plt.plot(inputs, predictions, 'r', alpha=0.9)
    plt.scatter(inputs, targets, s=8, alpha=0.8)
    plt.xlabel(xlabel)
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual'])

In [None]:
# Training using LR

model.fit(inputs, targets)

predictions = model.predict(inputs)

In [None]:
visualise_predictions(inputs, targets, predictions)

In [None]:
inputs, targets

In [None]:
# Training SGD

sgd_model = SGDRegressor()

sgd_model.fit(inputs, targets)
predictions = sgd_model.predict(inputs)
print(sgd_model.coef_, sgd_model.intercept_)

visualise_predictions(inputs, targets, predictions)

**ML Training**: The process of finding the parameters that best model the relationship between the inputs and the targets.

3 components:
- Model
- Loss function
- Optimizer

### More params

In [None]:
inputs, targets = non_smoker_df[['age', 'bmi']], non_smoker_df.charges

inputs.shape, targets.shape

In [None]:
lr_model = LinearRegression()

lr_model.fit(inputs, targets)

predictions = lr_model.predict(inputs)

rmse(targets, predictions)

In [None]:
# For 3D Scatter

# fig = px.scatter_3d(non_smoker_df, x='age', y='bmi', z='charges')
# fig.update_traces(marker_size=3, marker_opacity=0.5)
# fig.show()

- You can't find a relationship that doesn't exist, no matter what machine learning technique or optimization algorithm you apply

In [None]:
inputs = non_smoker_df[['bmi']]
targets = non_smoker_df.charges

lr_model = LinearRegression().fit(inputs, targets)

predictions = lr_model.predict(inputs)

visualise_predictions(inputs, targets, predictions)

In [None]:
fig = px.strip(non_smoker_df, x='children', y='charges', title= "Children vs. Charges")
fig.update_traces(marker_size=4, marker_opacity=0.7)
fig.show()

In [None]:
non_smoker_df.charges.corr(non_smoker_df.children)

### Categorical Features

In [None]:
smoker_codes = {'yes': 1, 'no': 0}
sex_codes = {"male": 0, "female": 1}

medical_df['smoker_code'] = medical_df.smoker.map(smoker_codes)
medical_df['sex_code'] = medical_df.sex.map(sex_codes)
# medical_df.charges.corr(medical_df.smoker_code)

In [None]:
from sklearn import preprocessing

encoder = preprocessing.OneHotEncoder()
encoder.fit(medical_df[['region']])
encoder.categories_

In [None]:
region_codes = encoder.transform(medical_df[['region']]).toarray()
medical_df[['northeast', 'northwest', 'southeast', 'southwest']] = region_codes

In [None]:
medical_df

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lr_model = LinearRegression()

input_cols = ['age', 'sex_code', 'bmi', 'children', 'smoker_code', 'northeast', 'northwest', 'southeast', 'southwest']
inputs = medical_df[input_cols]
targets = medical_df[['charges']]

lr_model.fit(inputs, targets)

lr_model.coef_

In [None]:
weights_df = pd.DataFrame({
    'fields': np.append(input_cols, 1),
    'weights': np.append(lr_model.coef_, lr_model.intercept_)
})

weights_df

In [None]:
scaler = preprocessing.StandardScaler()

numeric_cols = ['age', 'bmi', 'children']
scaler.fit(medical_df[numeric_cols])

scaled_data = scaler.transform(medical_df[numeric_cols])
scaled_data


In [None]:
medical_df.info()

In [None]:
cat_cols = ['smoker_code', 'sex_code', 'northeast', 'northwest', 'southeast', 'southwest']
cat_data = medical_df[cat_cols]

In [None]:
# After preprocessing (scaling the numeric columns and encoding the categorical ones),
# join both feature groups column-wise and refit the linear model on them.

inputs = np.concatenate((scaled_data, cat_data), axis = 1)
targets = medical_df['charges']

lr_model.fit(inputs, targets)

In [None]:
weight_analysis_df = pd.DataFrame({
    'field': np.append(numeric_cols + cat_cols, 1),
    'weight': np.append(lr_model.coef_, lr_model.intercept_)
})

weight_analysis_df

Judging by the magnitudes of the weights fitted on the scaled inputs, the most important factors are smoker status, age, and BMI.

### Test Set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
medical_df = pd.read_csv("medical.csv")

# Hold out 10% of the rows as a test set. `inputs` and `targets` are the
# preprocessed arrays built earlier, not the freshly reloaded frame.
# A fixed random_state makes the split identical on every run.
inputs_train, inputs_test, targets_train, targets_test = train_test_split(
    inputs, targets, test_size=0.1, random_state=42
)

targets_test

### Ocean Data

In [1]:
from urllib.request import urlretrieve
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [None]:
ocean_df  = pd.read_csv(r'./datasets/bottle.csv')
ocean_df

In [8]:
ocean_df.describe()

Unnamed: 0,Cst_Cnt,Btl_Cnt,Depthm,T_degC,Salnty,O2ml_L,STheta,O2Sat,Oxy_µmol/Kg,BtlNum,...,R_CHLA,R_PHAEO,R_PRES,R_SAMP,DIC1,DIC2,TA1,TA2,pH2,pH1
count,864863.0,864863.0,864863.0,853900.0,817509.0,696201.0,812174.0,661274.0,661268.0,118667.0,...,225276.0,225275.0,864863.0,122006.0,1999.0,224.0,2084.0,234.0,10.0,84.0
mean,17138.790958,432432.0,226.831951,10.799677,33.84035,3.392468,25.819394,57.103779,148.808694,10.497426,...,0.450225,0.198599,228.395694,162.071521,2153.239714,2168.14833,2256.055845,2278.858803,7.94857,7.910983
std,10240.949817,249664.587269,316.050259,4.243825,0.461843,2.073256,1.167787,37.094137,90.187533,6.189688,...,1.208566,0.376539,319.456731,85.722796,112.995202,154.852332,34.844435,58.496495,0.021216,0.077666
min,1.0,1.0,0.0,1.44,28.431,-0.01,20.934,-0.1,-0.4349,0.0,...,-0.01,-3.89,0.0,0.0,1948.85,1969.44,2181.57,2198.15,7.9231,7.6183
25%,8269.0,216216.5,46.0,7.68,33.488,1.36,24.965,21.1,60.91547,5.0,...,0.05,0.05,46.0,200.0,2028.33,2008.9775,2230.3225,2229.0625,7.931475,7.898675
50%,16848.0,432432.0,125.0,10.06,33.863,3.44,25.996,54.4,151.06415,10.0,...,0.16,0.11,126.0,206.0,2170.64,2265.885,2244.325,2247.505,7.94665,7.92885
75%,26557.0,648647.5,300.0,13.88,34.1969,5.5,26.646,97.6,240.3796,16.0,...,0.39,0.23,302.0,214.0,2253.81,2315.525,2278.505,2316.4525,7.9633,7.9551
max,34404.0,864863.0,5351.0,31.14,37.034,11.13,250.784,214.1,485.7018,25.0,...,66.11,65.3,5458.0,424.0,2367.8,2364.42,2434.9,2437.0,7.9883,8.0477


In [21]:
# For temp prediction

ocean_df = ocean_df[['Depthm', 'Salnty', 'T_degC']]
ocean_df.head()

Unnamed: 0,Depthm,Salnty,T_degC
0,0,33.44,10.5
1,8,33.44,10.46
2,10,33.437,10.46
3,19,33.42,10.45
4,20,33.421,10.45


In [None]:
px.histogram(
    ocean_df,
    x = 'T_degC',
    title = "Temp Analysis"
).update_layout(bargap = 0.1)

In [None]:
px.histogram(
    ocean_df,
    x = 'T_degC',
    color = 'Depthm',
    title = "Temp Analysis"
).update_layout(bargap = 0.1)

In [22]:
temp_wrt_depth = ocean_df.T_degC.corr(ocean_df.Depthm)
temp_wrt_salinity = ocean_df.T_degC.corr(ocean_df.Salnty)

print(temp_wrt_depth, temp_wrt_salinity)

-0.6812014451784366 -0.5052659661915732


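Both depth and salinity are **negatively correlated** with temperature (r ≈ -0.68 and r ≈ -0.51 respectively): temperature tends to fall as depth or salinity increases. These are moderate linear associations, not strict inverse proportionality.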

Cleaning the data: replace **NULL** (missing) values with imputed ones

In [24]:
# Fill missing values with SimpleImputer's default settings
# (presumably mean imputation, the scikit-learn default — confirm).
imputer = SimpleImputer()

# NOTE(review): the whole frame is passed in, so the target column T_degC is
# imputed along with the features Depthm and Salnty. Consider dropping rows
# with a missing target instead of imputing it.
imputed_arr = imputer.fit_transform(ocean_df)

# fit_transform returns a plain array; rebuild a DataFrame with the original column names.
ocean_df = pd.DataFrame(imputed_arr, columns = ocean_df.columns)
ocean_df.head()

Unnamed: 0,Depthm,Salnty,T_degC
0,0.0,33.44,10.5
1,8.0,33.44,10.46
2,10.0,33.437,10.46
3,19.0,33.42,10.45
4,20.0,33.421,10.45


In [None]:
# For checking if the null values are removed

# test_df = pd.DataFrame(ocean_df, columns = ['Depthm', 'Salnty', 'T_degC'])
# test_df[['T_degC']].isnull().sum()

In [25]:
# Standardise the two feature columns and select the temperature as the target.
scaler = preprocessing.StandardScaler()

numeric_cols = ['Depthm', 'Salnty']
# NOTE(review): the scaler is fitted on every row, before the train/test split
# made in the next cell, so test-set statistics leak into the scaling.
scaler.fit(ocean_df[numeric_cols])

scaled_data = scaler.transform(ocean_df[numeric_cols])
inputs = scaled_data
# Double brackets keep the target as a single-column DataFrame (2-D).
targets = ocean_df[['T_degC']]

In [26]:
# Hold out 20% of the rows as a test set. A fixed random_state makes the
# split, and therefore the reported error, identical on every run.
inputs_train, inputs_test, targets_train, targets_test = train_test_split(
    inputs, targets, test_size = 0.2, random_state = 42
)

inputs_train.shape, inputs_test.shape

((691890, 2), (172973, 2))

In [28]:
def rmse(targets, predictions):
    """Return the root mean squared error between targets and predictions.

    The two inputs are subtracted element-wise, so they are expected to have
    matching (or broadcastable) shapes.
    """
    errors = predictions - targets
    mean_squared_error = np.mean(np.square(errors))
    return np.sqrt(mean_squared_error)

In [37]:
lr_model = LinearRegression()

lr_model.fit(inputs_train, targets_train)
# lr_model.coef_, lr_model.intercept_

predict_training = lr_model.predict(inputs_train)
# print(rmse(predict_training, targets_train))

print(predict_training.shape, inputs_train.shape)

(691890, 1) (691890, 2)


In [32]:
# Evaluate the fitted model on the held-out test rows.
predict_test = lr_model.predict(inputs_test)
# Pass the arguments in the order rmse(targets, predictions) declares them.
# The squared error is symmetric, so the printed value is unchanged.
print(rmse(targets_test, predict_test))

3.041628759513971
