In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


In [None]:
# Load the dataset from the CSV file.
df = pd.read_csv('bottle.csv')

# Keep only the two features of interest and give them short names.
# `rename` returns a new DataFrame, so the cleaning steps in later cells
# operate on an independent frame instead of a slice of `df` (which would
# trigger pandas' SettingWithCopyWarning).
df_bin = df[['Salnty', 'T_degC']].rename(
    columns={'Salnty': 'Sal', 'T_degC': 'Temp'}
)

# Display the first 5 rows
df_bin.head()


In [None]:
# Scatter plot of the two features with a 2nd-order polynomial fit on top.
# ci=None switches off the confidence band (the default would be 95%).
sns.lmplot(
    data=df_bin,
    x='Sal',
    y='Temp',
    order=2,
    ci=None,
)
plt.show()



In [None]:
# DATA CLEANING
# Forward-fill missing values, then drop any rows that are still NaN
# (forward fill cannot fill gaps at the very start of a column).
# This has to happen BEFORE the arrays below are extracted; otherwise
# X / Y would be built from the uncleaned data.
df_bin = df_bin.ffill().dropna()

# TRAINING THE MODEL
# Separate the independent (X) and dependent (Y) variables as column vectors.
X = df_bin['Sal'].to_numpy().reshape(-1, 1)
Y = df_bin['Temp'].to_numpy().reshape(-1, 1)

# Split the data into training & testing sets (75% / 25%).
# train_test_split returns its outputs in the order
# (X_train, X_test, Y_train, Y_test). A fixed random_state keeps the split,
# and therefore the score below, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.25, random_state=42
)

# Fit a linear regression on the training split.
regr = LinearRegression()
regr.fit(X_train, Y_train)

# Score of the model on the held-out test split.
print(regr.score(X_test, Y_test))
# The original author reported a score of ~0.207 on the whole data set,
# i.e. not a very good fit.

In [None]:
# Plot the model's predictions on the test split against the observed values.
Y_pred = regr.predict(X_test)
# 'b' (blue) is a valid color; 'bo' is a plt.plot format string and is
# rejected when passed as `color`.
plt.scatter(X_test, Y_test, color='b')
plt.plot(X_test, Y_pred, color='g')
plt.show()

# The low score suggests that a linear regression may not fit the full data set well.
# It may still fit reasonably if we only look at a portion of the data.

# Take the first 500 rows as an independent copy so that cleaning the sample
# later does not act on a view of `df_bin`.
df_bin500 = df_bin.iloc[:500].copy()

sns.lmplot(x='Sal', y='Temp', ci=None, data=df_bin500, order=2)
plt.show()



In [None]:
# Recompute the linear regression using only the 500-row sample.

# Clean first (forward-fill, then drop rows that are still NaN) so that the
# arrays extracted below are built from the cleaned data.
df_bin500 = df_bin500.ffill().dropna()

X = df_bin500['Sal'].to_numpy().reshape(-1, 1)
Y = df_bin500['Temp'].to_numpy().reshape(-1, 1)

# train_test_split returns (X_train, X_test, Y_train, Y_test) in that order.
# A fixed random_state keeps the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.25, random_state=42
)

regr = LinearRegression()
regr.fit(X_train, Y_train)
print(regr.score(X_test, Y_test))
# The original author reported ~0.847 here, much better than on the full data set.

# Plot the new result. The predictions are stored in `Y_pred` (the name the
# plot uses) so the line is drawn from THIS model's predictions rather than
# from a stale array left over from an earlier cell.
Y_pred = regr.predict(X_test)
plt.scatter(X_test, Y_test, color="b")
plt.plot(X_test, Y_pred, color='g')
plt.show()

In [None]:
# Commonly used evaluation metrics:
# 1) mean_absolute_error (MAE)
# 2) mean_squared_error (MSE)

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Recompute the predictions from the current model and test split so the
# metrics never compare Y_test against predictions left over from an
# earlier model (which would have a different number of samples).
Y_pred = regr.predict(X_test)

mae = mean_absolute_error(y_true=Y_test, y_pred=Y_pred)
mse = mean_squared_error(y_true=Y_test, y_pred=Y_pred)

print(f"MAE: {mae}\nMSE: {mse}")
