# Regression in Scikit Learn

In [None]:
# import 4 basic libraries
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# train_test_split holds out part of the data so the model can be evaluated on rows it was not trained on
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Load seaborn's 'titanic' example dataset into a DataFrame and preview its first rows.
df = sns.load_dataset('titanic')
df.head()

In [None]:
# Keep only rows where both age and fare are present, then narrow the frame
# down to those two columns for the regression.
df_linear = df.dropna(subset=['age', 'fare'])[['age', 'fare']]

In [None]:
# Simple linear regression setup:
# 1. Assumes a linear relationship between the independent and the dependent variable
# 2. Uses exactly two variables: one feature (age) and one target (fare)
# X is a one-column DataFrame (double brackets); y is a Series.
# Both are taken from the full `df`, so `age` may still contain missing values at this point.
X = df[['age']]
y = df['fare']

In [None]:
# Impute missing ages with the mean age.
# The filled frame is assigned back to X instead of calling
# X['age'].fillna(..., inplace=True): that form operates on a temporary Series,
# which triggers chained-assignment warnings on recent pandas and leaves X
# unchanged when Copy-on-Write is enabled.
X = X.fillna({'age': X['age'].mean()})

In [None]:
# Share of missing entries per column of X, expressed as a percentage.
X.isna().sum().div(len(X)).mul(100)

In [None]:
# (rows, columns) of the feature DataFrame
X.shape

In [None]:
# length of the target Series, as a one-element tuple
y.shape

In [None]:
# Scatter plot of fare against age with a fitted regression line (seaborn lmplot),
# drawn from df_linear before any outlier filtering.
sns.lmplot(x='age', y='fare', data=df_linear)

## Check for outliers with box plots

In [None]:
# Box plot of fare to look for outliers before any filtering.
sns.boxplot(y='fare', data=df_linear)

In [None]:
# Drop age outliers: keep rows with age strictly between 5 and 55.
df_linear = df_linear.loc[lambda frame: frame['age'].gt(5) & frame['age'].lt(55)]

In [None]:
# Box plot of age; at this point df_linear has already been restricted to 5 < age < 55.
sns.boxplot(y='age', data=df_linear)

In [None]:
# Drop fare outliers: keep rows with fare strictly between 0 and 41.
df_linear = df_linear.loc[lambda frame: frame['fare'].gt(0) & frame['fare'].lt(41)]

In [None]:
# Box plot of fare again, now on the filtered data, to confirm the extreme values are gone.
sns.boxplot(y='fare',data=df_linear)

In [None]:
# Regression plot of fare against age on the filtered data.
sns.lmplot(x='age', y='fare', data=df_linear)

In [None]:
# Fit a simple linear regression of fare on age using the filtered data.
X = df_linear[['age']]
y = df_linear['fare']
model = LinearRegression()
model.fit(X, y)
# Predict the fare for a 25-year-old. The input is a one-row DataFrame with the
# same column name used during fit; passing a bare [[25]] makes scikit-learn warn
# that X does not have valid feature names.
model.predict(pd.DataFrame({'age': [25]}))

## 4. Model evaluation with metrics
### R2 (coefficient of determination)
### MAE (mean absolute error)
### RMSE (root mean squared error)
### MSE (mean squared error)

In [None]:
# Feature (age) and target (fare) come from the outlier-filtered frame.
X = df_linear[['age']]
y = df_linear['fare']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so creation and training are chained.
model = LinearRegression().fit(X_train, y_train)

# Predicted fares for the held-out rows.
y_pred = model.predict(X_test)

# Visual check: actual test fares on the x axis against predicted fares on the y axis.
sns.scatterplot(x=y_test, y=y_pred)

In [None]:
# score() of a LinearRegression is the R^2 (coefficient of determination), here on the held-out test split.
model.score(X_test, y_test)

In [50]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Mean absolute error: average of the absolute differences between actual and predicted values.
MAE = mean_absolute_error(y_test, y_pred)

# Mean squared error: average of the squared differences between actual and predicted values.
MSE = mean_squared_error(y_test, y_pred)

# Root mean squared error: square root of the MSE, back in the units of the target.
RMSE = np.sqrt(MSE)

# Coefficient of determination: how much of the target's variance the model explains.
R2 = r2_score(y_test, y_pred)



In [51]:
# Report every metric on its own line.
# An R2 at or below zero means the model does no better than always predicting the mean fare.
for metric_line in (f'MSE: {MSE}', f'MAE:{MAE}', f'RMSE:{RMSE}', f'R2:{R2}'):
    print(metric_line)

MSE: 75.47862585668436
MAE:7.356399403049273
RMSE:8.687843567691834
R2:-0.01940614554645026
