<a href="https://colab.research.google.com/github/Arjun-vidyasagar/Entri_Capstone_Project/blob/main/D44_Mar_25_2025_ML_8_linear_regression_Practise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# for scaling
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression # for linear regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score # for evaluation

In [None]:
# Load the dataset; index_col=0 uses the CSV's first column as the row index.
bs = pd.read_csv('beer-servings.csv', index_col = 0)
# Display the loaded frame as the cell output.
bs

In [None]:
bs.info()

In [None]:
# checking for duplicates
bs.duplicated().sum()

In [None]:
bs['country'].nunique()

In [None]:
bs['continent'].nunique()

In [None]:
bs['continent'].unique()

In [None]:
bs['continent'].value_counts()

In [None]:
# Remove the 'country' column, then preview the first rows of the result.
bs = bs.drop('country', axis=1)
bs.head()

In [None]:
# checking for missing values
bs.isnull().sum()

In [None]:
len(bs)

In [None]:
# Percentage of missing entries in each column, relative to the row count.
n_rows = len(bs)
miss_perc = bs.isnull().sum() / n_rows * 100
miss_perc

In [None]:
# Replace missing values in the numeric columns with each column's own median.
# fillna() with a Series matches the fill values to columns by name.
median_filled_columns = ['beer_servings', 'spirit_servings', 'wine_servings', 'total_litres_of_pure_alcohol']
bs[median_filled_columns] = bs[median_filled_columns].fillna(bs[median_filled_columns].median())

In [None]:
# Re-compute the per-column missing percentage after the median fill.
missing_counts_after = bs.isnull().sum()
miss_perc_after = missing_counts_after / len(bs) * 100
miss_perc_after

In [None]:
bs.isnull().sum()

In [None]:
# Statistics

bs.describe()

In [None]:
# Histogram (10 bins, with a KDE overlay) of the target column.
target_values = bs['total_litres_of_pure_alcohol']
ax = sns.histplot(target_values, bins=10, color='green', kde=True)
ax.set_xlabel('Total Litres of Pure Alcohol')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Target Variable')
plt.show()

In [None]:
# Box plots of the three serving columns to inspect them for outliers.
numerical_columns = ['beer_servings', 'spirit_servings', 'wine_servings']
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=bs[numerical_columns], ax=ax)
ax.set_title('Outlier Detection')
plt.show()

In [None]:
# Cap outliers in the spirit and wine columns using the 1.5 * IQR rule.
outlier_columns = ['spirit_servings', 'wine_servings']

# Per-column first/third quartiles and the interquartile range.
quartiles = bs[outlier_columns].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1

# Per-column limits: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] count as outliers.
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

# Boolean masks marking the values that fall outside the limits.
outliers_lower = bs[outlier_columns] < lower_limit
outliers_upper = bs[outlier_columns] > upper_limit

# Values below the lower limit are raised to it and values above the upper
# limit are lowered to it; axis=1 matches the limits to columns by name.
bs[outlier_columns] = bs[outlier_columns].clip(lower=lower_limit, upper=upper_limit, axis=1)

In [None]:
# Re-draw the box plots for the two capped columns to confirm that no points
# remain outside the whiskers (the outliers were capped, not dropped).
outlier_recheck = ['spirit_servings', 'wine_servings']
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=bs[outlier_recheck], ax=ax)
ax.set_title('for outliers checking')
ax.set_xlabel('Columns')
ax.set_ylabel('Values')
plt.xticks(rotation=45, fontsize=20)
plt.show()

In [None]:
bs[['spirit_servings','wine_servings']].describe()

In [None]:
bs.head()

In [None]:
bs.shape

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Initializing OneHotEncoder
encoder = OneHotEncoder()

In [None]:
# Fit the encoder on the 'continent' column and one-hot encode it;
# .toarray() converts the encoder output into a dense array.
encoded_continent = encoder.fit_transform(bs[['continent']]).toarray()

# Wrap the encoded array in a DataFrame with the generated feature names.
# The frame reuses bs's index: pd.concat(axis=1) aligns rows by index label,
# so a default 0..n-1 index here would mis-pair rows (and introduce NaNs)
# whenever bs's index is not exactly 0..n-1.
encoded_continent_bs = pd.DataFrame(
    encoded_continent,
    columns=encoder.get_feature_names_out(['continent']),
    index=bs.index,
)

# Append the encoded columns to the original data and drop the raw
# 'continent' column they replace.
one_hot_encoded_bs = pd.concat([bs, encoded_continent_bs], axis=1).drop('continent', axis=1)

In [None]:
one_hot_encoded_bs

In [None]:
# Standardise the three serving columns in place.
numerical_columns = ['beer_servings', 'spirit_servings', 'wine_servings']

# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in the notebook; confirm this is intended.
scaler = StandardScaler()
scaler.fit(one_hot_encoded_bs[numerical_columns])
one_hot_encoded_bs[numerical_columns] = scaler.transform(one_hot_encoded_bs[numerical_columns])

In [None]:
one_hot_encoded_bs.head(7)

In [None]:
one_hot_encoded_bs['beer_servings'].std()

In [None]:
one_hot_encoded_bs[numerical_columns].skew()

In [None]:
# Separate the target from the features, then hold out 20% of the rows for
# testing; random_state is fixed so the split is reproducible.
target_column = 'total_litres_of_pure_alcohol'
y = one_hot_encoded_bs[target_column]
X = one_hot_encoded_bs.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
154+39

In [None]:
X_train.head()

In [None]:
y.head()

In [None]:
# Fit a linear regression on the training split (fit() returns the fitted
# estimator itself), then predict the target for the held-out test features.
lr_model = LinearRegression().fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

In [None]:
# Print the standard regression metrics for one model
def evaluate_model(y_actual, y_pred, model_name):
    """Print MAE, MSE and R2 score for a set of predictions.

    Args:
        y_actual: True target values.
        y_pred: Predicted target values, in the same order as ``y_actual``.
        model_name: Label used in the printed header line.

    Returns:
        None. The metrics are written to standard output only.
    """
    labelled_metrics = (
        ("MAE:", mean_absolute_error),
        ("MSE:", mean_squared_error),
        ("R2 Score:", r2_score),
    )

    print(f"{model_name} Performance:")
    for label, metric in labelled_metrics:
        print(label, metric(y_actual, y_pred))
    # Trailing blank lines separate the reports of successive models.
    print("\n")

In [None]:
mean_squared_error(y_test, lr_predictions)

In [None]:
evaluate_model(y_test, lr_predictions,'Linear Regression')

In [None]:
# Function to visualize actual vs predicted values
def visualize_predictions(y_actual, y_pred, model_name):
    """Print and plot actual versus predicted target values.

    Prints the first 30 actual/predicted pairs, then draws a scatter plot of
    predictions against actual values with a dashed red ``y = x`` reference
    line (points on the line are perfect predictions).

    Args:
        y_actual: True target values; any 1-D array-like (pandas Series,
            NumPy array, list, ...).
        y_pred: Predicted values, in the same order as ``y_actual``.
        model_name: Label used in the plot title.

    Returns:
        None. Output is the printed table and the displayed figure.
    """
    # Convert up front so inputs without a `.values` attribute (lists,
    # ndarrays) work as well as pandas objects.
    actual = np.asarray(y_actual).ravel()
    predicted = np.asarray(y_pred).ravel()

    results_df = pd.DataFrame({'Actual': actual, 'Predicted': predicted})
    print(results_df.head(30))

    plt.figure(figsize=(8,5))
    sns.scatterplot(x=actual, y=predicted, color='blue', alpha=0.6)
    # The reference line spans the range of the actual values; it is skipped
    # for empty input, where min()/max() of an array would raise.
    if actual.size:
        lowest, highest = actual.min(), actual.max()
        plt.plot([lowest, highest], [lowest, highest], color='red', linestyle='--')
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.title(f"Actual vs Predicted - {model_name}")
    plt.show()

In [None]:
visualize_predictions(y_test, lr_predictions, "Linear Regression")

In [None]:
bs.columns

In [None]:
# Predicting for new data

# Inputs: beer 88, spirit 130, wine 54; the original note gives 4.9 as the
# expected answer.

unseen_data = pd.DataFrame({
    'beer_servings': [88],
    'spirit_servings': [130],
    'wine_servings': [54],
    'continent': ['Europe'],
})

In [None]:
unseen_data

In [None]:
# Encode the unseen data with the already-fitted encoder (transform only, no
# refit) so it yields the same one-hot columns as the training data.
encoded_unseen = encoder.transform(unseen_data[['continent']]).toarray()

# Reuse unseen_data's index so the concat below pairs rows by label even when
# the frame does not carry a default 0..n-1 index.
unseen_data_encoded = pd.DataFrame(
    encoded_unseen,
    columns=encoder.get_feature_names_out(['continent']),
    index=unseen_data.index,
)

# Replace the raw 'continent' column with its one-hot columns.
# NOTE: this rebinds unseen_data, so re-running the cell without rebuilding
# unseen_data fails because 'continent' is no longer present.
unseen_data = pd.concat([unseen_data.drop(columns=['continent']), unseen_data_encoded], axis=1)

In [None]:
unseen_data

In [None]:
unseen_data[numerical_columns]

In [None]:
# Apply feature scaling with the scaler fitted earlier in the notebook
# (transform only, no refit), then display the scaled frame.
unseen_data[numerical_columns] = scaler.transform(unseen_data[numerical_columns])
unseen_data

In [None]:
# Make a prediction for the encoded and scaled unseen row; predict() returns
# an array, which is printed as-is.
unseen_prediction = lr_model.predict(unseen_data)
print("Predicted Value for Unseen Data:", unseen_prediction)