In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset.
# NOTE(review): the path is relative to the notebook's working directory — confirm it resolves.
df = pd.read_csv('./mnt/data/mini_project_1_data.csv')

# Jupyter only echoes the LAST expression of a cell, so every intermediate
# inspection result is sent to display()/print() explicitly
# (display() is provided by the IPython kernel).
display(df.head())

# Display basic information about the dataset (info() prints by itself)
df.info()

# Check for missing values
display(df.isnull().sum())

# Summary statistics of the dataset
display(df.describe().T)

# Check for duplicate entries: how many there are, and which rows they are
print(f"Duplicate rows: {df.duplicated().sum()}")
display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)))

# Drop duplicate entries (reassign instead of mutating in place)
df = df.drop_duplicates()

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# Normalise the weekday labels BEFORE plotting so the chart shows one bar per day.
# Series.replace(..., inplace=True) returns None and therefore cannot be chained
# with .str; build the cleaned column and assign it back instead.
weekday_typos = {
    'moday': 'monday',
    'tuesdy': 'tuesday',
    'wednsday': 'wednesday',
    'thusdy': 'thursday',
    'frday': 'friday',
    'satday': 'saturday',
    'sunda': 'sunday'
}
df['weekday'] = (
    df['weekday']
    .str.strip()
    .str.lower()               # the typo keys above are lower-case
    .replace(weekday_typos)
    .str.capitalize()
)

# Verify the cleaned labels
display(df['weekday'].value_counts())

# Create a figure with a specified size
plt.figure(figsize=(12, 8))

# Count plot showing the number of articles per weekday
count_plot = sns.countplot(x = 'weekday', data=df)
plt.title('Article Count per Weekday')
plt.xlabel('Day')  # Label for x-axis
plt.ylabel('Count')  # Label for y-axis
plt.xticks(rotation = 45)  # Rotate x-axis labels for better readability

# Annotate the bars with the article count value per weekday
for p in count_plot.patches:
    count_plot.annotate(format(p.get_height(), '.0f'),
                        (p.get_x() + p.get_width()/2.,
                         p.get_height()),
                        ha = 'center',  # Horizontal alignment
                        va = 'center',  # Vertical alignment
                        xytext = (0, 9),  # Shift the label 9 points above the bar top
                        textcoords = 'offset points')

plt.show()

# Normalise the data_channel labels BEFORE plotting, and assign the result back:
# an in-place replace on a column selection is chained assignment and is not
# guaranteed to modify df itself.
channel_labels = {
    'entertainment': 'Entertainment',
    'business': 'Business',
    'technology': 'Technology',
    'lifestyle': 'Lifestyle',
    'world': 'World',
    'social_media': 'Social Media'
}
df['data_channel'] = df['data_channel'].replace(channel_labels)

# Verify the cleaned labels
display(df['data_channel'].value_counts())

# The previous plt.show() closed the earlier figure, so request the size again
plt.figure(figsize=(12, 8))

# Count plot showing the number of articles per data channel
count_plot = sns.countplot(x = 'data_channel', data=df)
plt.title('Number of Articles per Channel')
plt.xlabel('Data channel')  # Label for x-axis
plt.ylabel('Number')  # Label for y-axis
plt.xticks(rotation = 45)  # Rotate x-axis labels for better readability

# Annotate the bars with the number of articles per channel
for p in count_plot.patches:
    count_plot.annotate(format(p.get_height(), '.0f'),
                        (p.get_x() + p.get_width()/2.,
                         p.get_height()),
                        ha = 'center',  # Horizontal alignment
                        va = 'center',  # Vertical alignment
                        xytext = (0, 9),  # Shift the label 9 points above the bar top
                        textcoords = 'offset points')

plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset.
# NOTE(review): the path is relative to the notebook's working directory — confirm it resolves.
df = pd.read_csv('./mnt/data/mini_project_1_data.csv')

# Jupyter only echoes the LAST expression of a cell, so every intermediate
# inspection result is sent to display()/print() explicitly
# (display() is provided by the IPython kernel).
display(df.head())

# Display basic information about the dataset (info() prints by itself)
df.info()

# Check for missing values
display(df.isnull().sum())

# Summary statistics of the dataset
display(df.describe().T)

# Check for duplicate entries: how many there are, and which rows they are
print(f"Duplicate rows: {df.duplicated().sum()}")
display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)))

# Drop duplicate entries (reassign instead of mutating in place)
df = df.drop_duplicates()

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

def plot_histogram(column, title, xlabel, bins=10):
    """Draw one histogram (with KDE overlay) for a column of the global ``df``.

    Parameters
    ----------
    column : str
        Name of the column in ``df`` to plot.
    title : str
        Figure title.
    xlabel : str
        Label for the x-axis.
    bins : int, default 10
        Number of histogram bins.
    """
    # A fresh, equally sized figure per histogram; plt.show() closes the previous one
    plt.figure(figsize=(10, 6))
    sns.histplot(df[column], bins=bins, kde=True)  # KDE adds smooth curve
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.show()


# Histogram for Target Variable
plot_histogram('shares', 'Article Shares', 'Shares')

# Histograms for visual appeal; the number of images/videos could influence sharing
plot_histogram('num_imgs', 'Distribution of Number of Images in Articles', 'Number of Images')
# This chart previously re-plotted 'num_imgs' under the "Videos" title
plot_histogram('num_videos', 'Distribution of Number of Videos in Articles', 'Number of Videos')

# Histograms for Internal/External Linking Trends
plot_histogram('num_hrefs', 'Distribution of Hyperlinks in Articles', 'Number of Hyperlink')
plot_histogram('num_self_hrefs', 'Distribution of Self-References in Articles', 'Number of Self-References')

# Histogram for Article Length
plot_histogram('n_tokens_content', 'Distribution of Article Length (in Tokens)', 'Number of Tokens')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset.
# NOTE(review): the path is relative to the notebook's working directory — confirm it resolves.
df = pd.read_csv('./mnt/data/mini_project_1_data.csv')

# Jupyter only echoes the LAST expression of a cell, so every intermediate
# inspection result is sent to display()/print() explicitly
# (display() is provided by the IPython kernel).
display(df.head())

# Display basic information about the dataset (info() prints by itself)
df.info()

# Check for missing values
display(df.isnull().sum())

# Summary statistics of the dataset
display(df.describe().T)

# Check for duplicate entries: how many there are, and which rows they are
print(f"Duplicate rows: {df.duplicated().sum()}")
display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)))

# Drop duplicate entries (reassign instead of mutating in place)
df = df.drop_duplicates()

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# Create a figure with a specified size for Histogram
plt.figure(figsize=(10, 6))

# Histogram: Distribution of Shares by Weekday.
# `data=df` is required: without it seaborn cannot resolve the column names
# given for `x` and `hue`.
hist_ax = sns.histplot(data=df, x='shares', hue='weekday', kde=True, bins=18, alpha=0.6)
plt.xscale('log')  # Apply log scale to handle skew
plt.title('Distribution of Shares by Weekday')
plt.xlabel('Shares')
plt.ylabel('Frequency')

# seaborn has already drawn the hue legend; calling plt.legend() here would
# replace it with an empty one, so only retitle the existing legend.
hue_legend = hist_ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title('Weekday')

plt.grid(True)  # Enable grid lines for better readability
plt.show()

# Line Chart: Shares and Comments Over Time (Timely Trends [Timedelta])
# The lines are drawn from the full frame; a random sample (at most 200 rows,
# fewer when the frame is smaller) is used only to place sparse value labels.
df_sample = df.sample(min(200, len(df)), random_state=42).sort_values(by='timedelta')

# Identify top 10% articles by shares
top_shares = df[df['shares'] > df['shares'].quantile(0.90)]


def annotate_points(xs, ys, step=1, **text_kwargs):
    """Write the integer y-value next to every ``step``-th (x, y) point.

    Points with a missing coordinate are skipped because ``int(NaN)`` raises.
    Extra keyword arguments are forwarded to ``plt.text``.
    """
    for i, (x, y) in enumerate(zip(xs, ys)):
        if i % step != 0 or pd.isna(x) or pd.isna(y):
            continue
        plt.text(x, y, f'{int(y)}', ha='center', va='bottom', fontsize=6, **text_kwargs)


# Create a figure with a specified size for Line Chart
plt.figure(figsize=(12, 6))

# Plot Shares over Timedelta (cross markers)
line1 = sns.lineplot(data=df, x='timedelta', y='shares', label='Shares', marker='x')

# Plot Comments over Timedelta (circle markers)
line2 = sns.lineplot(data=df, x='timedelta', y='n_comments', label='Comments', marker='o')

# Annotate every 20th Share in Sample
annotate_points(df_sample['timedelta'], df_sample['shares'], step=20, color='orange')

# Annotate every 20th Comment in Sample
annotate_points(df_sample['timedelta'], df_sample['n_comments'], step=20, color='blue')

# Annotate top 10% viral articles with bold highlight
annotate_points(top_shares['timedelta'], top_shares['shares'], color='violet', fontweight='bold')

plt.title('Shares and Comments Over Timeline (Timedelta)')
plt.xlabel('Timedelta')
plt.ylabel('Count')
plt.legend()  # Add legend to distinguish between the two lines
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust subplot parameters for better fit
plt.grid(True)  # Enable grid lines for better readability
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset.
# NOTE(review): the path is relative to the notebook's working directory — confirm it resolves.
df = pd.read_csv('./mnt/data/mini_project_1_data.csv')

# Jupyter only echoes the LAST expression of a cell, so every intermediate
# inspection result is sent to display()/print() explicitly
# (display() is provided by the IPython kernel).
display(df.head())

# Display basic information about the dataset (info() prints by itself)
df.info()

# Check for missing values
display(df.isnull().sum())

# Summary statistics of the dataset
display(df.describe().T)

# Check for duplicate entries: how many there are, and which rows they are
print(f"Duplicate rows: {df.duplicated().sum()}")
display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)))

# Drop duplicate entries (reassign instead of mutating in place)
df = df.drop_duplicates()

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# Create a figure with a specified size for a Boxplot
plt.figure(figsize=(10, 8))

# Box plot for Day of Weekday.
# This cell re-reads the raw CSV, so the weekday labels are normalised locally
# (trimmed, known typos corrected, capitalised) to match the capitalised names
# in `ordered_days`; labels that do not match the order list are not drawn.
weekday_typos = {
    'moday': 'monday', 'tuesdy': 'tuesday', 'wednsday': 'wednesday',
    'thusdy': 'thursday', 'frday': 'friday', 'satday': 'saturday', 'sunda': 'sunday'
}
weekday_clean = (
    df['weekday'].str.strip().str.lower().replace(weekday_typos).str.capitalize()
)

# Full week, so weekend articles are not silently left out of the comparison
ordered_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sns.boxplot(x=weekday_clean, y=df['shares'], order=ordered_days, palette='Set2')
plt.title('Which days tend to go viral shares')
plt.xlabel('Day')
plt.ylabel('Shares')
plt.grid(True)
plt.xticks(rotation=0)
plt.show()

# Shares per data channel, one box per channel
channel_ax = sns.boxplot(data=df, x='data_channel', y='shares', palette='Set2')
channel_ax.set_title('Which topics/categories get shared')
channel_ax.set_xlabel('Topics/Categories')
channel_ax.set_ylabel('Shares')
channel_ax.grid(True)
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.show()

# Shares per article-length group: n_tokens_content is cut into five
# equal-frequency bins, labelled from shortest to longest
length_labels = ['Very Short', 'Short', 'Medium', 'Long', 'Very Long']
length_group = pd.qcut(df['n_tokens_content'], q=5, labels=length_labels)

length_ax = sns.boxplot(x=length_group, y=df['shares'], palette='Set2')
length_ax.set_title('Shares Distribution Across Article Content Length (Quantile Grouping)')
length_ax.set_xlabel('n_tokens_content Quantile')
length_ax.set_ylabel('Shares')
length_ax.grid(True)
plt.xticks(rotation=0)  # keep the group labels horizontal
plt.show()

# Box plot for Comparison between older and newer articles with timedelta (article age)
df_sortedTime = df.sort_values('timedelta')  # kept for inspection; the plot below bins df directly

# qcut assigns the first label to the SMALLEST values. With timedelta read as
# article age, the smallest values are the newest articles, so the labels run
# newest -> oldest.
# NOTE(review): assumes a larger timedelta means an older article — confirm against the data dictionary.
age_labels = ['Newest', 'Newer', 'Present', 'Older', 'Oldest']
age_group = pd.qcut(df['timedelta'], q=5, labels=age_labels)

plt.figure(figsize=(10, 8))
sns.boxplot(x=age_group, y=df['shares'], palette='Set2')
plt.title('Shares Distribution Across Article Timedelta (Quantile Grouping)')
plt.xlabel('timedelta Quantile')
plt.ylabel('Shares')
plt.grid(True)
plt.xticks(rotation=0)
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset.
# NOTE(review): the path is relative to the notebook's working directory — confirm it resolves.
df = pd.read_csv('./mnt/data/mini_project_1_data.csv')

# Jupyter only echoes the LAST expression of a cell, so every intermediate
# inspection result is sent to display()/print() explicitly
# (display() is provided by the IPython kernel).
display(df.head())

# Display basic information about the dataset (info() prints by itself)
df.info()

# Check for missing values
display(df.isnull().sum())

# Summary statistics of the dataset
display(df.describe().T)

# Check for duplicate entries: how many there are, and which rows they are
print(f"Duplicate rows: {df.duplicated().sum()}")
display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)))

# Drop duplicate entries (reassign instead of mutating in place)
df = df.drop_duplicates()

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# 2x2 grid: top row = article counts, bottom row = share distributions;
# left column = weekday, right column = data channel
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Categories ordered by frequency, shared by both plots of a column
sorted_day = df['weekday'].value_counts().index
sorted_channel = df['data_channel'].value_counts().index

# (column, category order, count-plot title, box-plot title, category axis label)
grid_columns = [
    ('weekday', sorted_day,
     'Article Count per Weekday', 'Shares by Weekday', 'Day of Week'),
    ('data_channel', sorted_channel,
     'Article Count per Data Channel', 'Shares by Data Channel', 'Data Channel'),
]

for col_idx, (column, category_order, count_title, box_title, category_label) in enumerate(grid_columns):
    # ---------- Top row: horizontal count plot ----------
    count_ax = axes[0, col_idx]
    sns.countplot(y=column, data=df, order=category_order, ax=count_ax)
    count_ax.set_title(count_title)
    count_ax.set_xlabel('Count')
    count_ax.set_ylabel(category_label)

    # Write each bar's count just to the right of the bar end
    for bar in count_ax.patches:
        bar_length = bar.get_width()
        bar_middle = bar.get_y() + bar.get_height() / 2
        count_ax.annotate(format(bar_length, '.0f'),
                          (bar_length, bar_middle),
                          ha='left', va='center',
                          xytext=(5, 0),
                          textcoords='offset points')

    # ---------- Bottom row: horizontal box plot of shares ----------
    box_ax = axes[1, col_idx]
    sns.boxplot(y=column, x='shares', data=df, order=category_order, ax=box_ax, palette="Set2")
    box_ax.set_title(box_title)
    box_ax.set_xlabel('Shares')
    box_ax.set_ylabel(category_label)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset.
# NOTE(review): the path is relative to the notebook's working directory — confirm it resolves.
df = pd.read_csv('./mnt/data/mini_project_1_data.csv')

# Jupyter only echoes the LAST expression of a cell, so every intermediate
# inspection result is sent to display()/print() explicitly
# (display() is provided by the IPython kernel).
display(df.head())

# Display basic information about the dataset (info() prints by itself)
df.info()

# Check for missing values
display(df.isnull().sum())

# Summary statistics of the dataset
display(df.describe().T)

# Check for duplicate entries: how many there are, and which rows they are
print(f"Duplicate rows: {df.duplicated().sum()}")
display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)))

# Drop duplicate entries (reassign instead of mutating in place)
df = df.drop_duplicates()

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# Four scatter plots of shares against one feature each, coloured by a category.
# Each entry: (x column, hue column, title, x-axis label, legend title)
scatter_specs = [
    ('num_imgs', 'weekday',
     'Scatter Plot: Number of Images vs Shares by Weekday', 'Number of Images', 'Weekday'),
    ('n_comments', 'data_channel',
     'Scatter Plot: Comments vs Shares by Data Channel', 'Number of Comments', 'Data Channel'),
    ('num_videos', 'data_channel',
     'Scatter Plot: Number of Videos vs Shares by Data Channel', 'Number of Videos', 'Data Channel'),
    ('timedelta', 'data_channel',
     'Scatter Plot: Time Delta vs Shares by Data Channel', 'Time Delta', 'Data Channel'),
]

for x_column, hue_column, plot_title, x_label, legend_title in scatter_specs:
    plt.figure(figsize=(12, 8))  # one figure per scatter plot
    sns.scatterplot(x=x_column, y='shares', data=df, hue=hue_column, palette='Set2', alpha=0.7)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel('Shares')
    plt.legend(title=legend_title)
    plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset.
# NOTE(review): the path is relative to the notebook's working directory — confirm it resolves.
df = pd.read_csv('./mnt/data/mini_project_1_data.csv')

# Jupyter only echoes the LAST expression of a cell, so every intermediate
# inspection result is sent to display()/print() explicitly
# (display() is provided by the IPython kernel).
display(df.head())

# Display basic information about the dataset (info() prints by itself)
df.info()

# Check for missing values
display(df.isnull().sum())

# Summary statistics of the dataset
display(df.describe().T)

# Check for duplicate entries: how many there are, and which rows they are
print(f"Duplicate rows: {df.duplicated().sum()}")
display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)))

# Drop duplicate entries (reassign instead of mutating in place)
df = df.drop_duplicates()

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# ---------- Scenario A – Articles with low features but high shares ----------
# Goal: Find articles with unexpectedly high popularity despite lacking common "success factors".
short_cutoff = df['n_tokens_content'].quantile(0.25)
few_comments_cutoff = df['n_comments'].quantile(0.25)
viral_cutoff = df['shares'].quantile(0.9)

# NOTE(review): the comparisons are strict; if a 25th-percentile cutoff equals the
# column minimum (e.g. 0 comments) that condition can never be met — confirm.
outliers = df[
    (df['num_imgs'] == 0)  # No images
    & (df['n_tokens_content'] < short_cutoff)  # Short articles
    & (df['n_comments'] < few_comments_cutoff)  # Few comments
    & (df['shares'] > viral_cutoff)  # Still highly shared
]

# Display stats for Scenario A. This is not the last expression of the cell,
# so it has to be displayed explicitly or Jupyter discards it.
display(outliers.describe().T)
# Insight: High shares despite short length, no images, and few comments suggest the impact of strong headlines or timely relevance.

# ---------- Scenario B – Viral Business Articles on Weekends ----------
# Goal: Find which categories or weekdays produce "viral despite odds" articles.
# This cell works on the freshly re-read CSV, whose labels are not yet
# normalised, so the category filters compare trimmed, lower-cased labels and
# also accept the weekend misspellings corrected elsewhere in this notebook.
channel_norm = df['data_channel'].str.strip().str.lower()
weekday_norm = df['weekday'].str.strip().str.lower()
weekend_labels = ['saturday', 'sunday', 'satday', 'sunda']

unusual_outliers = df[
    (channel_norm == 'business')
    & (weekday_norm.isin(weekend_labels))
    & (df['shares'] > df['shares'].quantile(0.9))
]

# Display stats for Scenario B
unusual_outliers.describe().T
# Insight: Unusually high shares for weekend business articles hint at special events or changing reader habits influencing engagement.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset.
# NOTE(review): the path is relative to the notebook's working directory — confirm it resolves.
df = pd.read_csv('./mnt/data/mini_project_1_data.csv')

# Jupyter only echoes the LAST expression of a cell, so every intermediate
# inspection result is sent to display()/print() explicitly
# (display() is provided by the IPython kernel).
display(df.head())

# Display basic information about the dataset (info() prints by itself)
df.info()

# Check for missing values
display(df.isnull().sum())

# Summary statistics of the dataset
display(df.describe().T)

# Check for duplicate entries: how many there are, and which rows they are
print(f"Duplicate rows: {df.duplicated().sum()}")
display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)))

# Drop duplicate entries (reassign instead of mutating in place)
df = df.drop_duplicates()

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# ---------- Low Attendance, High Performance [Male/Female] ----------
# NOTE(review): these columns describe students, while every other cell of this
# notebook treats the same CSV as article data (shares, num_imgs, ...). The
# analysis is therefore guarded so the cell does not raise KeyError when the
# columns are absent — confirm whether this cell belongs in this notebook.
required_columns = ['gender', 'attendance_rate', 'final_test', 'number_of_siblings']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    print(f"Skipping student outlier analysis; columns not in dataset: {missing_columns}")
else:
    # Filter the data for gender outliers
    outliers = df[
        (df['gender'].isin(['Male', 'Female']))
        & (df['attendance_rate'] < 60)
        & (df['final_test'] > 85)]

    # Display the gender outliers (explicitly: this is not the cell's last expression)
    display(outliers)
    # Insight: Exception cases where students perform well despite missing classes

    # Further filter the outliers found above down to students without siblings,
    # so the subset stays low-attendance AND high-performing
    no_sibling_outliers = outliers[outliers['number_of_siblings'] == 0]

    # Display the no-sibling outliers
    display(no_sibling_outliers.describe().T)
    # Why: Find high-performing independent students who thrive despite not attending and having no siblings (fewer distractions?)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score, 
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, classification_report
)
from math import sqrt

# Load the dataset.
# NOTE(review): this cell reads the absolute path '/mnt/data/...' while the other
# cells read './mnt/data/...' — confirm which location is correct.
df = pd.read_csv('/mnt/data/mini_project_1_data.csv')

# Display basic information about the dataset. Mid-cell expressions are not
# echoed by Jupyter, so the preview is displayed explicitly (info() prints itself).
df.info()
display(df.head())

# Check for missing values
display(df.isnull().sum())

# Summary statistics of the dataset
display(df.describe().T)

# Check for duplicate entries
print(f"Duplicate rows: {df.duplicated().sum()}")
duplicates = df[df.duplicated(keep=False)].sort_values(by=list(df.columns))
display(duplicates)

# Drop duplicate entries, then reset the index so it is contiguous after cleaning
df = df.drop_duplicates().reset_index(drop=True)

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# Basic cleanup
# Rows without a share count cannot be labelled; comparing NaN with the median
# would silently put them in class 0.
df = df.dropna(subset=['shares']).copy()

# Binary target: 1 when an article's shares are above the median, else 0
df['high_share'] = (df['shares'] > df['shares'].median()).astype(int)

# Normalise weekday labels (trim, fix the known typos, capitalise)
weekday_typos = {
    'moday': 'monday', 'tuesdy': 'tuesday', 'wednsday': 'wednesday',
    'thusdy': 'thursday', 'frday': 'friday', 'satday': 'saturday', 'sunda': 'sunday'
}
df['weekday'] = df['weekday'].str.strip().str.lower().replace(weekday_typos).str.capitalize()

# Normalise channel labels; title-case yields 'Social Media', matching the
# spelling used for this column in the count-plot cell
df['data_channel'] = (
    df['data_channel'].fillna('Unknown').str.strip().str.lower().str.replace('_', ' ').str.title()
)

# Prepare features. 'high_share' is the target and 'shares' is what it is derived
# from, so BOTH must be removed from the features — leaving 'high_share' in x
# lets the models read the answer directly (target leakage).
x = df.drop(columns=['shares', 'high_share', 'ID', 'URL'])
y = df['high_share']

# Drop rows where any x value is missing
x = x.dropna()
y = y.loc[x.index]  # Align y accordingly

# Feature groups, derived from the column dtypes of x
# (the hard-coded lists that used to precede these lines named columns of a
# different dataset and were overwritten immediately, so they were removed)
num_features = x.select_dtypes(include='number').columns.tolist()
cat_features = x.select_dtypes(include='object').columns.tolist()


def make_preprocessor():
    """Return a new, unfitted preprocessor: scale numeric columns, one-hot encode categoricals."""
    return ColumnTransformer(transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])


# Stand-alone preprocessor, fitted only to read the expanded feature names
preprocessor = make_preprocessor()
preprocessor.fit(x)
feature_names = num_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out(cat_features))

# Pipelines. Each one gets its OWN preprocessor instance: Pipeline.fit fits its
# steps in place, so a shared instance would be refitted by every pipeline.
cls_pipeline = Pipeline([('Preprocessor', make_preprocessor()), ('Classification', LogisticRegression(max_iter=1000))])
linear_pipeline = Pipeline([('Preprocessor', make_preprocessor()), ('LinearRegression', LinearRegression())])
ridge_pipeline = Pipeline([('Preprocessor', make_preprocessor()), ('RidgeRegression', Ridge(alpha=1))])
lasso_pipeline = Pipeline([('Preprocessor', make_preprocessor()), ('LassoRegression', Lasso(alpha=1))])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train every baseline pipeline on the training split
baseline_pipelines = (cls_pipeline, linear_pipeline, ridge_pipeline, lasso_pipeline)
for baseline in baseline_pipelines:
    baseline.fit(x_train, y_train)

# Test-set predictions, in the same order as `baseline_pipelines`
y_pred_cls, y_pred_linear, y_pred_ridge, y_pred_lasso = [
    baseline.predict(x_test) for baseline in baseline_pipelines
]

# Logistic Regression (Classification) metrics on the test split
cls_metrics = dict([
    ("Report", classification_report(y_test, y_pred_cls, output_dict=True)),
    ("Accuracy", accuracy_score(y_test, y_pred_cls)),
    ("Precision", precision_score(y_test, y_pred_cls)),
    ("Recall", recall_score(y_test, y_pred_cls)),
    ("F1 Score", f1_score(y_test, y_pred_cls)),
])


# Grid search hyperparameter tuning
param_grid_cls = {
    'Classification__penalty': ['l2'], 'Classification__C': [0.1, 1, 10, 100, 1000],
    'Classification__fit_intercept': [True, False], 'Classification__solver': ['lbfgs']
}

# One grid per pipeline: every key must name a step of the pipeline being
# searched. A grid holding both 'RidgeRegression__*' and 'LassoRegression__*'
# keys makes GridSearchCV fail with "Invalid parameter" for either pipeline.
param_grid_ridge = {
    'RidgeRegression__alpha': [0.1, 1, 10, 100, 1000], 'RidgeRegression__fit_intercept': [True, False]
}
param_grid_lasso = {
    'LassoRegression__alpha': [0.1, 1, 10, 100, 1000], 'LassoRegression__fit_intercept': [True, False]
}

# Combined mapping kept under its original name for code that still refers to it
param_grid = {**param_grid_ridge, **param_grid_lasso}

# Grid Search
cls_grid = GridSearchCV(cls_pipeline, param_grid_cls, cv=5, scoring='accuracy')
ridge_grid = GridSearchCV(ridge_pipeline, param_grid_ridge, cv=5, scoring='r2', n_jobs=-1)
lasso_grid = GridSearchCV(lasso_pipeline, param_grid_lasso, cv=5, scoring='r2', n_jobs=-1)

cls_grid.fit(x_train, y_train)
ridge_grid.fit(x_train, y_train)
lasso_grid.fit(x_train, y_train)

# Best estimators found by the search (refitted on the whole training split)
best_cls = cls_grid.best_estimator_
best_ridge = ridge_grid.best_estimator_
best_lasso = lasso_grid.best_estimator_

# Test-set predictions of the tuned models
y_pred_best_cls = best_cls.predict(x_test)
y_pred_best_ridge = best_ridge.predict(x_test)
y_pred_best_lasso = best_lasso.predict(x_test)

print("Best Logistic (Classification) (GridSearchCV):", cls_grid.best_params_)
print("Best Ridge (GridSearchCV):", ridge_grid.best_params_)
print("Best Lasso (GridSearchCV):", lasso_grid.best_params_)


# Randomized Grid Search
# Each search may only receive parameters of its own pipeline, so the shared
# `param_grid` is split by step prefix; passing the mixed mapping raises
# "Invalid parameter" for the step the pipeline does not have.
ridge_param_dist = {key: values for key, values in param_grid.items() if key.startswith('RidgeRegression__')}
lasso_param_dist = {key: values for key, values in param_grid.items() if key.startswith('LassoRegression__')}

# random_state is set on every search so the sampled candidates are repeatable
cls_random = RandomizedSearchCV(cls_pipeline, param_distributions=param_grid_cls, cv=5, scoring='accuracy', random_state=42)
ridge_random = RandomizedSearchCV(ridge_pipeline, param_distributions=ridge_param_dist, n_iter=10, cv=5, scoring='r2', random_state=42, n_jobs=-1)
lasso_random = RandomizedSearchCV(lasso_pipeline, param_distributions=lasso_param_dist, n_iter=10, cv=5, scoring='r2', random_state=42, n_jobs=-1)

cls_random.fit(x_train, y_train)
ridge_random.fit(x_train, y_train)
lasso_random.fit(x_train, y_train)

# Best estimators found by the randomized search
bestRM_cls = cls_random.best_estimator_
bestRM_ridge = ridge_random.best_estimator_
bestRM_lasso = lasso_random.best_estimator_

# Test-set predictions of the randomized-search models
y_pred_bestRM_cls = bestRM_cls.predict(x_test)
y_pred_bestRM_ridge = bestRM_ridge.predict(x_test)
y_pred_bestRM_lasso = bestRM_lasso.predict(x_test)

print("Best Logistic (Classification) (Randomized):", cls_random.best_params_)
print("Best Ridge (Randomized):", ridge_random.best_params_)
print("Best Lasso (Randomized):", lasso_random.best_params_)


# Evaluation helpers
def evaluate_cls(name, y_true, y_pred):
    """Print the classification report and four summary scores for one model.

    Parameters
    ----------
    name : label printed in front of the report heading.
    y_true : true class labels.
    y_pred : predicted class labels.
    """
    report_text = classification_report(y_true, y_pred)
    print(f"{name} Classification Report:\n", report_text)

    # One line per score; the last line carries the trailing blank line
    score_lines = [
        f"Accuracy:  {accuracy_score(y_true, y_pred):.4f}",
        f"Precision: {precision_score(y_true, y_pred):.4f}",
        f"Recall:    {recall_score(y_true, y_pred):.4f}",
        f"F1 Score:  {f1_score(y_true, y_pred):.4f}\n",
    ]
    print("\n".join(score_lines))

def evaluate_reg(name, y_true, y_pred):
    """Print MAE, MSE, RMSE and R² for one regression model.

    Parameters
    ----------
    name : label printed in front of the metrics heading.
    y_true : true target values.
    y_pred : predicted target values.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = sqrt(mse)  # RMSE is the square root of the MSE computed above
    r2 = r2_score(y_true, y_pred)

    print(f"{name} Regression Metrics:")
    print(f"MAE:  {mae:.4f}")
    print(f"MSE:  {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R²:   {r2:.4f}\n")

# Report the untuned classifier first
evaluate_cls("Logistic", y_test, y_pred_cls)

# Then the regressors: each banner is followed by the models reported under it
regression_sections = [
    ("------ Evaluation (Untuned) ------",
     [("Ridge", y_pred_ridge), ("Lasso", y_pred_lasso)]),
    ("------ Best Evaluation (GridSearch) ------",
     [("Best Ridge", y_pred_best_ridge), ("Best Lasso", y_pred_best_lasso)]),
]
for banner, model_predictions in regression_sections:
    print(banner)
    for model_label, predictions in model_predictions:
        evaluate_reg(model_label, y_test, predictions)

# Confusion Matrix of the untuned classifier on the test split
cm = confusion_matrix(y_test, y_pred_cls)

# Coefficients of the four untuned models
cls_coefs = cls_pipeline.named_steps['Classification'].coef_.flatten()
linear_coefs = linear_pipeline.named_steps['LinearRegression'].coef_
ridge_coefs = ridge_pipeline.named_steps['RidgeRegression'].coef_
lasso_coefs = lasso_pipeline.named_steps['LassoRegression'].coef_

# Take the feature names from the preprocessor the models were actually fitted
# with (training split). Names obtained from a fit on the full x can contain
# one-hot categories absent from the training rows, and the DataFrame below
# would then fail on arrays of different lengths.
fitted_preprocessor = cls_pipeline.named_steps['Preprocessor']
feature_names = num_features + list(
    fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(cat_features)
)

# Long format: one row per (feature, model), sorted by coefficient value
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Logistic [Classification]': cls_coefs,
    'Linear Regression': linear_coefs,
    'Ridge Regression': ridge_coefs,
    'Lasso Regression': lasso_coefs
}).melt(id_vars='Feature', var_name='Model', value_name='Coefficient').sort_values(by='Coefficient', ascending=False)

plt.figure(figsize=(12, 10))
sns.barplot(data=coef_df, x='Feature', y='Coefficient', hue='Model', palette=['red', 'blue', 'green', 'yellow'])
plt.xticks(rotation=45, ha='right')
plt.xlabel('Feature')
plt.ylabel('Coefficient')
plt.title('Regression (& Classification) Coefficients')
plt.legend(loc='upper left')
plt.tight_layout()
plt.grid(True, axis='y')
plt.show()


# Final test set evaluation
def _regression_metrics(y_true, y_hat):
    """Return (MAE, MSE, RMSE, R²) for one set of predictions."""
    mse = mean_squared_error(y_true, y_hat)
    return mean_absolute_error(y_true, y_hat), mse, sqrt(mse), r2_score(y_true, y_hat)


# Plain linear regression has no tuned variant (no search is run for it), so
# the fitted `linear_pipeline` is evaluated; `best_linear` was never defined
# and raised NameError.
y_test_pred_linear = linear_pipeline.predict(x_test)
test_MAE_Linear, test_MSE_Linear, test_RMSE_Linear, test_R2_Linear = _regression_metrics(y_test, y_test_pred_linear)

y_test_pred_ridge = best_ridge.predict(x_test)
test_MAE_Ridge, test_MSE_Ridge, test_RMSE_Ridge, test_R2_Ridge = _regression_metrics(y_test, y_test_pred_ridge)

y_test_pred_lasso = best_lasso.predict(x_test)
test_MAE_Lasso, test_MSE_Lasso, test_RMSE_Lasso, test_R2_Lasso = _regression_metrics(y_test, y_test_pred_lasso)


# Evaluate the model
print("Best Linear Regression Report, Final Test Metrics:")
print(f"Final Test MAE: {test_MAE_Linear:.4f}")
print(f"Final Test MSE: {test_MSE_Linear:.4f}")
print(f"Final Test RMSE: {test_RMSE_Linear:.4f}")
print(f"Final Test R²: {test_R2_Linear:.4f}")

print("Best Ridge Regression Report, Final Test Metrics:")
print(f"Final Test MAE: {test_MAE_Ridge:.4f}")
print(f"Final Test MSE: {test_MSE_Ridge:.4f}")
print(f"Final Test RMSE: {test_RMSE_Ridge:.4f}")
print(f"Final Test R²: {test_R2_Ridge:.4f}")

print("Best Lasso Regression Report, Final Test Metrics:")
print(f"Final Test MAE: {test_MAE_Lasso:.4f}")
print(f"Final Test MSE: {test_MSE_Lasso:.4f}")
print(f"Final Test RMSE: {test_RMSE_Lasso:.4f}")
print(f"Final Test R²: {test_R2_Lasso:.4f}")


# Heatmap of the confusion matrix `cm` (rows = actual class, columns = predicted class)
class_labels = ['No', 'Yes']
heatmap_ax = sns.heatmap(
    cm,
    annot=True,   # write the count inside every cell
    fmt='d',      # counts are formatted as integers
    cmap='Oranges',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_title('Confusion Matrix')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()