In [None]:
# --- Data handling ---
import pandas as pd
import numpy as np

# --- Plotting ---
import matplotlib.pyplot as plt
# Available styles
# ['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn-v0_8', 'seaborn-v0_8-bright', 'seaborn-v0_8-colorblind', 'seaborn-v0_8-dark', 'seaborn-v0_8-dark-palette', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8-deep', 'seaborn-v0_8-muted', 'seaborn-v0_8-notebook', 'seaborn-v0_8-paper', 'seaborn-v0_8-pastel', 'seaborn-v0_8-poster', 'seaborn-v0_8-talk', 'seaborn-v0_8-ticks', 'seaborn-v0_8-white', 'seaborn-v0_8-whitegrid', 'tableau-colorblind10']
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# --- Modelling (scikit-learn) ---
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
# NOTE: the class is `GradientBoostingRegressor` (singular); the previous
# trailing "s" made this whole cell fail with ImportError.
from sklearn.ensemble import GradientBoostingRegressor
# Imported once here; it is used by both the single-target and the
# multi-output random-forest cells further down.
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Location of the raw Instagram export on the author's machine.
# NOTE(review): this is an absolute local path; anyone else running the
# notebook has to point it at their own copy of Instagram.csv.
csv_path = "/Users/aniketindulkar/Documents/GitHub/MLProjects/InstagramAnalysis/Instagram.csv"

# Read the file as latin1 and keep only the rows that have no missing values.
data = pd.read_csv(csv_path, encoding='latin1').dropna()

In [None]:
# List the column labels first, then leave the preview as the cell's final
# expression: Jupyter only renders the value of the LAST expression in a cell,
# so a bare `data.head()` placed before another statement is evaluated and
# silently thrown away.
print(data.columns)
data.head()

In [None]:
# Set the style globally for all plots
plt.style.use('seaborn-v0_8')

# Impression-source columns to plot, listed in the order the panels are filled
# (left to right, then top to bottom).
impression_sources = ['From Home', 'From Hashtags', 'From Explore', 'From Other']

# Initialize a figure with a 2x2 grid of subplots, one panel per source
fig, axs = plt.subplots(2, 2, figsize=(15, 10))

# `axs.flat` walks the 2x2 grid row by row, so each source gets its own panel.
for ax, source in zip(axs.flat, impression_sources):
    sns.histplot(data[source], ax=ax, kde=False)
    # Build the title from the column name so the label can never drift away
    # from the data actually drawn (the 'From Other' panel used to be titled
    # "From Explore").
    ax.set_title(f"Distribution of Impressions {source}")

# Adjust the layout so titles and tick labels do not overlap
plt.tight_layout()

# Display the combined plot
plt.show()

In [None]:
# DataFrame columns holding the per-post impression counts for each source,
# in the same order as the pie-chart labels below.
source_columns = ["From Home", "From Hashtags", "From Explore", "From Other"]

# Total impressions per source across all posts.
home, hashtags, explore, other = (data[column].sum() for column in source_columns)

# Segment labels and their matching totals (same order).
labels = ['From Home', 'From Hashtags', 'From Explore', 'Other']
values = [home, hashtags, explore, other]

# Donut chart (hole radius 0.5) of where impressions come from.
# `values` and `names` are supplied as plain lists; `data` is passed along as
# the data_frame argument exactly as before.
fig = px.pie(
    data,
    values=values,
    names=labels,
    title='Impressions on Instagram Posts From Various Sources',
    hole=0.5,
)

# Display the figure
fig.show()

In [None]:
# Words the cloud should skip (the default English stopword list shipped with
# the wordcloud package). Kept as a notebook-level name because the hashtag
# word cloud reuses it.
stopwords = set(STOPWORDS)

# One long string made of every caption, separated by single spaces.
text = " ".join(data.Caption)

# Build the cloud on a white background from the combined caption text.
cloud_builder = WordCloud(background_color="white", stopwords=stopwords)
wordcloud = cloud_builder.generate(text)

# Switch matplotlib to the 'classic' style for this and later figures.
plt.style.use('classic')

# 12x10 inch canvas; bilinear interpolation smooths the rendered image.
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation='bilinear')

# Hide the axes: ticks carry no meaning for a word cloud.
plt.axis("off")

# Render the figure.
plt.show()


In [None]:
# Combine every post's hashtags into one space-separated string.
text = " ".join(data.Hashtags)

# Build the word cloud, reusing the `stopwords` set from the caption cell.
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)

# Draw it on a 12x10 inch figure with the axes hidden.
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

# Explicitly render the figure, matching the caption word cloud cell; without
# this the cell's output is the tuple returned by `plt.axis("off")`.
plt.show()

In [None]:
# Likes against impressions: marker size follows the like count and an
# ordinary-least-squares trendline is drawn through the points.
figure = px.scatter(
    data,
    x="Impressions",
    y="Likes",
    size="Likes",
    trendline="ols",
    title="Relationship Between Likes and Impressions",
)
figure.show()

In [None]:
# Comments against impressions: marker size follows the comment count and an
# ordinary-least-squares trendline is drawn through the points.
figure = px.scatter(
    data,
    x="Impressions",
    y="Comments",
    size="Comments",
    trendline="ols",
    title="Relationship Between Comments and Total Impressions",
)
figure.show()

In [None]:
# Shares against impressions: marker size follows the share count and an
# ordinary-least-squares trendline is drawn through the points.
figure = px.scatter(
    data,
    x="Impressions",
    y="Shares",
    size="Shares",
    trendline="ols",
    title="Relationship Between Shares and Total Impressions",
)
figure.show()

In [None]:
# Saves against impressions: marker size follows the save count and an
# ordinary-least-squares trendline is drawn through the points.
figure = px.scatter(
    data,
    x="Impressions",
    y="Saves",
    size="Saves",
    trendline="ols",
    title="Relationship Between Post Saves and Total Impressions",
)
figure.show()

In [None]:
# Coerce every column to numbers; values that cannot be parsed become NaN
# (errors='coerce'), so columns holding non-numeric text end up NaN instead of
# raising.
data_numeric = data.apply(lambda column: pd.to_numeric(column, errors='coerce'))

# Pairwise correlation between all (coerced) columns.
correlation = data_numeric.corr()
print(correlation)

In [None]:
# Conversion rate = follows gained as a percentage of profile visits,
# computed over the totals of all posts.
total_follows = data["Follows"].sum()
total_profile_visits = data["Profile Visits"].sum()
conversion_rate = total_follows / total_profile_visits * 100
print(conversion_rate)

In [None]:
# Follows against profile visits: marker size follows the follow count and an
# ordinary-least-squares trendline is drawn through the points.
figure = px.scatter(
    data,
    x="Profile Visits",
    y="Follows",
    size="Follows",
    trendline="ols",
    title="Relationship Between Profile Visits and Followers Gained",
)
figure.show()

In [None]:
# Engagement columns used as model inputs. The column order matters: the
# hand-written sample passed to `model.predict` later follows the same order.
feature_columns = ['Likes', 'Saves', 'Comments', 'Shares', 'Profile Visits', 'Follows']

# Feature matrix and target vector as NumPy arrays; the target is the total
# number of impressions per post.
x = np.array(data[feature_columns])
y = np.array(data["Impressions"])

# Hold out 20% of the rows for testing. random_state=42 fixes the shuffle so
# the same rows land in each split on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

    

**Passive Aggressive Regressor** is typically used for large-scale learning and is particularly well-suited for scenarios where you have continuously arriving data (streaming data). It is called "passive-aggressive" because the algorithm remains passive for correct predictions (i.e., when the prediction is correct or the error is within a margin), but turns aggressive when a prediction is incorrect, making adjustments to correct the mistake.

### Key Characteristics:
- **Online Learning**: This model updates continuously as new data arrives, making it suitable for systems that ingest data in a streaming fashion.
- **Adaptability**: The model can quickly adapt to new patterns in the data, which makes it robust in environments where data patterns can change over time.
- **Margin Infraction**: For regression, the model leaves its weights unchanged while the prediction error stays within a tolerance (`epsilon`) and updates them only when the error exceeds that tolerance, with the size of the update growing with the size of the violation.
- **Usage**: It's commonly used in tasks like real-time decision-making and situations where the model needs to quickly adapt to new data without retraining from scratch.

The **PassiveAggressiveRegressor** is used here to predict a continuous target variable based on several input features. The fitting process adjusts the model weights based on the training data, and the performance is evaluated using the R^2 statistic, which measures the proportion of variance in the dependent variable that is predictable from the independent variables. This score can range from -∞ to 1, where a value closer to 1 indicates a better fit to the data.


In [None]:
# Initialize the Passive Aggressive Regressor model
# random_state is fixed because the estimator shuffles the training data
# between passes by default; without a seed the reported score changes from
# run to run. 42 matches the seed used for the train/test split.
model = PassiveAggressiveRegressor(random_state=42)

# Fit the model on the training dataset
# This step involves the model learning the relationship between the input features (xtrain) and the target variable (ytrain)
model.fit(xtrain, ytrain)

# Evaluate the model on the testing dataset
# The score method returns the coefficient of determination R^2 of the prediction,
# which is a measure of how well the unseen samples are predicted by the model
score = model.score(xtest, ytest)
print("Model R^2 Score:", score)

### Linear Regression Model

**Linear Regression** is a foundational statistical method used in machine learning for predictive modeling. It aims to model the relationship between a scalar dependent variable `y` and one or more independent variables (or "predictors") denoted `X`.

#### How It Works:
- The model works by estimating coefficients for the linear equation, involving one or more independent variables that best predict the dependent variable.
- The linear equation can be expressed as:
  $$ y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \dots + \beta_n x_n + \epsilon $$
  where $\beta_0, \beta_1, \dots, \beta_n$ are coefficients, and $\epsilon$ is the error term.

#### Key Characteristics:
- **Simplicity**: Linear regression is straightforward to understand and explain, making it a good starting point for predictive modeling.
- **Interpretability**: Each coefficient in the model explains the influence of one independent variable on the dependent variable.
- **Assumptions**: The effectiveness of the model depends on whether certain assumptions are met, including linearity, independence, homoscedasticity, and normal distribution of errors.

#### Usage:
- It is widely used in both statistics and machine learning for tasks such as trend forecasting, determining the strength of predictors, and forecasting an effect.
- Common applications include economics (predicting economic growth), business (forecasting sales), and health sciences (predicting outcomes of treatments).

#### Evaluation:
- The model's performance can be evaluated using the R² statistic, which measures the proportion of variance in the dependent variable that is predictable from the independent variables. A higher R² indicates a better fit to the data.

This model is particularly useful when there is a linear relationship between the input variables and the target output. Despite its simplicity, linear regression can provide powerful insights into data and is a staple in the toolbox of any data analyst or scientist.

In [None]:
# Ordinary least squares: create the estimator and fit it on the training
# split in one step (`fit` returns the fitted estimator itself).
model = LinearRegression().fit(xtrain, ytrain)

# R^2 on the held-out split: the share of the target's variance that the
# fitted model explains on data it has not seen.
model_score = model.score(xtest, ytest)
print("Model R^2 Score:", model_score)


### Ridge Regression Model

**Ridge Regression**, also known as Tikhonov regularization, is a technique used for analyzing multiple regression data that suffer from multicollinearity. By introducing a degree of bias to the regression estimates, Ridge Regression reduces the standard errors.

#### How It Works:
- Ridge Regression adds a penalty equivalent to the square of the magnitude of the coefficients to the least squares cost function. This penalty is parameterized by `alpha`.
- The mathematical formulation is:
  $$ J(\beta) = \lVert Y - X\beta \rVert_2^2 + \alpha \lVert \beta \rVert_2^2 $$
  where $J(\beta)$ is the cost function, $\beta$ is the coefficient vector, $X$ is the feature matrix, $Y$ is the target vector, and $\alpha$ is the regularization strength.

#### Key Characteristics:
- **Regularization**: The key feature of Ridge Regression is its ability to reduce model complexity by penalizing large coefficients through the `alpha` parameter.
- **Shrinkage**: As `alpha` increases, the flexibility of the Ridge model decreases, leading to a decrease in variance but a slight increase in bias.
- **Multicollinearity**: It is particularly useful when data features are multicollinear or when the feature dimension is high relative to the number of data points.

#### Usage:
- Commonly used in scenarios where the number of predictor variables in a set exceeds the number of observations, or when a data set has multicollinearity (i.e., independent variables are highly correlated).
- Employed in various fields including economics, biology, and engineering, particularly where predictive accuracy and robustness against multicollinearity are important.

#### Evaluation:
- The effectiveness of Ridge Regression is typically measured by the R² statistic on the test data, which provides an indication of how well unseen samples are likely to be predicted.
- The choice of `alpha` significantly affects the model’s performance, and it is often selected via cross-validation.

Ridge Regression offers a robust alternative to standard linear regression, especially useful in situations where a simple least squares estimate proves inadequate or unstable.


In [None]:
# Ridge regression with an L2 penalty of strength alpha=1.0; a larger alpha
# shrinks the coefficients more strongly.
# Creating and fitting in one step works because `fit` returns the estimator.
model = Ridge(alpha=1.0).fit(xtrain, ytrain)

# R^2 on the held-out split.
model_score = model.score(xtest, ytest)
print("Model R^2 Score:", model_score)

### Lasso Regression Model

**Lasso Regression** (Least Absolute Shrinkage and Selection Operator) is a type of linear regression that uses shrinkage. Shrinkage here means that the coefficient estimates are pulled towards zero, and some of them can become exactly zero. The Lasso procedure therefore encourages simple, sparse models (i.e., models with fewer non-zero parameters).

#### How It Works:
- Lasso Regression adds a penalty equal to the absolute value of the magnitude of coefficients to the loss function. This type of regularization (L1) can lead to zero coefficients in some variables, effectively performing variable selection.
- The mathematical formulation is:
  $$ J(\beta) = \frac{1}{2n} \lVert Y - X\beta \rVert_2^2 + \alpha \lVert \beta \rVert_1 $$
  where $J(\beta)$ is the cost function, $\beta$ is the coefficient vector, $X$ is the feature matrix, $Y$ is the target vector, $n$ is the number of samples, and $\alpha$ is the regularization strength.

#### Key Characteristics:
- **Variable Selection**: Lasso Regression can yield sparse models where only a subset of the predictors are used, which is useful for models that benefit from variable reduction.
- **Regularization**: By adjusting `alpha`, users can control the impact of the penalty on the model complexity. Higher values of `alpha` force more coefficients to zero.
- **Multicollinearity Handling**: Similar to Ridge Regression, Lasso can manage multicollinearity among predictors, although its method of shrinking coefficients can differ significantly.

#### Usage:
- Lasso is widely used when many features are present, but only some of them are expected to be important for prediction. This is typical in fields like genomics where the number of predictors (genes) can be very large compared to the number of observations.
- Employed in various predictive models where model simplicity and interpretability are essential.

#### Evaluation:
- The performance of Lasso Regression is also measured by the R² statistic on the test data, giving a sense of fit quality.
- The selection of `alpha` is critical and can be fine-tuned using techniques like cross-validation.

Lasso Regression is particularly useful for creating parsimonious models in situations where a simple and interpretable solution is preferable to a complex one. Its ability to reduce the number of predictor variables by setting some coefficient estimates to zero helps in identifying the most significant features.


In [None]:
# Lasso regression with an L1 penalty of strength alpha=0.1; the larger the
# alpha, the more coefficients are pushed all the way to zero.
# Creating and fitting in one step works because `fit` returns the estimator.
model = Lasso(alpha=0.1).fit(xtrain, ytrain)

# R^2 on the held-out split, i.e. how the model compares with always
# predicting the mean of the target.
model_score = model.score(xtest, ytest)
print("Model R^2 Score:", model_score)


### Decision Tree Regression Model

**Decision Tree Regression** utilizes a decision tree (a flowchart-like structure) to model the decision-making process over continuous data. It is a non-linear model that is powerful for capturing complex patterns in data sets, particularly useful when linear assumptions do not hold.

#### How It Works:
- Decision Tree models split the input data into subsets based on decisions made upon feature values. These decisions are represented as nodes in the tree.
- The tree is constructed by recursively partitioning data into branches, which represents an inference about the target values to be predicted.
- The mathematical goal is to reduce variability in predictions by splitting the dataset on the feature that results in the highest decrease in sum of squared error (SSE).

#### Key Characteristics:
- **Interpretability**: One of the most significant advantages of decision trees is their ease of interpretation. They can be visualized graphically and understood even by non-experts.
- **Non-parametric**: They do not assume any distribution of the data, making them suitable for non-linear data patterns.
- **Control Over-fitting**: `max_depth` controls the size of the tree to prevent overfitting. Smaller trees are less complex and generalize better.

#### Usage:
- Decision trees are widely used in real-world applications where relationships between features and outcomes are complex and non-linear.
- They are popular in finance for pricing options, in energy for predicting load, and in operations for predicting failure times of machines.

#### Evaluation:
- The model’s performance is evaluated using the R² statistic, which measures the proportion of variance in the dependent variable that is predictable from the independent variables.
- Performance can be further enhanced by pruning the tree after it has been built, which involves removing parts of the tree that provide little power in predicting target variables.

Decision Tree Regression is useful in scenarios where data relationships are intricate and an understandable model is crucial for decision making. Its ability to break down data hierarchically fits well with human decision-making processes.


In [None]:
# Initialize the Decision Tree Regressor model with a maximum depth of 5
# The `max_depth` parameter controls the maximum number of levels in the tree, which helps prevent overfitting
# random_state is fixed because the tree considers features in a random order
# at each split, so ties between equally good splits can otherwise be resolved
# differently from run to run. 42 matches the seed used for the train/test split.
model = DecisionTreeRegressor(max_depth=5, random_state=42)

# Fit the Decision Tree Regressor model on the training dataset
# This involves building a tree that models the relationship between the input features and the target variable
model.fit(xtrain, ytrain)

# Evaluate the model on the testing dataset
# The score method returns the coefficient of determination R^2 of the prediction,
# indicating how well the predictions approximate the true data values
model_score = model.score(xtest, ytest)
print("Model R^2 Score:", model_score)

### Support Vector Regression (SVR) Model

**Support Vector Regression (SVR)** applies the principles of support vector machines (SVM) to regression problems. It attempts to fit the best line within a threshold error margin and is well-suited for both linear and non-linear data, depending on the kernel used.

#### How It Works:
- SVR performs linear regression in a high-dimensional feature space using kernel tricks. The objective is to find a function that deviates from the targets by a value no greater than a specified margin ε, while being as flat as possible.
- The RBF (Radial Basis Function) kernel used here allows the SVR model to handle non-linear relationships by implicitly mapping input features into a higher-dimensional space in which a linear function can fit the data.

#### Key Characteristics:
- **Flexibility in Handling Non-linearity**: The choice of kernel (e.g., linear, polynomial, RBF) determines the ability of the SVR model to handle non-linear relationships.
- **Margin of Error Tunability**: The ε-insensitive tube (margin of error) within which no penalty is given to errors offers a unique advantage in controlling what differences are considered errors.
- **Regularization**: The regularization parameter (C) can be tuned to optimize the trade-off between achieving a low error on the training data and minimizing the model complexity for better generalization.

#### Usage:
- SVR is commonly used in financial markets for predicting prices, in energy consumption for load forecasting, and in any sector requiring robustness against outliers or the capability to model non-linear phenomena effectively.

#### Evaluation:
- Performance is typically measured using the coefficient of determination (R²), which reflects the proportion of variance in the target variable explained by the independent variables in the model.
- Due to its complexity and computational cost, proper parameter tuning (kernel type, C, ε) is critical for the performance and efficiency of the model.

SVR is a powerful tool when it comes to regression tasks involving complex datasets where traditional regression methods might fall short. Its ability to model non-linear relationships and control over the error margin make it a versatile option for many predictive modeling scenarios.


In [None]:
# Support Vector Regression with the RBF (radial basis function) kernel, which
# lets the model capture non-linear relationships.
# Creating and fitting in one step works because `fit` returns the estimator.
# NOTE(review): the features are passed in unscaled; RBF SVR is usually
# sensitive to feature scale -- confirm whether scaling was intended.
model = SVR(kernel='rbf').fit(xtrain, ytrain)

# R^2 on the held-out split.
model_score = model.score(xtest, ytest)
print("Model R^2 Score:", model_score)

### RandomForest Regression Model

**RandomForest Regression** is an ensemble learning method that operates by constructing a multitude of decision trees at training time and outputting the average prediction of the individual trees. It combines the simplicity of decision trees with flexibility, resulting in higher accuracy without a significant increase in computational complexity.

#### How It Works:
- **Ensemble Method**: RandomForest builds multiple decision trees and merges them together to get a more accurate and stable prediction.
- **Training Process**: Each tree in the forest is built from a random sample of the training set, which helps in making the model robust against overfitting.
- **Prediction**: When making predictions, the RandomForest averages the predictions of the individual trees to improve accuracy and control over-fitting.

#### Key Characteristics:
- **Robustness**: Due to its method of averaging multiple trees, RandomForest is less prone to overfitting than a single decision tree.
- **Handle Large Data Sets with Higher Dimensionality**: It can handle thousands of input variables without variable deletion, which is great for cases where features are on the higher side.
- **Versatility**: Useful for both classification and regression tasks and does well on large datasets.

#### Usage:
- RandomForest is widely used across many industries including banking (for credit scoring and fraud detection), healthcare (for identifying diseases and predicting drug responses), and e-commerce (for predicting customer behavior and preferences).

#### Evaluation:
- **R² Score**: Reflects the proportion of variance in the dependent variable that is predictable from the independent variables.
- **Parameter Tuning**: Performance can often be increased by tuning parameters such as the number of trees in the forest, the number of features considered at each split, the maximum depth of the trees, etc.

RandomForest Regression is especially popular due to its ease of use, performance, and robustness, making it a preferred choice for many predictive modeling tasks.


In [None]:
# Initialize the RandomForestRegressor model with 100 trees in the forest
# n_estimators=100 specifies that the forest should consist of 100 trees
# random_state=42 fixes the bootstrap sampling and feature selection so the
# score is reproducible; it matches the seed used by the multi-output
# random-forest pipeline later in the notebook.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the RandomForest model on the training dataset
# This involves building multiple decision trees during the training phase and outputting the mean prediction of the individual trees
model.fit(xtrain, ytrain)

# Evaluate the model on the testing dataset
# The score method returns the coefficient of determination R^2 of the prediction,
# which indicates how well the model explains the variation in the dependent variable from the independent variables
model_score = model.score(xtest, ytest)
print("Model R^2 Score:", model_score)

### Gradient Boosting Regression Model

**Gradient Boosting Regression** is an advanced ensemble technique that builds models sequentially, each correcting its predecessor, thus improving the predictive accuracy of the final model.

#### How It Works:
- **Sequential Modeling**: Gradient Boosting builds the model in a stage-wise fashion. It constructs new models that predict the residuals or errors of prior models and then adds these new models to the ensemble.
- **Loss Minimization**: Each new model is fitted on the remaining unexplained variance (errors) of the previous models in a greedy algorithm fashion, minimizing a loss function.

#### Key Characteristics:
- **Flexibility**: Can be used with different loss functions, making it adaptable to regression and classification problems.
- **Handling Overfitting**: The learning rate and the number of estimators are crucial parameters. A smaller learning rate and more estimators can lead to better performance, but at the risk of overfitting if not tuned properly.
- **Feature Importance**: Gradient Boosting can identify important features, making it valuable for feature selection.

#### Usage:
- Widely used in industries like finance for risk assessment and predictive modeling, in marketing for customer lifetime value predictions, and in medical fields for predictive diagnosis.
- Often used in machine learning competitions due to its effectiveness in handling varied types of data and its ability to produce highly accurate models.

#### Evaluation:
- **R² Score**: Used to measure the proportion of variance in the dependent variable that is predictable from the independent variables.
- **Model Tuning**: Parameters like the number of trees, depth of trees, learning rate, and type of loss function greatly influence the performance and need careful tuning.

Gradient Boosting Regression is favored for its performance and predictive accuracy across a wide range of data types and is considered one of the most powerful techniques available for predictive modeling.


In [None]:
# Initialize the Gradient Boosting Regressor model with specific settings
# n_estimators=100 specifies that the model should use 100 boosting stages to produce a robust prediction model
# learning_rate=0.1 controls the contribution of each tree to the final outcome and can be used to fine-tune the performance of the model
# random_state=42 seeds the individual trees so the fitted model -- which the
# next cell uses for a one-off prediction -- is the same on every run.
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Fit the Gradient Boosting Regressor model on the training dataset
# This involves sequentially adding models to correct the residuals of prior models
model.fit(xtrain, ytrain)

# Evaluate the model on the testing dataset
# The score method returns the coefficient of determination R^2 of the prediction,
# indicating how well the model performs on unseen data
model_score = model.score(xtest, ytest)
print("Model R^2 Score:", model_score)

In [None]:
# One hypothetical post, keyed by feature name. The key order must match the
# column order the model was trained on:
# Likes, Saves, Comments, Shares, Profile Visits, Follows.
sample_post = {
    'Likes': 282.0,
    'Saves': 233.0,
    'Comments': 4.0,
    'Shares': 9.0,
    'Profile Visits': 165.0,
    'Follows': 54.0,
}

# Shape (1, 6): a single row, as `predict` expects a 2-D array.
features = np.array([list(sample_post.values())])

# Predicted impressions for that post, using whichever `model` was fitted most
# recently in the notebook.
model.predict(features)

In [None]:
# --- Inputs and targets -----------------------------------------------------
# Text columns are treated as categorical; everything else is passed to the
# model unchanged.
categorical_features = ['Caption', 'Hashtags']
numeric_features = ['Impressions', 'From Home', 'From Hashtags', 'From Explore',
                    'From Other', 'Saves', 'Profile Visits', 'Follows']

# Feature frame (numeric columns first, then the two text columns) and the
# three engagement counts to predict jointly.
X = data[numeric_features + categorical_features]
y = data[['Likes', 'Shares', 'Comments']]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Preprocessing ----------------------------------------------------------
# One-hot encode the categorical columns; categories not seen during fitting
# are ignored at prediction time rather than raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Encode only the categorical columns and pass the remaining ones through.
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
)

# --- Model ------------------------------------------------------------------
# Preprocess, then fit a seeded 100-tree random forest on all three targets.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])
model.fit(X_train, y_train)

# --- Evaluation -------------------------------------------------------------
# Score on the data the model was trained on and on the held-out data.
training_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Training score:", training_score)
print("Test score:", test_score)

# Predictions for the held-out rows: one column per target, in the order of `y`.
predictions = model.predict(X_test)
print(predictions)



In [None]:
# Report MSE and R² separately for each predicted target.
# The names are listed in the same order as the columns of `y_test` and
# `predictions`, so position `i` selects matching columns from both.
target_names = ['Likes', 'Shares', 'Comments']
for i, target in enumerate(target_names):
    actual = y_test.iloc[:, i]
    predicted = predictions[:, i]
    # Mean squared error and coefficient of determination for this target
    mse = mean_squared_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    print(f"{target} - MSE: {mse}, R²: {r2}")
