### Tasks:
- As a social media platform operator offering various features such as posts, messaging, and
recommendations, we aim to improve user engagement and optimize personalized content delivery.
- To achieve this, we seek to analyze and categorize user behavior patterns based on their activity on the
platform.

In [1]:
# Importing Libraries:
# Data Processing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import LabelEncoder
    
# Visualization
import seaborn as sns # provide statistical graphs
import plotly.graph_objects as go
   
# We can suppress the warnings messages that may appear when we run our code.
import warnings
warnings.filterwarnings('ignore') 
matplotlib inline


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\cleli\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\cleli\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\cleli\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\cleli\anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
    app.start()
  File "C:\Users\cleli\anaconda3\lib\site-pack

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\cleli\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\cleli\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\cleli\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\cleli\anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
    app.start()
  File "C:\Users\cleli\anaconda3\lib\site-pack

AttributeError: _ARRAY_API not found

In [None]:
# Load the posts dataset from the CSV file and preview its first rows.
csv_path = 'Instagram_users-datetime-posts-data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
# Use it to tell categorical columns (object) from numerical ones (int64, float64).
df.info()

### Exploratory data analysis (EDA)

In [None]:
# Count the number of distinct values in each column (axis=0 works column-wise).
df.nunique(axis = 0)

In [None]:
# Remove, in place, the columns that are not used in the rest of the analysis.
irrelevant_columns = [
    'User uuid',
    'Likes',
    'Days passed from post',
    'Likes Score',
    'Numer of Tags',
    'Numer of Comments',
    'Year',
]
df.drop(columns=irrelevant_columns, inplace=True)

In [None]:
# Preview the first 5 rows of the DataFrame.
df.head(5)

### Missing Values

In [None]:
# Count the missing (null) values in each column.
df.isnull().sum()

In [None]:
# Count how many rows carry each distinct value of the 'Type' column.
df['Type'].value_counts()

In [None]:
# Count the number of occurrences of each value in the 'Type' column
type_counts = df['Type'].value_counts()

# Wrap the counts in a DataFrame so the chart reads from a labelled column
type_summary = pd.DataFrame({'Count': type_counts})

# List of colors (one for each bar)
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']  # Example color list

# Create an interactive bar chart with Plotly
fig = go.Figure()

# Add the bars with different colors
fig.add_trace(go.Bar(
    x=type_summary.index,
    y=type_summary['Count'],
    marker=dict(color=colors),  # Assign colors from the list
))

# Update layout to make it more interactive
fig.update_layout(
    title='Distribution of Users by Type',
    xaxis_title='Type',
    yaxis_title='Count',
    plot_bgcolor='grey',  # Set background to grey
    paper_bgcolor='grey',  # Set paper background to grey
    font=dict(color='white'),  # Set font color to white for contrast
    hovermode='closest',  # Enable hover
    xaxis=dict(tickangle=45),  # Rotate x-axis labels for better readability
    height=400,  # Adjust height of the plot
    width=600,  # Adjust width of the plot
)

# Show the interactive plot
fig.show()

In [None]:
# Convert the 'Date Posted' column to the datetime64[ns] dtype so that
# datetime-specific analysis can be performed on it.
df['Date Posted'] = df['Date Posted'].astype('datetime64[ns]')

In [None]:
# Count how many rows share each distinct 'Date Posted' timestamp.
df['Date Posted'].value_counts()

In [None]:
# Make 'Date Posted' the index of the DataFrame (in place), so rows are
# referenced by their posting timestamp; the column is moved into the index.
df.set_index('Date Posted', inplace=True)

### Outliers

In [None]:
import plotly.graph_objects as go

# Numerical columns inspected for outliers
numeric_columns = ['Month', 'Day', 'Hour', 'Minute']

# Fill colour used for each column's box
box_colors = {
    'Month': 'lightblue',
    'Day': 'lightgreen',
    'Hour': 'lightcoral',
    'Minute': 'lightsalmon'
}

# Styling shared by every box trace
shared_box_style = dict(
    boxmean='sd',                 # draw the mean and standard deviation
    line=dict(color='white'),     # box outline colour
    marker=dict(color='black'),   # outlier marker colour
    boxpoints='outliers',         # plot only the outlying points
    jitter=0.5,                   # spread the outliers slightly
    pointpos=0,                   # centre the points on the box
)

# Figure-level appearance
box_layout = dict(
    title='Box Plots for Outlier Detection',
    xaxis_title='Columns',
    yaxis_title='Values',
    plot_bgcolor='grey',
    paper_bgcolor='grey',
    font=dict(color='white'),
    height=400,
    width=600,
    showlegend=False,
)

# Build the figure: one box trace per numeric column
fig = go.Figure()
for column in numeric_columns:
    box_trace = go.Box(
        y=df[column],
        name=column,
        fillcolor=box_colors.get(column, 'lightgray'),
        **shared_box_style,
    )
    fig.add_trace(box_trace)

fig.update_layout(**box_layout)

# Render the interactive plot
fig.show()

### Descriptive Statistics

In [None]:
    "# check the size of tha DataFrame (rows and columns).\n",
    "df.shape"


In [None]:
    "# compute and interpret the mean, median, quartiles and standard deviation of the dataset.\n",
    "df.describe().round(2)"


### Normal Distribution (Kurtosis Test)

In [None]:
    "# List of columns to check for kurtosis\n",
    "columns_to_test = ['Month', 'Day', 'Hour', 'Minute']\n",
    "\n",
    "# Create subplots (1 row and 4 columns)\n",
    "fig, axes = plt.subplots(1, 4, figsize=(20, 6))\n",
    "\n",
    "# Initialize lists to store kurtosis values and column names\n",
    "kurtosis_values = []\n",
    "\n",
    "# Loop over each column to test\n",
    "for i, column in enumerate(columns_to_test):\n",
    "    # Calculate kurtosis for the current column\n",
    "    kurtosis_result = stats.kurtosis(df[column], nan_policy='omit')\n",
    "    kurtosis_values.append(kurtosis_result)\n",
    "    \n",
    "    # Output kurtosis result and interpretation\n",
    "    print(f\"Kurtosis Test for '{column}':\")\n",
    "    print(f\"Kurtosis statistic: {kurtosis_result}\")\n",
    "\n",
    "    if abs(kurtosis_result) < 0.05:  # This threshold can be adjusted\n",
    "        print(f\"The data in column '{column}' follows a normal distribution.\")\n",
    "    else:\n",
    "        print(f\"The data in column '{column}' does not follow a normal distribution.\")\n",
    "    \n",
    "    print(\"-\" * 50)\n",
    "    \n",
    "    # Plot the histogram and normal distribution curve\n",
    "    column_data = df[column]\n",
    "    \n",
    "    # Calculate mean and standard deviation for the normal distribution curve\n",
    "    mean = np.mean(column_data)\n",
    "    std_dev = np.std(column_data)\n",
    "    \n",
    "    # Create a range of values for the x-axis (within 3 standard deviations)\n",
    "    x = np.linspace(mean - 3 * std_dev, mean + 3 * std_dev, 100)\n",
    "    \n",
    "    # Calculate the normal distribution (PDF) using the mean and standard deviation\n",
    "    pdf = stats.norm.pdf(x, mean, std_dev)\n",
    "    \n",
    "    # Plot the histogram and normal distribution curve\n",
    "    axes[i].hist(column_data, bins=20, density=True, alpha=0.6, color='blue', label='Histogram')\n",
    "    axes[i].plot(x, pdf, color='red', linestyle='--', label='Normal Distribution')\n",
    "    axes[i].set_title(f'Kurtosis: {kurtosis_result:.2f}\\n{column}')\n",
    "    axes[i].set_xlabel(column)\n",
    "    axes[i].set_ylabel('Density')\n",
    "    axes[i].legend()\n",
    "\n",
    "# Adjust layout for better readability\n",
    "plt.tight_layout()\n",
    "plt.show()"


### Feature Engineering

In [None]:
# Rebuild a 'Date Posted' column from the DataFrame index, parsed as datetimes.
# (Assumes the index holds the posting timestamps.)
df['Date Posted'] = pd.to_datetime(df.index)

# Derive calendar features through the column's datetime accessor.
posted = df['Date Posted'].dt
df['Hour'] = posted.hour              # hour of the day
df['Day'] = posted.day                # day of the month
df['Month'] = posted.month            # month of the year
df['Day_of_Week'] = posted.day_name() # weekday name, e.g. Monday

# Show the first rows of the time-based columns.
time_columns = ['Month', 'Day', 'Hour', 'Minute', 'Day_of_Week']
print(df[time_columns].head())

### Trend Analysis
- Objective: analyze and determine the variations in user activity over time, focusing on hourly, daily, and monthly patterns.

#### 1- Visualize Overall Activity Trends Over Time

In [None]:
import plotly.graph_objects as go

# Set the common layout properties for all plots
layout = dict(
    plot_bgcolor='grey',  # Set background color to grey
    paper_bgcolor='grey',  # Set paper background to grey
    font=dict(color='white'),  # Set text color to white
    hovermode='closest',
    height=300,  # Adjust the height of the plot
    width=600,  # Adjust the width of the plot
)

# Calendar order for the weekday axis
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']


def _plot_activity(counts, title, xaxis_title, color, xaxis):
    """Draw, show and return a line+marker figure of post counts.

    counts      -- Series of post counts; its index supplies the x values.
    title       -- figure title.
    xaxis_title -- label of the x-axis.
    color       -- colour used for both the line and the markers.
    xaxis       -- dict of x-axis tick settings passed to update_layout.
    """
    figure = go.Figure()
    figure.add_trace(go.Scatter(
        x=counts.index,
        y=counts.values,
        mode='lines+markers',
        line=dict(color=color),
        marker=dict(size=8, color=color)
    ))
    figure.update_layout(
        title=title,
        xaxis_title=xaxis_title,
        yaxis_title='Number of Posts',
        xaxis=xaxis
    )
    figure.update_layout(layout)  # Apply common layout properties
    figure.show()
    return figure


# Activity by hour of the day; every hour from 0 to 23 is included
hourly_activity = df.groupby('Hour').size().reindex(range(0, 24), fill_value=0)
fig_hourly = _plot_activity(
    hourly_activity,
    title='Activity Distribution by Hour of the Day',
    xaxis_title='Hour of the Day',
    color='skyblue',
    xaxis=dict(tickmode='linear', tick0=0, dtick=1, tickangle=45),
)

# Activity by day of the week; days without posts count as 0 (same
# convention as the hourly series) instead of leaving a NaN gap in the line
activity_by_day = df.groupby('Day_of_Week').size().reindex(weekday_names, fill_value=0)
fig_day_of_week = _plot_activity(
    activity_by_day,
    title='Activity Distribution by Day of the Week',
    xaxis_title='Day of the Week',
    color='salmon',
    xaxis=dict(
        tickmode='array',
        tickvals=[0, 1, 2, 3, 4, 5, 6],
        ticktext=weekday_names,
        tickangle=45,
    ),
)

# Activity by month
activity_by_month = df.groupby('Month').size()
fig_monthly = _plot_activity(
    activity_by_month,
    title='Activity Distribution by Month',
    xaxis_title='Month',
    color='lightgreen',
    xaxis=dict(
        tickmode='array',
        tickvals=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],  # Months 1 to 12
        ticktext=month_names,
        tickangle=45,
    ),
)

#### 2- Identify Peak Periods of Activity using the Multivariate analysis (Correlation between Features)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Identify peak activity periods (for hour of the day and day of the week)

# Peak activity by hour (find the hour with the highest activity)
peak_hour = df.groupby('Hour').size().idxmax()
print(f"The peak hour of activity is: {peak_hour} o'clock")

# Peak activity by day of the week (find the day with the highest activity)
peak_day_of_week = df.groupby('Day_of_Week').size().idxmax()
print(f"The peak day of the week for activity is: {peak_day_of_week}")

# Post counts per (day of week, hour).
# unstack(fill_value=0) keeps the counts as integers; unstack().fillna(0)
# would turn them into floats and make fmt="d" fail in the heatmap.
# The rows are then put in calendar order instead of alphabetical order.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
activity_heatmap = (
    df.groupby(['Day_of_Week', 'Hour'])
    .size()
    .unstack(fill_value=0)
    .reindex(weekday_order, fill_value=0)
)

# Set figure size and create the heatmap
plt.figure(figsize=(10, 6))

# Change the color set for the heatmap
sns.heatmap(activity_heatmap, cmap="magma", annot=True, fmt="d", cbar_kws={'label': 'Number of Posts'})

# Titles and labels
plt.title('Heatmap of Activity by Day of the Week and Hour of the Day')
plt.xlabel('Hour of the Day')
plt.ylabel('Day of the Week')

# Show the plot
plt.show()

In [None]:
# Identify peak activity periods (for hour of the day and day of the week)
# NOTE(review): this cell repeats the previous one with a different colormap.

# Peak activity by hour (find the hour with the highest activity)
peak_hour = df.groupby('Hour').size().idxmax()
print(f"The peak hour of activity is: {peak_hour} o'clock")

# Peak activity by day of the week (find the day with the highest activity)
peak_day_of_week = df.groupby('Day_of_Week').size().idxmax()
print(f"The peak day of the week for activity is: {peak_day_of_week}")

# Post counts per (day of week, hour).
# unstack(fill_value=0) keeps the counts as integers; unstack().fillna(0)
# would turn them into floats and make fmt="d" fail in the heatmap.
# The rows are then put in calendar order instead of alphabetical order.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
activity_heatmap = (
    df.groupby(['Day_of_Week', 'Hour'])
    .size()
    .unstack(fill_value=0)
    .reindex(weekday_order, fill_value=0)
)

plt.figure(figsize=(10, 6))
sns.heatmap(activity_heatmap, cmap="YlGnBu", annot=True, fmt="d")
plt.title('Heatmap of Activity by Day of the Week and Hour of the Day')
plt.xlabel('Hour of the Day')
plt.ylabel('Day of the Week')
plt.show()

### Encode Categorical Data

In [None]:
    "# Check the shape of your DataFrame\n",
    "print(\"Original DataFrame shape:\", df.shape)\n",
    "\n",
    "# Map 'Day_of_Week' to 'Weekday' and 'Weekend'\n",
    "weekday_to_weekend = {\n",
    "    'Monday': 'Weekday', 'Tuesday': 'Weekday', 'Wednesday': 'Weekday', 'Thursday': 'Weekday', 'Friday': 'Weekday',\n",
    "    'Saturday': 'Weekend', 'Sunday': 'Weekend'\n",
    "}\n",
    "\n",
    "# Create a new column 'Day_Type' that classifies days as 'Weekday' or 'Weekend'\n",
    "df['Day_Type'] = df['Day_of_Week'].map(weekday_to_weekend)\n",
    "\n",
    "# Apply LabelEncoder to the 'Day_Type' column to get numeric encoding\n",
    "label_encoder = LabelEncoder()\n",
    "df['Day_Type_encoded'] = label_encoder.fit_transform(df['Day_Type'])\n",
    "\n",
    "# Step 6: Verify the DataFrame's shape remains the same\n",
    "print(\"New DataFrame shape:\", df.shape)\n",
    "\n",
    "# Display the first few rows to confirm the encoding\n",
    "print(df[['Day_of_Week', 'Day_Type', 'Day_Type_encoded']].head())"


### Descriptive Statistics

In [None]:
"# Compute basic statistics for weekdays and weekends\n",
    "weekday_data = df[df['Day_Type_encoded'] == 0]  # Weekdays\n",
    "weekend_data = df[df['Day_Type_encoded'] == 1]  # Weekends\n",
    "\n",
    "# Get the count of posts per day of the week for weekdays and weekends\n",
    "weekday_counts = weekday_data['Day_of_Week'].value_counts()\n",
    "weekend_counts = weekend_data['Day_of_Week'].value_counts()\n",
    "\n",
    "# Get average hour of activity for weekdays and weekends\n",
    "weekday_avg_hour = weekday_data['Hour'].mean()\n",
    "weekend_avg_hour = weekend_data['Hour'].mean()\n",
    "\n",
    "print(\"Weekday Counts:\")\n",
    "print(weekday_counts)\n",
    "\n",
    "print(\"\\nWeekend Counts:\")\n",
    "print(weekend_counts)\n",
    "\n",
    "print(f\"\\nAverage Hour for Weekdays: {weekday_avg_hour}\")\n",
    "print(f\"Average Hour for Weekends: {weekend_avg_hour}\")


### Time Series Decomposition

In [None]:
    "import statsmodels.api as sm\n",
    "\n",
    "# Convert the 'Date Posted' to datetime if it's not already\n",
    "df['Date Posted'] = pd.to_datetime(df['Date Posted'])\n",
    "\n",
    "# Set Date Posted as the index\n",
    "df.set_index('Date Posted', inplace=True)\n",
    "\n",
    "# Resample the data to daily frequency and get the count of posts\n",
    "daily_posts = df.resample('D').size()\n",
    "\n",
    "# Decompose the time series\n",
    "decomposition = sm.tsa.seasonal_decompose(daily_posts, model='additive', period=365)\n",
    "\n",
    "# Plot the decomposition\n",
    "decomposition.plot()\n",
    "plt.show()

### Visualization

In [None]:
import seaborn as sns\n",
    "\n",
    "# Boxplot to visualize the distribution of activity by Hour and Day_Type_encoded\n",
    "plt.figure(figsize=(10, 6))\n",
    "sns.boxplot(data=df, x='Day_Type_encoded', y='Hour', palette='coolwarm')\n",
    "\n",
    "# Customize plot\n",
    "plt.title('Activity Distribution by Hour and Day Type (Weekday vs Weekend)')\n",
    "plt.xlabel('Day Type')\n",
    "plt.ylabel('Hour of the Day')\n",
    "plt.xticks([0, 1], ['Weekday', 'Weekend'])\n",
    "\n",
    "# Show the plot\n",
    "plt.show()\n"


### Classification

In [None]:
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "# Features and target\n",
    "X = df[['Hour']]  # Use Hour as a feature\n",
    "y = df['Day_Type_encoded']  # Target: Weekday (0) or Weekend (1)\n",
    "\n",
   


#### 1- Prepare Data for Classification

#### 2- Split Data into Train and Test Sets

In [None]:
 "# Split the data into train and test sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
    "\n",
    "# Initialize and train the model\n",
    "model = LogisticRegression()\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "# Predict the test set\n",
    "y_pred = model.predict(X_test)\n",
    "\n",
    "# Evaluate the model\n",
    "print(classification_report(y_test, y_pred))\n"

#### 3- Build and Train the Classifier (e.g., Logistic Regression)

#### 4- Evaluate Model Performance

### Normalize the data using Min-Max scaling (which transforms the data to a range between 0 and 1)

In [None]:
"from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# Initialize the scaler\n",
    "scaler = MinMaxScaler()\n",
    "\n",
    "# Normalize the selected columns\n",
    "df[['Month', 'Day', 'Hour', 'Minute']] = scaler.fit_transform(df[['Month', 'Day', 'Hour', 'Minute']])\n",
    "\n",
    "# Print the normalized data\n",
    "print(df[['Month', 'Day', 'Hour', 'Minute']].head().round(2))


### Clustering

### Pattern Recognition

#### 1- Apply Association Rule Mining or Sequence Mining

### Reference:
https://www.kaggle.com/datasets/vasileiosmpletsos/1100-instagram-users-datetime-posts-data
