# TikTok Reach Analysis Project
## Data Science Analysis

**Objective:** Analyze TikTok video performance data to understand factors influencing reach and engagement.

### Research Questions:
1. **H1:** Does the time of day when a post is published significantly affect its reach?
2. **H2:** What is the relationship between likes, comments, shares and total views?
3. **H3:** Does video length correlate with higher reach and engagement?
4. **H4:** Can we accurately predict a video's reach based on temporal factors and content characteristics?

## 1. Import Libraries and Load Data

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical analysis
from scipy import stats

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
import joblib

# Configuration
# Global display settings used by every later plotting/printing cell.
# NOTE(review): the style sheet is applied before the seaborn palette;
# presumably applying a style sheet can reset the color cycle that
# set_palette configures, so keep this order unless confirmed otherwise.
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require a
# fairly recent matplotlib release — confirm the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# None is passed as the column limit so wide frames are not truncated
# when displayed (per the pandas options documentation — verify).
pd.set_option('display.max_columns', None)

print('Libraries imported successfully!')

In [None]:
# Read the raw TikTok CSV (path is relative to the notebook's directory)
df = pd.read_csv('../data/tiktok_data.csv')

# Confirmation banner, the (rows, columns) shape, and a heading for the preview
print(
    'Dataset loaded successfully!',
    f'Shape: {df.shape}',
    '\nFirst few rows:',
    sep='\n',
)
# Left as the final expression so the notebook renders the preview table
df.head()

## 2. Data Exploration and Preprocessing

In [None]:
# Dataset overview: structure, per-column missing counts, summary statistics
print('Dataset Information:', '=' * 50, sep='\n')
df.info()  # writes its report directly to stdout

print(f"\n{'=' * 50}\n\nMissing Values:")
print(df.isna().sum())

print(f"\n{'=' * 50}\n\nBasic Statistics:")
# Left as the final expression so the notebook renders the statistics table
df.describe()

In [None]:
# Parse both upload timestamp columns into pandas datetime values
for date_col in ('Upload_Date', 'Upload_DateTime'):
    df[date_col] = pd.to_datetime(df[date_col])

# Weekend indicator: 1 when Upload_Day is 'Saturday' or 'Sunday', otherwise 0
weekend_mask = df['Upload_Day'].isin(['Saturday', 'Sunday'])
df['Is_Weekend'] = weekend_mask.astype(int)

# Categorize time of day
def categorize_time(hour):
    """Return the time-of-day label for an upload hour.

    Half-open bands: [0, 6) -> 'Late Night', [6, 12) -> 'Morning',
    [12, 18) -> 'Afternoon'. Any value that falls in none of the bands
    (18 and above, negatives, NaN) is labelled 'Evening'.
    """
    bands = (
        (0, 6, 'Late Night'),
        (6, 12, 'Morning'),
        (12, 18, 'Afternoon'),
    )
    for start, stop, label in bands:
        if start <= hour < stop:
            return label
    # Fallback for everything outside the three explicit bands
    return 'Evening'

# Label every row's upload hour with its time-of-day band
df['Time_Period'] = df['Upload_Hour'].map(categorize_time)

# Categorize video length
def categorize_length(length):
    """Return the length bucket for a video length value.

    Values up to and including 30 are 'Short', values up to and including
    90 are 'Medium', and everything else (including NaN) is 'Long'.
    """
    # Checked in ascending order, so the first matching upper bound wins
    for upper_bound, label in ((30, 'Short'), (90, 'Medium')):
        if length <= upper_bound:
            return label
    return 'Long'

# Label every row's video length with its length bucket
df['Length_Category'] = df['Video_Length'].map(categorize_length)

# Report the engineered columns and the resulting frame shape
print(
    'Feature engineering completed!',
    'New features added: Is_Weekend, Time_Period, Length_Category',
    sep='\n',
)
print(f'\nUpdated dataset shape: {df.shape}')

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Histograms (with KDE overlay) of the four engagement metrics on a 2x2 grid
metrics = ['Views', 'Likes', 'Comments', 'Shares']
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Look the palette up once; each panel takes the color at its own index
husl_colors = sns.color_palette('husl')

# axes.flat walks the grid row by row, matching the order of `metrics`
for idx, (ax, metric) in enumerate(zip(axes.flat, metrics)):
    sns.histplot(df[metric], bins=50, kde=True, ax=ax, color=husl_colors[idx])
    ax.set_title(f'Distribution of {metric}', fontsize=14, fontweight='bold')
    ax.set_xlabel(metric, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)

plt.tight_layout()
# Save before show() so the figure is written while it is still active
plt.savefig(
    '../visualizations/metrics_distribution.png',
    dpi=300,
    bbox_inches='tight',
)
plt.show()

print('Engagement metrics distribution visualized!')

In [None]:
# Mean engagement per content category, rounded to whole numbers and
# ordered from the highest to the lowest average view count
category_stats = (
    df.groupby('Category')[['Views', 'Likes', 'Comments', 'Shares']]
    .mean()
    .round(0)
    .sort_values('Views', ascending=False)
)

# Interactive bar chart of average views, colored by the same value
fig = px.bar(
    category_stats.reset_index(),
    x='Category',
    y='Views',
    title='Average Views by Content Category',
    labels={'Views': 'Average Views', 'Category': 'Content Category'},
    color='Views',
    color_continuous_scale='Viridis',
)
fig.update_layout(font={'size': 12}, title_font_size=16)
fig.write_html('../visualizations/category_performance.html')
fig.show()

print('\nCategory Performance Summary:', category_stats, sep='\n')