In [1]:
# Import packages for data manipulation
import pandas as pd
import numpy as np

# Import packages for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Import packages for statistical analysis/hypothesis testing
from scipy.stats import ttest_ind

In [2]:
# Read the CSV file into the working DataFrame `df`
# (the relative path is resolved against the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="tiktok_dataset.csv")

#### 1. Data professionals use descriptive statistics for Exploratory Data Analysis. How can computing descriptive statistics help you learn more about your data in this stage of your analysis?  
- Descriptive statistics help us understand our data by summarizing central tendency, variability, and distribution shape, and by revealing outliers, differences between groups, and relationships among variables.  
- Descriptive statistics provide a foundational understanding of your dataset, revealing key insights and guiding further analysis. They help to quickly summarize and make sense of large datasets, ensuring that any subsequent modeling or hypothesis testing is based on a thorough understanding of the data's inherent characteristics.  

### Data Exploration

In [3]:
# Preview the first 10 rows of the dataset
df.head(n=10)

Unnamed: 0,#,claim_status,video_id,video_duration_sec,video_transcription_text,verified_status,author_ban_status,video_view_count,video_like_count,video_share_count,video_download_count,video_comment_count
0,1,claim,7017666017,59,someone shared with me that drone deliveries a...,not verified,under review,343296.0,19425.0,241.0,1.0,0.0
1,2,claim,4014381136,32,someone shared with me that there are more mic...,not verified,active,140877.0,77355.0,19034.0,1161.0,684.0
2,3,claim,9859838091,31,someone shared with me that american industria...,not verified,active,902185.0,97690.0,2858.0,833.0,329.0
3,4,claim,1866847991,25,someone shared with me that the metro of st. p...,not verified,active,437506.0,239954.0,34812.0,1234.0,584.0
4,5,claim,7105231098,19,someone shared with me that the number of busi...,not verified,active,56167.0,34987.0,4110.0,547.0,152.0
5,6,claim,8972200955,35,someone shared with me that gross domestic pro...,not verified,under review,336647.0,175546.0,62303.0,4293.0,1857.0
6,7,claim,4958886992,16,someone shared with me that elvis presley has ...,not verified,active,750345.0,486192.0,193911.0,8616.0,5446.0
7,8,claim,2270982263,41,someone shared with me that the best selling s...,not verified,active,547532.0,1072.0,50.0,22.0,11.0
8,9,claim,5235769692,50,someone shared with me that about half of the ...,not verified,active,24819.0,10160.0,1050.0,53.0,27.0
9,10,claim,4660861094,45,someone shared with me that it would take a 50...,verified,active,931587.0,171051.0,67739.0,4104.0,2540.0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,#,video_id,video_duration_sec,video_view_count,video_like_count,video_share_count,video_download_count,video_comment_count
count,19382.0,19382.0,19382.0,19084.0,19084.0,19084.0,19084.0,19084.0
mean,9691.5,5627454000.0,32.421732,254708.558688,84304.63603,16735.248323,1049.429627,349.312146
std,5595.245794,2536440000.0,16.229967,322893.280814,133420.546814,32036.17435,2004.299894,799.638865
min,1.0,1234959000.0,5.0,20.0,0.0,0.0,0.0,0.0
25%,4846.25,3430417000.0,18.0,4942.5,810.75,115.0,7.0,1.0
50%,9691.5,5618664000.0,32.0,9954.5,3403.5,717.0,46.0,9.0
75%,14536.75,7843960000.0,47.0,504327.0,125020.0,18222.0,1156.25,292.0
max,19382.0,9999873000.0,60.0,999817.0,657830.0,256130.0,14994.0,9599.0


In [5]:
# Count the missing entries in each column
df.isna().sum()

#                             0
claim_status                298
video_id                      0
video_duration_sec            0
video_transcription_text    298
verified_status               0
author_ban_status             0
video_view_count            298
video_like_count            298
video_share_count           298
video_download_count        298
video_comment_count         298
dtype: int64

In [6]:
# Keep only the rows in which every column is populated
df = df.dropna(axis="index", how="any")

In [7]:
# Preview the first five rows now that incomplete rows have been removed
df.head(n=5)

Unnamed: 0,#,claim_status,video_id,video_duration_sec,video_transcription_text,verified_status,author_ban_status,video_view_count,video_like_count,video_share_count,video_download_count,video_comment_count
0,1,claim,7017666017,59,someone shared with me that drone deliveries a...,not verified,under review,343296.0,19425.0,241.0,1.0,0.0
1,2,claim,4014381136,32,someone shared with me that there are more mic...,not verified,active,140877.0,77355.0,19034.0,1161.0,684.0
2,3,claim,9859838091,31,someone shared with me that american industria...,not verified,active,902185.0,97690.0,2858.0,833.0,329.0
3,4,claim,1866847991,25,someone shared with me that the metro of st. p...,not verified,active,437506.0,239954.0,34812.0,1234.0,584.0
4,5,claim,7105231098,19,someone shared with me that the number of busi...,not verified,active,56167.0,34987.0,4110.0,547.0,152.0


You are interested in the relationship between verified_status and video_view_count. One
approach is to examine the mean value of video_view_count for each group of verified_status
in the sample data.

In [8]:
# Average `video_view_count` within each `verified_status` group
df.groupby(by="verified_status")['video_view_count'].agg("mean")

verified_status
not verified    265663.785339
verified         91439.164167
Name: video_view_count, dtype: float64

### Task 3. Hypothesis testing
Recall the difference between the null hypothesis and the alternative hypotheses. What are
your hypotheses for this data project?

- Null Hypothesis: The mean video view count for verified users is equal to the mean video view count for not verified users.
- Alternative Hypothesis: The mean video view count for verified users is not equal to the mean video view count for not verified users.

You choose 5% as the significance level and proceed with a two-sample t-test.

In [20]:
# Student's two-sample t-test (equal_var=True, i.e. equal variances assumed)
# comparing mean view counts between the two verification groups

# View counts for each verification group
verified_count = df.loc[df['verified_status'].eq('verified'), 'video_view_count']
not_verified_count = df.loc[df['verified_status'].eq('not verified'), 'video_view_count']

# "not verified" is passed first, so a positive statistic means its
# sample mean is the larger of the two
t_stat, p_value = ttest_ind(a=not_verified_count, b=verified_count, equal_var=True)

print(f'T-statistic: {t_stat}', f'P-value: {p_value}', sep='\n')

T-statistic: 18.250939509545823
P-value: 8.632160883925904e-74


In [21]:
# Welch's two-sample t-test (equal_var=False, i.e. equal variances NOT assumed)
# comparing mean view counts between the two verification groups

# View counts for each verification group
verified_count = df.loc[df['verified_status'].eq('verified'), 'video_view_count']
not_verified_count = df.loc[df['verified_status'].eq('not verified'), 'video_view_count']

# "not verified" is passed first, so a positive statistic means its
# sample mean is the larger of the two
t_stat, p_value = ttest_ind(a=not_verified_count, b=verified_count, equal_var=False)

print(f'T-statistic: {t_stat}', f'P-value: {p_value}', sep='\n')

T-statistic: 25.499441780633777
P-value: 2.6088823687177823e-120


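- Both tests yield p-values far below the 5% significance level, so we reject the null hypothesis: there is a statistically significant difference in mean video view counts between verified and not verified users.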
- The choice of equal_var=True or equal_var=False affects the t-statistic and p-value, but in this case, both approaches lead to the same conclusion due to the extremely small p-values.
- Welch's t-test (equal_var=False) is more robust when the assumption of equal variances is questionable.

### Task 4. Communicate insights with stakeholders
- What business insight(s) can you draw from the result of your hypothesis test?

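The results indicate that videos posted by not verified users receive significantly more views on average than videos posted by verified users (sample means of roughly 265,664 versus 91,439 views). Verification status is therefore clearly associated with video view count, but in the opposite direction from what one might expect: in this dataset, being verified does not go hand in hand with greater visibility. The hypothesis test establishes only that the difference is statistically significant, not what causes it, so for the business the appropriate next step is to investigate what drives the gap (for example, differences in the type of content posted by each group, such as claims versus opinions) before drawing conclusions about audience reach, engagement, content monetization, or platform growth strategies.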