In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l- \ | done
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=e452587b6e8ba6e75060dfd8078b4441d6aea4546b699f3e9748bb942cd66a10
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder \
    .appName("Twitch Streamers Analysis") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/25 11:37:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
data_path = "/kaggle/input/top-1000-twitch-streamers-data-may-2024/datasetV2.csv"
df = spark.read.option("header", "true").csv(data_path)

                                                                                

In [5]:
df.createOrReplaceTempView("twitch_streamers")

In [6]:
df

DataFrame[RANK: string, NAME: string, LANGUAGE: string, TYPE: string, MOST_STREAMED_GAME: string, 2ND_MOST_STREAMED_GAME: string, AVERAGE_STREAM_DURATION: string, FOLLOWERS_GAINED_PER_STREAM: string, AVG_VIEWERS_PER_STREAM: string, AVG_GAMES_PER_STREAM: string, TOTAL_TIME_STREAMED: string, TOTAL_FOLLOWERS: string, TOTAL_VIEWS: string, TOTAL_GAMES_STREAMED: string, ACTIVE_DAYS_PER_WEEK: string, MOST_ACTIVE_DAY: string, DAY_WITH_MOST_FOLLOWERS_GAINED: string]

In [7]:
df.show(1)

+----+--------+--------+-----------+------------------+----------------------+-----------------------+---------------------------+----------------------+--------------------+-------------------+---------------+-----------+--------------------+--------------------+---------------+------------------------------+
|RANK|    NAME|LANGUAGE|       TYPE|MOST_STREAMED_GAME|2ND_MOST_STREAMED_GAME|AVERAGE_STREAM_DURATION|FOLLOWERS_GAINED_PER_STREAM|AVG_VIEWERS_PER_STREAM|AVG_GAMES_PER_STREAM|TOTAL_TIME_STREAMED|TOTAL_FOLLOWERS|TOTAL_VIEWS|TOTAL_GAMES_STREAMED|ACTIVE_DAYS_PER_WEEK|MOST_ACTIVE_DAY|DAY_WITH_MOST_FOLLOWERS_GAINED|
+----+--------+--------+-----------+------------------+----------------------+-----------------------+---------------------------+----------------------+--------------------+-------------------+---------------+-----------+--------------------+--------------------+---------------+------------------------------+
|   1|kaicenat| English|personality|     Just Chatting|     I'm 

In [8]:
# Calculate the average number of viewers per stream for different types of streamers

avg_viewers_query = """
    SELECT TYPE, ROUND(AVG(AVG_VIEWERS_PER_STREAM), 2) AS avg_viewers_per_stream
    FROM twitch_streamers
    GROUP BY TYPE
    ORDER BY avg_viewers_per_stream DESC
"""

In [9]:
avg_viewers_result = spark.sql(avg_viewers_query)
avg_viewers_result.show()

+-----------+----------------------+
|       TYPE|avg_viewers_per_stream|
+-----------+----------------------+
|    esports|              99411.16|
|personality|              14765.45|
+-----------+----------------------+



In [10]:
# Comparison of the popularity of streamers in various categories of games

popular_games_query = """
    SELECT MOST_STREAMED_GAME, COUNT(*) AS num_streamers
    FROM twitch_streamers
    GROUP BY MOST_STREAMED_GAME
    ORDER BY num_streamers DESC
    LIMIT 10
"""

In [11]:
popular_games_result = spark.sql(popular_games_query)
popular_games_result.show()

+------------------+-------------+
|MOST_STREAMED_GAME|num_streamers|
+------------------+-------------+
|     Just Chatting|          257|
| League of Legends|           84|
|Grand Theft Auto V|           74|
|          VALORANT|           60|
|            Casino|           36|
|            Dota 2|           35|
|          Fortnite|           35|
|    Counter-Strike|           31|
|         Minecraft|           24|
|    Virtual Casino|           23|
+------------------+-------------+



In [12]:
# Calculate the average number of subscribers per streamer depending on the language

avg_followers_per_language_query = """
    SELECT LANGUAGE, CAST(ROUND(AVG(TOTAL_FOLLOWERS)) AS INTEGER) AS avg_followers
    FROM twitch_streamers
    GROUP BY LANGUAGE
    ORDER BY avg_followers DESC
"""

In [13]:
avg_followers_per_language_result = spark.sql(avg_followers_per_language_query)
avg_followers_per_language_result.show()

+----------+-------------+
|  LANGUAGE|avg_followers|
+----------+-------------+
|   Spanish|      1776154|
|   English|      1133737|
|   Turkish|      1094714|
|Portuguese|       775177|
|    French|       766026|
|   Italian|       740931|
|    German|       735735|
|    Polish|       588500|
|    Korean|       459500|
|      Thai|       455800|
|   Russian|       442713|
|     Czech|       343333|
|  Japanese|       272054|
| Cantonese|       264500|
|   Chinese|       232975|
| Hungarian|       229000|
| Ukrainian|       205967|
|    Arabic|       160000|
|  Romanian|       141000|
+----------+-------------+



In [14]:
# Determining the most active day for streamers

most_active_day_query = """
    SELECT MOST_ACTIVE_DAY, COUNT(*) AS num_streamers
    FROM twitch_streamers
    GROUP BY MOST_ACTIVE_DAY
    ORDER BY num_streamers DESC
    LIMIT 1
"""

In [15]:
most_active_day_result = spark.sql(most_active_day_query)
most_active_day_result.show()

+---------------+-------------+
|MOST_ACTIVE_DAY|num_streamers|
+---------------+-------------+
|        Tuesday|          183|
+---------------+-------------+



In [16]:
# Analysis of time spent live on average per stream for different types of streamers

avg_stream_duration_query = """
    SELECT TYPE, ROUND(AVG(AVERAGE_STREAM_DURATION), 2) AS avg_stream_duration
    FROM twitch_streamers
    GROUP BY TYPE
    ORDER BY avg_stream_duration DESC
"""

In [17]:
avg_stream_duration_result = spark.sql(avg_stream_duration_query)
avg_stream_duration_result.show()

+-----------+-------------------+
|       TYPE|avg_stream_duration|
+-----------+-------------------+
|    esports|               7.62|
|personality|                5.9|
+-----------+-------------------+



In [18]:
spark.stop()