# YouTube Trends

In [81]:
import numpy as np
import pandas as pd
import re
import matplotlib as mpl
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.corpus import stopwords
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col, mode, mean, median
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [82]:
# spark = SparkSession.builder.appName("Youtube Trend").getOrCreate()

# Read each CSV file exactly once; the individual frames stay available
# under df1..df4.
df1 = pd.read_csv('youtube1.csv')
df2 = pd.read_csv('youtube2.csv')
df3 = pd.read_csv('youtube3.csv')
df4 = pd.read_csv('youtube4.csv')
# df5 = pd.read_csv('youtube5.csv')

# Stack the already-loaded frames instead of re-reading the files from disk;
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
print(df)

                                                 Title  \
0    Call of Duty League Major III Tournament | Cha...   
1     I Tried Every Seat on the Most Expensive Airline   
2    Inside the NBA Reacts To Timberwolves Stunning...   
3                      I protected a LEGO PRESIDENT...   
4    #3 TIMBERWOLVES at #2 NUGGETS | FULL GAME 7 HI...   
..                                                 ...   
795  Etho Plays Minecraft - Episode 586: Chaos To O...   
796                    $30,000,000 AI Is Hiding a Scam   
797  A Boogie Wit da Hoodie - Body (feat. Cash Coba...   
798                           AU$TRALIA: A Travel Game   
799                             DRAKE - FAMILY MATTERS   

                                           Description  \
0    Welcome to the #CDL2024 Major III Tournament! ...   
1    I owed a guy a favor\nsign up for shopify for ...   
2    Watch highlights from Inside the NBA with Shaq...   
3    Submit a banger video idea here: https://forms...   
4    Never mi

In [83]:
# Number of missing values in each column (computed once and reused below)
null_counts_by_column = df.isna().sum()

# Null count for the entire dataset: the sum of the per-column counts
print(null_counts_by_column.sum())

# Null count for each specific column
print(null_counts_by_column)

25
Title           0
Description    25
Thumbnail       0
dtype: int64


In [84]:
# Delete every row that contains a null and re-index the remaining rows from 0.
# drop=True discards the old index directly, so no temporary 'index' column is
# created and then removed (which would also fail if a column named 'index'
# already existed).
df = df.dropna().reset_index(drop=True)
print(df)

                                                 Title  \
0    Call of Duty League Major III Tournament | Cha...   
1     I Tried Every Seat on the Most Expensive Airline   
2    Inside the NBA Reacts To Timberwolves Stunning...   
3                      I protected a LEGO PRESIDENT...   
4    #3 TIMBERWOLVES at #2 NUGGETS | FULL GAME 7 HI...   
..                                                 ...   
770       SIDEMEN AMONG US JESTER ROLE: JYNXZI EDITION   
771  Etho Plays Minecraft - Episode 586: Chaos To O...   
772                    $30,000,000 AI Is Hiding a Scam   
773  A Boogie Wit da Hoodie - Body (feat. Cash Coba...   
774                           AU$TRALIA: A Travel Game   

                                           Description  \
0    Welcome to the #CDL2024 Major III Tournament! ...   
1    I owed a guy a favor\nsign up for shopify for ...   
2    Watch highlights from Inside the NBA with Shaq...   
3    Submit a banger video idea here: https://forms...   
4    Never mi

In [85]:
def create_title_sentence(row):
    """Build a "Title: ..." sentence from the 'Title' entry of a row.

    Each occurrence of one of the characters | , ; : . ! ? - in the title is
    replaced by a single space. Existing whitespace is not collapsed, so the
    result may contain runs of spaces. A null title produces "Title: ".

    Args:
        row: Object indexable by 'Title' (for example a DataFrame row).

    Returns:
        str: The text "Title: " followed by the cleaned title.
    """
    raw_title = row['Title']
    # Substituting every separator character with a space gives the same text
    # as splitting on the separators and joining the pieces with spaces.
    cleaned_title = "" if pd.isnull(raw_title) else re.sub('[|,;:.!?-]', ' ', raw_title)
    return f"Title: {cleaned_title}"

# Run create_title_sentence on every row (axis=1) and store the result in a
# new 'title_sentences' column of df
title_sentence_column = df.apply(create_title_sentence, axis=1)
df['title_sentences'] = title_sentence_column
print(df['title_sentences'])

0      Title: Call of Duty League Major III Tournamen...
1      Title: I Tried Every Seat on the Most Expensiv...
2      Title: Inside the NBA Reacts To Timberwolves S...
3                 Title: I protected a LEGO PRESIDENT   
4      Title: #3 TIMBERWOLVES at #2 NUGGETS   FULL GA...
                             ...                        
770    Title: SIDEMEN AMONG US JESTER ROLE  JYNXZI ED...
771    Title: Etho Plays Minecraft   Episode 586  Cha...
772               Title: $30 000 000 AI Is Hiding a Scam
773    Title: A Boogie Wit da Hoodie   Body (feat  Ca...
774                      Title: AU$TRALIA  A Travel Game
Name: title_sentences, Length: 775, dtype: object


In [87]:
# TF-IDF encode the raw video titles, ignoring NLTK's English stop words.
# NOTE(review): this vectorises df['Title'], not the 'title_sentences' column
# built earlier - confirm which one is intended.
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(stop_words=english_stop_words)
tfidf_matrix = vectorizer.fit_transform(df['Title'])

# Pairwise dot products between all title vectors. TfidfVectorizer L2-normalises
# rows by default, so these dot products are cosine similarities.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)