In [None]:
import json
import re

import pandas as pd
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Download NLTK resources
import nltk

# 'punkt_tab' is the tokenizer data that word_tokenize loads on newer NLTK
# releases; 'punkt' is kept so older releases keep working as before.
nltk_resources = ['punkt', 'punkt_tab', 'stopwords', 'wordnet']

# nltk.download returns a bool per resource; collect them so a failed
# download is visible in the cell output instead of being ignored.
download_ok = [nltk.download(resource) for resource in nltk_resources]
all(download_ok)

[nltk_data] Downloading package punkt to /Users/xiang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/xiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/xiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load JSON data from files
file_names = ['data.json', 'data_1.json', 'data_2.json', 'data_3.json', 'data_4.json', 'data_5.json', 'data_6.json'] # Games, Art, Sports

# One parsed JSON document per input file, in the order listed above.
json_data = []
for file_name in file_names:
    # Decode explicitly as UTF-8: the descriptions contain non-ASCII
    # characters, and the platform default encoding is not UTF-8 everywhere.
    with open(file_name, 'r', encoding='utf-8') as file:
        json_data.append(json.load(file))


In [None]:
# Create lists to store unprocessed data
# langdetect is non-deterministic unless seeded; fix the seed so that the set
# of rows kept is the same on every run.
DetectorFactory.seed = 0


def _is_english(text):
    """Return True when langdetect classifies ``text`` as English ('en')."""
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Detection failed for this text; treat it as "not English" so the
        # video is skipped, without hiding unrelated errors or Ctrl-C.
        return False


categories = []
titles = []
descriptions = []

for data in json_data:
    for category, videos in data.items():
        pairs = zip(videos.get('titles', []), videos.get('descriptions', []))
        for title, description in pairs:
            # Skip null / non-string entries instead of crashing on .strip()
            if not (isinstance(title, str) and isinstance(description, str)):
                continue
            # Skip entries whose title or description is blank
            if not (title.strip() and description.strip()):
                continue
            # Keep the video only if both title and description are in English
            if _is_english(title) and _is_english(description):
                categories.append(category)
                titles.append(title)
                descriptions.append(description)

In [None]:
# Assemble the three parallel lists into a single unprocessed DataFrame,
# one column per list.
raw_columns = {
    'category': categories,
    'title': titles,
    'description': descriptions,
}
df = pd.DataFrame(raw_columns)
df

Unnamed: 0,category,title,description
0,Games,Naruto Shippuden Ultimate Ninja Storm 3 - Anbu...,Naruto Shippuden Ultimate Ninja Storm 3 - Anbu...
1,Games,"ICT SCHOOL , monster ict , monster ,Monsters, ...","Monsters, Inc"
2,Games,Sonic the Hedgehog 2006- Flame Core (Cavern) w...,"At the second part of Flame Core, I managed to..."
3,Games,SporeHero Walkthrough part 47,I know that it isn't the best quality out ther...
4,Games,MW3: Ten Hidden Teddy Bear Locations- Easter Egg,Today's video shows the locations of the Teddy...
...,...,...,...
29682,Arts & Entertainment,Samsung Galaxy S4: 14 things to Deal With Over...,Here are 14 things you can do when your Samsun...
29683,Arts & Entertainment,Speed Drawing - mickey mouse (how to draw Mic...,ⓈⓊⒷⓈⒸⓇⒾⒷⒺ my channel\nIf you are a business/co...
29684,Arts & Entertainment,How To Get A Colored Clan Tag Without 13th Pre...,http://youtube.com/dcskill told itz fatal modz...
29685,Arts & Entertainment,WoT: KV-2 The King of Derp,Rockin the KV-2's Derp Gun.


In [None]:
# Define preprocessing functions
_NLP_RESOURCES = {}  # lazily filled cache: English stop words + lemmatizer


def _get_nlp_resources():
    """Return ``(stop_words, lemmatizer)``, building both on first use only."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize a piece of text into a space-separated string of lemmas.

    Steps, in order: lowercase, replace line breaks with spaces, strip URLs,
    replace every non-letter character with a space, tokenize, drop English
    stop words, lemmatize, and join the tokens with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty when no token survives.
    """
    stop_words, lemmatizer = _get_nlp_resources()

    # Lowercasing
    text = text.lower()
    # Turn line breaks into spaces so they act as plain separators
    text = re.sub(r'[\n\r]', ' ', text)
    # Remove URLs starting with http or www followed by one of . ; : , - _
    # The hyphen is escaped: unescaped, ",-_" is a character *range* that also
    # covers digits and (with IGNORECASE) every letter, so ordinary words
    # containing "www" were deleted as if they were URLs.
    text = re.sub(r'http\S+|www[.;:,\-_]\S+', ' ', text, flags=re.IGNORECASE)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stop words, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back into text
    return ' '.join(tokens)

In [None]:
# Run the text-cleaning function over both free-text columns
for text_column in ('title', 'description'):
    df[text_column] = df[text_column].apply(preprocess_text)
df

Unnamed: 0,category,title,description
0,Games,naruto shippuden ultimate ninja storm anbu ita...,naruto shippuden ultimate ninja storm anbu ita...
1,Games,ict school monster ict monster monster inc toy,monster inc
2,Games,sonic hedgehog flame core cavern knuckle,second part flame core managed finish level kn...
3,Games,sporehero walkthrough part,know best quality recorded alexisspawn house s...
4,Games,mw ten hidden teddy bear location easter egg,today video show location teddy bear need spec...
...,...,...,...
29682,Arts & Entertainment,samsung galaxy thing deal overheated phone get...,thing samsung galaxy getting hot need cool sam...
29683,Arts & Entertainment,speed drawing mickey mouse draw mickey mouse,channel business company would like contact re...
29684,Arts & Entertainment,get colored clan tag without th prestige tutor...,told itz fatal modz told sub go xbox live lobb...
29685,Arts & Entertainment,wot kv king derp,rockin kv derp gun


In [None]:
# Remove duplicate rows first, then any row that still contains a NaN
df = df.drop_duplicates().dropna()
df

Unnamed: 0,category,title,description
0,Games,naruto shippuden ultimate ninja storm anbu ita...,naruto shippuden ultimate ninja storm anbu ita...
1,Games,ict school monster ict monster monster inc toy,monster inc
2,Games,sonic hedgehog flame core cavern knuckle,second part flame core managed finish level kn...
3,Games,sporehero walkthrough part,know best quality recorded alexisspawn house s...
4,Games,mw ten hidden teddy bear location easter egg,today video show location teddy bear need spec...
...,...,...,...
29682,Arts & Entertainment,samsung galaxy thing deal overheated phone get...,thing samsung galaxy getting hot need cool sam...
29683,Arts & Entertainment,speed drawing mickey mouse draw mickey mouse,channel business company would like contact re...
29684,Arts & Entertainment,get colored clan tag without th prestige tutor...,told itz fatal modz told sub go xbox live lobb...
29685,Arts & Entertainment,wot kv king derp,rockin kv derp gun


In [None]:
# Boolean mask: True for rows in which no column holds an empty string
mask = df.ne('').all(axis=1)

# Keep only the rows the mask marks as complete
df_cleaned = df.loc[mask]
df_cleaned

Unnamed: 0,category,title,description
0,Games,naruto shippuden ultimate ninja storm anbu ita...,naruto shippuden ultimate ninja storm anbu ita...
1,Games,ict school monster ict monster monster inc toy,monster inc
2,Games,sonic hedgehog flame core cavern knuckle,second part flame core managed finish level kn...
3,Games,sporehero walkthrough part,know best quality recorded alexisspawn house s...
4,Games,mw ten hidden teddy bear location easter egg,today video show location teddy bear need spec...
...,...,...,...
29682,Arts & Entertainment,samsung galaxy thing deal overheated phone get...,thing samsung galaxy getting hot need cool sam...
29683,Arts & Entertainment,speed drawing mickey mouse draw mickey mouse,channel business company would like contact re...
29684,Arts & Entertainment,get colored clan tag without th prestige tutor...,told itz fatal modz told sub go xbox live lobb...
29685,Arts & Entertainment,wot kv king derp,rockin kv derp gun


In [None]:
# Persist the cleaned frame to disk, leaving the row index out of the file
df_cleaned.to_csv(path_or_buf='preprocessed_data.csv', index=False)

In [None]:
# Tally how many rows belong to each category
category_counts = df_cleaned['category'].value_counts()

# Show the heading and the tally, one per line
print("Number of data for each category:", category_counts, sep="\n")


Number of data for each category:
category
Games                   10761
Arts & Entertainment    10248
Sports                   8622
Name: count, dtype: int64


In [None]:
# Reload the saved CSV. keep_default_na=False stops pandas from converting
# text cells that happen to spell an NA marker (e.g. a title cleaned down to
# "nan" or "null") into NaN, which would make the later dropna() silently
# discard those rows.
data_final = pd.read_csv("preprocessed_data.csv", keep_default_na=False)
data_final

Unnamed: 0,category,title,description
0,Games,naruto shippuden ultimate ninja storm anbu ita...,naruto shippuden ultimate ninja storm anbu ita...
1,Games,ict school monster ict monster monster inc toy,monster inc
2,Games,sonic hedgehog flame core cavern knuckle,second part flame core managed finish level kn...
3,Games,sporehero walkthrough part,know best quality recorded alexisspawn house s...
4,Games,mw ten hidden teddy bear location easter egg,today video show location teddy bear need spec...
...,...,...,...
29626,Arts & Entertainment,samsung galaxy thing deal overheated phone get...,thing samsung galaxy getting hot need cool sam...
29627,Arts & Entertainment,speed drawing mickey mouse draw mickey mouse,channel business company would like contact re...
29628,Arts & Entertainment,get colored clan tag without th prestige tutor...,told itz fatal modz told sub go xbox live lobb...
29629,Arts & Entertainment,wot kv king derp,rockin kv derp gun


In [None]:
# Remove duplicate rows from the reloaded data, then rows containing NaN
data_final = data_final.drop_duplicates().dropna()
data_final

Unnamed: 0,category,title,description
0,Games,naruto shippuden ultimate ninja storm anbu ita...,naruto shippuden ultimate ninja storm anbu ita...
1,Games,ict school monster ict monster monster inc toy,monster inc
2,Games,sonic hedgehog flame core cavern knuckle,second part flame core managed finish level kn...
3,Games,sporehero walkthrough part,know best quality recorded alexisspawn house s...
4,Games,mw ten hidden teddy bear location easter egg,today video show location teddy bear need spec...
...,...,...,...
29626,Arts & Entertainment,samsung galaxy thing deal overheated phone get...,thing samsung galaxy getting hot need cool sam...
29627,Arts & Entertainment,speed drawing mickey mouse draw mickey mouse,channel business company would like contact re...
29628,Arts & Entertainment,get colored clan tag without th prestige tutor...,told itz fatal modz told sub go xbox live lobb...
29629,Arts & Entertainment,wot kv king derp,rockin kv derp gun


In [None]:
# Sanity check: list any row that still has an empty string in some column
mask = (data_final == '').any(axis=1)
data_final.loc[mask]

Unnamed: 0,category,title,description


In [None]:
# Sanity check: list any row that still has a missing value in some column
mask = data_final.isna().any(axis=1)
data_final.loc[mask]

Unnamed: 0,category,title,description
