In [1]:
%reload_ext lab_black

In [2]:
import pandas as pd
from nltk import tokenize
import datetime
import matplotlib.pylab as plt
import yfinance as yf
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from numba import jit, prange


# Authors: Christopher Oblak,
# Date Initialized: 23FEB2021
# Project: YouTube Content-Based Stock Predictor
# Task: Input, cleaning, and feature building of youtube data that has been continually collected.

# The following code is designed to be run from the bash terminal with a file name.  It will ingest a given data file (in this case a gzip-compressed ".gz" file),
# and re-save the cleaned data with additional features built in: cleaned captions and titles, and sentiment scores.

In [3]:
# Input data: read the raw caption CSV into a DataFrame.
# The file is gzip-compressed; pandas infers the compression from the ".gz" extension.
nvidia_df = pd.read_csv("../00_data/nvidia_caption_data_21FEB.gz")

In [4]:
# Cleaning step 1: fix the issue where part of the data was appended incorrectly.
# Rows from this position onward are treated as sitting under the wrong column
# labels and are relabelled below.
# NOTE(review): the boundary is specific to this data file -- confirm it before
# reusing this cell on a different export.
MISALIGNED_START_ROW = 16268
correct_df = nvidia_df.iloc[:MISALIGNED_START_ROW].copy()
incorrect_df = nvidia_df.iloc[MISALIGNED_START_ROW:].copy()

# Relabel the incorrect columns so they can be appended back correctly.
# Each label moves one position over; the original "captionString" column is
# marked "drop" because it is discarded right after.
incorrect_df = incorrect_df.rename(
    columns={
        "captionString": "drop",
        "Unnamed: 0": "Index",
        "Unnamed: 0.1": "videoID",
        "videoID": "datePub",
        "datePub": "searchedDate",
        "searchedDate": "VideoTitle",
        "VideoTitle": "channelTitle",
        "channelTitle": "viewCount",
        "viewCount": "likeCount",
        "likeCount": "dislikeCount",
        "dislikeCount": "captionString",
    }
)

# Drop the excess column created by the AWS appending.
incorrect_df = incorrect_df.drop(columns=["drop"])

# Relabel columns of the correct rows to match the repaired rows.
correct_df = correct_df.rename(columns={"Unnamed: 0.1": "Index"})
correct_df = correct_df.drop(columns=["Unnamed: 0"])

# Create the new NVIDIA data frame to begin cleaning and working with.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; the original row labels are kept, as append did.
nvidia_df_1 = pd.concat([correct_df, incorrect_df], sort=False)

# Initialize the correct data type for each column.
# NOTE(review): the raw sample shows datePub values like "2020-10-01T05:16:02Z"
# and searchedDate values with fractional seconds, which do not literally match
# this format string -- newer pandas versions may parse more strictly; confirm.
nvidia_df_1["VideoTitle"] = nvidia_df_1["VideoTitle"].astype("str")
nvidia_df_1["datePub"] = pd.to_datetime(
    nvidia_df_1["datePub"], format="%Y-%m-%d %H:%M:%S", utc=True
)
nvidia_df_1["searchedDate"] = pd.to_datetime(
    nvidia_df_1["searchedDate"], format="%Y-%m-%d %H:%M:%S", utc=True
)
nvidia_df_1["channelTitle"] = nvidia_df_1["channelTitle"].astype("str")
nvidia_df_1["viewCount"] = nvidia_df_1["viewCount"].astype("int")
nvidia_df_1["likeCount"] = nvidia_df_1["likeCount"].astype("int")
nvidia_df_1["dislikeCount"] = nvidia_df_1["dislikeCount"].astype("int")
# Cast captions to str so that clean() always receives a string.
nvidia_df_1["captionString"] = nvidia_df_1["captionString"].astype("str")

# define a cleaning function to clean and standardize caption strings
def clean(raw):
    """Strip bracketed tags, anchor links, and leftover HTML markup from text.

    The substitutions run in a fixed order:

    * square-bracketed segments that contain no nested brackets are removed;
    * ``<a ...>...</a>`` elements are replaced by the text ``Link.``;
    * the entities ``&#x27;`` and ``&quot;`` become ``'`` and ``"``;
    * ``&#x2F;``, ``<p>`` and ``<i>`` become a single space;
    * ``&gt;``, ``&#62;``, ``</i>``, newlines and runs of five spaces are dropped.

    Parameters
    ----------
    raw : str
        Text to normalize.

    Returns
    -------
    str
        The text after all substitutions have been applied.
    """
    # (pattern, replacement) pairs; order matters, e.g. "</i>" is handled before "<i>".
    substitutions = (
        ("\\[[^][]*]", ""),
        ("<[a][^>]*>(.+?)</[a]>", "Link."),
        ("&gt;", ""),
        ("&#x27;", "'"),
        ("&quot;", '"'),
        ("&#x2F;", " "),
        ("<p>", " "),
        ("</i>", ""),
        ("&#62;", ""),
        ("<i>", " "),
        ("\n", ""),
        ("     ", ""),
    )
    text = raw
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


# Run both free-text columns through clean(), captions first and then titles.
for text_column in ("captionString", "VideoTitle"):
    nvidia_df_1[text_column] = nvidia_df_1[text_column].apply(clean)

In [7]:
# Build a separate frame so nvidia_df_1 is left untouched, re-running clean() on the titles.
# NOTE(review): VideoTitle was already passed through clean() in the previous cell;
# this second pass looks redundant -- confirm it is intentional.
nvidia_df_2 = nvidia_df_1.assign(VideoTitle=nvidia_df_1["VideoTitle"].apply(clean))

In [14]:
# Spot-check the first few cleaned rows for a single channel.
jayz_rows = nvidia_df_2["channelTitle"] == "JayzTwoCents"
nvidia_df_2.loc[jayz_rows].head()

Unnamed: 0,Index,videoID,datePub,searchedDate,VideoTitle,channelTitle,viewCount,likeCount,dislikeCount,captionString
3,3,nMns8t4OhI0,2020-09-30 23:00:06+00:00,2020-10-02 01:45:23.922084+00:00,NVIDIA Responds to RTX 3080 Crashes...,JayzTwoCents,561238,27766,885,
9,9,zxJDdRgs7Lw,2020-09-10 13:00:28+00:00,2020-10-02 01:45:28.639529+00:00,We finally have our hands on the NVIDIA RTX 3080!,JayzTwoCents,2349784,104987,4459,well today's the day out with the old and and ...
22,8,zxJDdRgs7Lw,2020-09-10 13:00:28+00:00,2020-10-02 17:20:46.083212+00:00,We finally have our hands on the NVIDIA RTX 3080!,JayzTwoCents,2350121,104992,4459,well today's the day out with the old and and ...
46,7,zxJDdRgs7Lw,2020-09-10 13:00:28+00:00,2020-10-02 18:00:14.745652+00:00,We finally have our hands on the NVIDIA RTX 3080!,JayzTwoCents,2350136,104992,4459,well today's the day out with the old and and ...
71,7,zxJDdRgs7Lw,2020-09-10 13:00:28+00:00,2020-10-02 18:03:09.902661+00:00,We finally have our hands on the NVIDIA RTX 3080!,JayzTwoCents,2350136,104992,4459,well today's the day out with the old and and ...


In [13]:
# Peek at the first five rows of the frame as it was read from disk, for comparison.
nvidia_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,videoID,datePub,searchedDate,VideoTitle,channelTitle,viewCount,likeCount,dislikeCount,captionString
0,0,0,y4fb_R5Ogw0,2020-10-01T05:16:02Z,2020-10-02 01:45:21.553469,"2x NVIDIA RTX 3090 SLI Benchmarks: 500FPS, 700...",Gamers Nexus,367472,15701,260,so this setup that you're seeing is what we ha...
1,1,1,ALEXVtnNEwA,2020-09-01T17:15:57Z,2020-10-02 01:45:22.351801,NVIDIA GeForce Special Event Livestream,IGN,543115,16860,411,b [Music] [Music] [Applause] [Music] [Music] w...
2,2,2,E98hC9e__Xs,2020-09-01T16:42:07Z,2020-10-02 01:45:23.252012,NVIDIA GeForce RTX 30 Series | Official Launch...,NVIDIA GeForce,1484075,54741,1135,Welcome to my kitchen. I hope all of you are...
3,3,3,nMns8t4OhI0,2020-09-30T23:00:06Z,2020-10-02 01:45:23.922084,NVIDIA Responds to RTX 3080 Crashes...,JayzTwoCents,561238,27766,885,[]
4,4,4,AG_ZHi3tuyk,2020-09-16T12:59:59Z,2020-10-02 01:45:24.592244,"Nvidia, you PROMISED! - RTX 3080 Review",Linus Tech Tips,3391565,141852,3961,- The RTX 3080 is the most powerful GPU we've ...


In [15]:
# Write the cleaned frame to disk as a gzip-compressed CSV.
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default; pass index=False if that column is not wanted when the file is re-read.
nvidia_df_2.to_csv("../00_data/sentiment_test.gz", compression="gzip")

## The below code is to be copied to a separate .py file for seamless data cleaning and execution.

In [None]:
import pandas as pd
from nltk import tokenize
from datetime import datetime
import matplotlib.pylab as plt
import yfinance as yf
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from numba import jit, prange

# define a cleaning function to clean and standardize caption strings
def clean(raw):
    """Strip bracketed tags, anchor links, and leftover HTML markup from text.

    The substitutions run in a fixed order:

    * square-bracketed segments that contain no nested brackets are removed;
    * ``<a ...>...</a>`` elements are replaced by the text ``Link.``;
    * the entities ``&#x27;`` and ``&quot;`` become ``'`` and ``"``;
    * ``&#x2F;``, ``<p>`` and ``<i>`` become a single space;
    * ``&gt;``, ``&#62;``, ``</i>``, newlines and runs of five spaces are dropped.

    Parameters
    ----------
    raw : str
        Text to normalize.

    Returns
    -------
    str
        The text after all substitutions have been applied.
    """
    # (pattern, replacement) pairs; order matters, e.g. "</i>" is handled before "<i>".
    substitutions = (
        ("\\[[^][]*]", ""),
        ("<[a][^>]*>(.+?)</[a]>", "Link."),
        ("&gt;", ""),
        ("&#x27;", "'"),
        ("&quot;", '"'),
        ("&#x2F;", " "),
        ("<p>", " "),
        ("</i>", ""),
        ("&#62;", ""),
        ("<i>", " "),
        ("\n", ""),
        ("     ", ""),
    )
    text = raw
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_nvidia_df(df, split_row=16268):
    """Load the raw caption CSV, repair the mislabelled rows, and clean the text.

    Parameters
    ----------
    df : str, path-like, or file-like
        Location of the CSV file to read; it is passed straight to
        ``pd.read_csv``. Despite the name, this is not a DataFrame.
    split_row : int, default 16268
        Position of the first row whose values sit under the wrong column
        labels. Rows before it only have their index columns tidied; rows from
        it onward are relabelled.
        NOTE(review): the default is specific to the original data file --
        confirm it before using a different export.

    Returns
    -------
    pandas.DataFrame
        The correct and relabelled rows combined (original row labels kept),
        with ``datePub``/``searchedDate`` parsed as UTC datetimes, the count
        columns cast to int, and ``captionString``/``VideoTitle`` passed
        through ``clean``.
    """
    # input data
    nvidia_df = pd.read_csv(df)

    # Cleaning step 1: fix the issue where the data was appended incorrectly.
    correct_df = nvidia_df.iloc[:split_row]
    incorrect_df = nvidia_df.iloc[split_row:]

    # Relabel the incorrect columns so they can be appended back correctly.
    # The original "captionString" column is marked "drop" and removed: it is
    # the excess column created by the AWS appending.
    incorrect_df = incorrect_df.rename(
        columns={
            "captionString": "drop",
            "Unnamed: 0": "Index",
            "Unnamed: 0.1": "videoID",
            "videoID": "datePub",
            "datePub": "searchedDate",
            "searchedDate": "VideoTitle",
            "VideoTitle": "channelTitle",
            "channelTitle": "viewCount",
            "viewCount": "likeCount",
            "likeCount": "dislikeCount",
            "dislikeCount": "captionString",
        }
    ).drop(columns=["drop"])

    # Relabel columns of the correct rows to match the repaired rows.
    correct_df = correct_df.rename(columns={"Unnamed: 0.1": "Index"}).drop(
        columns=["Unnamed: 0"]
    )

    # Create the new NVIDIA data frame to begin cleaning and working with.
    # pd.concat is used because DataFrame.append was deprecated in pandas 1.4
    # and removed in pandas 2.0.
    nvidia_df_1 = pd.concat([correct_df, incorrect_df], sort=False)

    # Initialize the correct data type for each column.
    # NOTE(review): confirm the date format string against the raw values;
    # newer pandas versions may parse more strictly.
    nvidia_df_1["VideoTitle"] = nvidia_df_1["VideoTitle"].astype("str")
    nvidia_df_1["datePub"] = pd.to_datetime(
        nvidia_df_1["datePub"], format="%Y-%m-%d %H:%M:%S", utc=True
    )
    nvidia_df_1["searchedDate"] = pd.to_datetime(
        nvidia_df_1["searchedDate"], format="%Y-%m-%d %H:%M:%S", utc=True
    )
    nvidia_df_1["channelTitle"] = nvidia_df_1["channelTitle"].astype("str")
    nvidia_df_1["viewCount"] = nvidia_df_1["viewCount"].astype("int")
    nvidia_df_1["likeCount"] = nvidia_df_1["likeCount"].astype("int")
    nvidia_df_1["dislikeCount"] = nvidia_df_1["dislikeCount"].astype("int")
    # Cast captions to str before cleaning: a missing caption is read as a
    # float NaN, which would make re.sub inside clean() raise TypeError.
    nvidia_df_1["captionString"] = nvidia_df_1["captionString"].astype("str")

    nvidia_df_1["captionString"] = nvidia_df_1["captionString"].apply(clean)
    nvidia_df_1["VideoTitle"] = nvidia_df_1["VideoTitle"].apply(clean)

    return nvidia_df_1
    
def main(df="../00_data/nvidia_caption_data_21FEB.gz"):
    """Clean the caption file at ``df``, print a preview, and return the result.

    Parameters
    ----------
    df : str
        Path of the CSV file to clean; forwarded to ``clean_nvidia_df``.
        Despite the name, this is a path and not a DataFrame.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame produced by ``clean_nvidia_df``. It is returned so
        that callers can keep working with it rather than only seeing the
        printed preview.
    """
    cleaned_nvidia_df_prefeatures = clean_nvidia_df(df)
    print(cleaned_nvidia_df_prefeatures.head())
    return cleaned_nvidia_df_prefeatures

if __name__ == "__main__":
    import sys

    # The command-line argument is a file path (a string), so it is handed to
    # main() directly; main() prints the preview of the cleaned frame itself.
    # With no argument, fall back to main()'s default data file instead of
    # failing with IndexError.
    if len(sys.argv) > 1:
        main(str(sys.argv[1]))
    else:
        main()