# Create A YouTube Name

## Scrape From Social Blade

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

In [2]:
bad_url = r"https://socialblade.com/youtube/top/trending/bottom-500-channels-30-days/most-unsubscribed"
good_url = r"https://socialblade.com/youtube/top/trending/top-500-channels-30-days/most-subscribed"

In [3]:
def search_style(div, style_dict):
    """Search a HTML String for a given style"""
    div_style = div.get_attribute_list("style")[0]
    if div_style:
        style_pair_list = [[j.strip() for j in i.strip().split(":")] for i in div_style.split(";") if i.strip()]
        c_div_style_dict = {pair[0]:pair[1] for pair in style_pair_list}
        for k,v in style_dict.items():
            if k not in c_div_style_dict:
                return False
            if v != c_div_style_dict[k]:
                return False
        return True
    return False

In [4]:
def get_details(row):
    """Pass row of HTML from SocialBlade Table, return Tuple of Name, Subs, Views"""
    name = row.find_all("a")[0].string
    stats = [c_div for c_div in row.find_all("div") if search_style(c_div, {"width":"150px"})]
    subs, views = [int(c_stat.text.strip().replace(",","").replace("--", "0")) for c_stat in stats]
    return name, subs, views

In [5]:
def get_socialblade(url):
    """Pass SpcialBlade URL, return pandas DataFrame of data"""
    # TargetStyle is the normal width of the SocialBlade data table
    target_style = {"width":"860px"}
    
    results = requests.get(url)
    content = results.content
    soup = bs(content, "html.parser")
    divs = soup.find_all("div")
    rows = [c_div for c_div in divs if search_style(c_div, target_style)][2:]
    table_data = [get_details(c_row) for c_row in rows]
    return pd.DataFrame(table_data, columns = ["Name", "Subs", "Views"])

## Prepare Names for NN

In [6]:
import numpy as np

In [7]:
def encode(string, valid_chars = None, max_len=36):
    """Convert a string into a matrix of one-hot encoded character vectors"""
    if not valid_chars:
        letters = "abcdefghijklmnopqrstuvwxyz"
        letters += letters.upper()
        special_chars = r" !@#$%^&*()_+-={}[]:;<,>.?/\`~'" + '"'
        numbers = "".join([str(i) for i in range(10)])
        valid_chars = letters + special_chars + numbers
    valid_char_ct = len(valid_chars)
    
    output = []
    for i in string:
        c_letter = np.zeros(valid_char_ct)
        c_letter[valid_chars.index(i)] = 1
        output.append(c_letter)
    while len(output)<max_len:
        output.append(np.zeros(valid_char_ct))
    return np.array(output)

In [8]:
def decode(input_matrix, valid_chars = None):
    """Convert a matrix of one-hot encoded character vectors into a string"""
    if not valid_chars:
        letters = "abcdefghijklmnopqrstuvwxyz"
        letters += letters.upper()
        special_chars = r" !@#$%^&*()_+-={}[]:;<,>.?/\`~'" + '"'
        numbers = "".join([str(i) for i in range(10)])
        valid_chars = letters + special_chars + numbers
    output = ""
    for i in input_matrix:
        try:
            index = np.where(i==1)[0][0]
        except IndexError:
            return output
        output += valid_chars[index]
    return output

In [9]:
def valid_chars():
    letters = "abcdefghijklmnopqrstuvwxyz"
    letters += letters.upper()
    special_chars = r" !@#$%^&*()_+-={}[]:;<,>.?/\`~'" + '"'
    numbers = "".join([str(i) for i in range(10)])
    return letters + special_chars + numbers

# Train Regression Model

In [11]:
import keras

Using TensorFlow backend.


In [12]:
import random

In [None]:
class myData:
    valid_chars = valid_chars()
    def __init__(self, urls = [], train_pct=.8, filter_df = True):
        for index, url in enumerate(urls):
            if index == 0:
                self.data = get_socialblade(url)
            else:
                self.data = pd.concat([self.data, get_socialblade(url)])
        if filter_df:
            self.data = self.data[[all([i in self.valid_chars for i in name]) for name in self.data["Name"]]]
        self.data["encoded"] = self.data.apply(lambda row: encode(row["Name"]), axis=1)
        self.train = self.data.sample(frac=train_pct)
        self.test = self.data[[i not in self.train.index for i in self.data.index]]

In [87]:
class DataGenerator(keras.utils.Sequence):
    active_batch_gen = None
    
    def __init__(self, data, input_col, label_col, batch_size=32):
        self.data = data
        self.batch_size = batch_size
        self.label_col = label_col
        self.on_epoch_end()
        self.indexes = [i for i in range(len(self.data))]
        
    def __len_(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.data) / self.batch_size))
        
    def _batch_gen(self):
        while True:
            self.on_epoch_end()
            for index, i in enumerate(index_lst):
                yield self.train.iloc[i][self.input_col], self.train.iloc[i][self.label_col]
                
    def on_epoch_end(self):
        'Updates indexes after each epoch'
        if self.shuffle == True:
            np.random.shuffle(self.indexes)
    
    def __data_generation(self):
        'Generate one batch of data'
        if not self.active_batch_gen:
            self.active_batch_gen = self._batch_gen(self.label_col)
        
        input_lst = []
        label_lst = []
        for i in range(self.batch_size):
            c_input, c_label = next(self.active_batch_gen)
            input_lst.append(c_input)
            label_lst.append(c_label)
        return input_lst, label_lst
        

In [85]:
socialblade_data = myData([good_url, bad_url])

In [88]:
# Generators
training_generator = DataGenerator(socialblade_data.train, "encoded", "Views")
validation_generator = DataGenerator(socialblade_data.test, "encoded", "Views")

In [None]:
1/0

In [89]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.datasets import make_regression
from sklearn.preprocessing import MinMaxScaler

In [91]:
model = Sequential()
model.add(Dense(36, input_dim=36*94, activation='relu'))
model.add(Dense(4, activation='relu'))
model.add(Dense(4, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mse', optimizer='adam')
model.fit_generator(
    generator=training_generator,
    validation_data=validation_generator,
    use_multiprocessing=True,
    workers=6
)

NotImplementedError: 

In [None]:
socialblade_data.data["encoded"].tolist()[0]

In [None]:
socialblade_data.data.iloc[1]["encoded"]