In [519]:
from bs4 import BeautifulSoup
import requests
import os
import datetime
import time

In [424]:
import pandas as pd
import numpy as np

In [425]:
import logging

In [440]:
 # Route log records of level ERROR and above to the file 'acquirefunc.log'.
 # Each record carries timestamp, level, thread name, logger name and message.
 # Because the threshold is ERROR, create_nyangler_df below reports its chunk
 # progress with logging.error(...) so that those markers reach the file.
 logging.basicConfig(level=logging.ERROR,
                            format="%(asctime)s %(levelname)s %(threadName)s %(name)s %(message)s",
                            filename='acquirefunc.log')

# NYAngler Scrape

In [363]:
# Base URL of the forum; the relative hrefs scraped below are appended to it.
# NOTE(review): the identifier is spelled "nyanger" (missing "l"). It is used by
# several cells and as a default argument, so rename it everywhere in one pass.
nyanger_url = 'https://nyangler.com'

In [511]:
# Directory that create_nyangler_df joins its pickle file names onto.
# NOTE(review): absolute, machine-specific path; consider a configurable
# directory relative to the project so the notebook runs on other machines.
data_folder = '/home/desbrium/Metis/FishingSpots/Data/NyAngler/'

In [364]:
# Download the forum index page and parse it.
response = requests.get(nyanger_url + '/forums/')
page = response.text
soup = BeautifulSoup(page, 'lxml')

# Every 'block-container' div is one section of the index. The sections at
# positions 3 and 4 are the ones treated as saltwater and freshwater here.
# NOTE(review): these positions are hard-coded; confirm against the live layout.
forum_threads = soup.find_all('div', class_ = 'block-container')
saltwater_threads = forum_threads[3].find_all('h3', class_ = 'node-title')
freshwater_threads = forum_threads[4].find_all('h3', class_ = 'node-title')

# Three parallel lists (name / relative link / category), seeded with the
# hand-picked "Information" forums.
forum_link_dict = {
    'Thread_Name': ['Fishing Report', 'New York State DEC Updates', 'New York Fishing Podcast'],
    'Thread_Link': ['/forums/fishing-report.93/', '/forums/new-york-state-dec-updates.91/', '/forums/new-york-fishing-podcast.68/'],
    'Thread_Category': ['Information', 'Information', 'Information'],
}

# Append the scraped forums, saltwater first and freshwater second, keeping the
# three lists aligned entry by entry.
for category, threads in (('SaltWater_Fishing', saltwater_threads),
                          ('FreshWater_Fishing', freshwater_threads)):

    for thread in threads:

        forum_link_dict['Thread_Link'].append(thread.find('a').get('href'))
        # NOTE(review): replace('New', '') removes every occurrence of 'New'
        # in the heading text, wherever it appears. Confirm that no forum
        # title legitimately contains that word.
        forum_link_dict['Thread_Name'].append(thread.text.replace('New', '').strip())
        forum_link_dict['Thread_Category'].append(category)

In [365]:
# Expand every forum into the list of its paginated listing pages.
# 'Thread_Name' and 'Page_Links' are parallel lists: one entry per listing page.
thread_link_dict = {'Thread_Name': [], 'Page_Links':[]}

thread_links = forum_link_dict['Thread_Link']
thread_names = forum_link_dict['Thread_Name']

for thread_link, thread_name in zip(thread_links, thread_names):

    response = requests.get(f'{nyanger_url + thread_link}')
    page = response.text
    soup = BeautifulSoup(page,'lxml')

    page_nav_items = soup.find_all('li', class_ = 'pageNav-page')

    try:

        # The last pagination entry carries the number of the final page.
        last_page = int(page_nav_items[-1].text)

    except (IndexError, ValueError):

        # No usable pagination entry means the forum fits on a single page.
        # It must still be scraped, so treat it as having exactly one page
        # rather than skipping the forum altogether.
        last_page = 1

    for page_num in range(1, last_page + 1):

        # The first page lives at the bare forum link; later pages add 'page-N'.
        if page_num == 1:

            page_link = thread_link

        else:

            page_link = thread_link + f'page-{page_num}'

        thread_link_dict['Page_Links'].append(page_link)
        thread_link_dict['Thread_Name'].append(thread_name)
        

In [366]:
# Visit every listing page and collect the link of each thread listed on it.
# The three lists are parallel: one entry per discovered thread link.
post_link_dict = {'Thread_Name': [], 'Page_Links':[], 'Post_Links':[]}

page_links = thread_link_dict['Page_Links']
thread_names = thread_link_dict['Thread_Name']

for page_link, thread_name in zip(page_links, thread_names):

    response = requests.get(f'{nyanger_url + page_link}')
    page = response.text
    soup = BeautifulSoup(page,'lxml')

    table = soup.find('div', class_ ='structItemContainer')

    if table is None:

        # find() returns None when the container is absent. Record the page
        # and move on so one bad page does not abort the whole crawl.
        logging.error(f'No structItemContainer found on {page_link}')
        continue

    for post in table.find_all('li', class_ = 'structItem-startDate'):

        anchor = post.find('a')
        href = anchor.get('href') if anchor is not None else None

        if not href:

            # Without an href there is nothing to fetch later on.
            continue

        post_link_dict['Thread_Name'].append(thread_name)
        post_link_dict['Page_Links'].append(page_link)
        post_link_dict['Post_Links'].append(href)

In [561]:
def _collect_page_posts(soup, post_dict, page_link):
    """Add the text and timestamp of every substantial post on one parsed page to post_dict."""

    table = soup.find('div', class_ = 'p-body-inner')

    if table is None:

        # Report the page that could not be parsed and skip it.
        print(page_link)
        return

    # Quoted replies repeat earlier posts; drop them before reading the text.
    for blockquote in table.find_all('blockquote'):

        blockquote.decompose()

    for post in table.find_all('div', class_ = 'message-cell message-cell--main'):

        wrapper = post.find('div', class_ = 'bbWrapper')

        if wrapper is None:

            continue

        post_text = wrapper.text.strip()

        # Only posts longer than 30 characters are kept.
        if len(post_text) > 30:

            post_dict['Post_Content'] += post_text

            time_tag = post.find('time')

            if time_tag is not None:

                post_dict['Post_Content_Dates'].append(time_tag.get('title'))


def acquire_post_info(base_url = nyanger_url, post_link = None):
    """Scrape one forum thread (all of its pages) into a dictionary.

    Parameters
    ----------
    base_url : str
        Site root that `post_link` is appended to.
    post_link : str
        Relative link of the thread.

    Returns
    -------
    dict
        Keys 'Post_Links', 'Post_Title', 'Post_DateTime', 'Post_Content'
        (text of all kept posts concatenated) and 'Post_Content_Dates'
        (list with the 'title' attribute of each kept post's <time> tag).
        When the first request returns a status of 400 or above, the status
        code is printed and the dictionary is returned with empty values.
    """

    post_dict = {'Post_Links':'', 'Post_Title': '', 'Post_DateTime': '', 'Post_Content': '', 'Post_Content_Dates': []}

    first_page_link = f'{base_url + post_link}'

    response = requests.get(first_page_link)

    if int(response.status_code) >= 400:

        print(f'{response.status_code}')

        return post_dict

    soup = BeautifulSoup(response.text,'lxml')

    table = soup.find('div', class_ = 'p-body-inner')

    post_dict['Post_Links'] += post_link

    if table is None:

        # Nothing parseable on the page; report it like any other bad page.
        print(first_page_link)

        return post_dict

    title_tag = table.find('h1')

    if title_tag is not None:

        post_dict['Post_Title'] += title_tag.text

    try:

        last_page_num = int(table.find_all('li', class_ = 'pageNav-page')[-1].text)

    except (IndexError, ValueError):

        # No usable pagination entry: the thread has a single page.
        follow_up_pages = range(0)

    else:

        # When pagination is present, page 2 is always visited.
        follow_up_pages = range(2, max(last_page_num, 2) + 1)

    # The first page is already downloaded and parsed, so reuse it.
    _collect_page_posts(soup, post_dict, first_page_link)

    for page_num in follow_up_pages:

        page_link = f'{base_url + post_link}' + f'page-{page_num}'

        page_soup = BeautifulSoup(requests.get(page_link).text,'lxml')

        _collect_page_posts(page_soup, post_dict, page_link)

    # The thread's timestamp is that of its first kept post. It is assigned once,
    # after all pages are read, so multi-page threads do not repeat the value.
    content_dates = post_dict['Post_Content_Dates']

    if content_dates and isinstance(content_dates[0], str):

        post_dict['Post_DateTime'] = content_dates[0]

    return post_dict

In [552]:
def create_nyangler_df(post_links = post_link_dict['Post_Links'], start_num = None, end_num = None, download_folder = data_folder):
    """Scrape a slice of thread links and pickle the result as a DataFrame.

    Parameters
    ----------
    post_links : list of str
        Relative thread links; only post_links[start_num:end_num] is scraped.
    start_num, end_num : int or None
        Slice bounds. They are also embedded in the file name.
    download_folder : str
        Directory the pickle file is written to.

    Returns
    -------
    str
        Path of the pickle file that was written.
    """

    forum_info = [acquire_post_info(post_link = link) for link in post_links[start_num : end_num]]

    df = pd.DataFrame(forum_info)

    file_name = f'NYAnglerPostData{start_num}{end_num}{datetime.date.today().strftime("%m-%d-%y")}.pkl'

    # Write to the folder the caller asked for, not the module-level default.
    file_path = os.path.join(download_folder,file_name)

    df.to_pickle(file_path)

    # Progress marker: the chunk bounds that have just been saved.
    logging.error(f'{start_num}{end_num}')

    return file_path

In [553]:
def post_chunks(post_links = post_link_dict['Post_Links']):
    """Split the index range of `post_links` into consecutive (start, end) chunks.

    The range 0..len(post_links) is cut at len(post_links)//40 evenly spaced
    integer edges, and each pair of neighbouring edges becomes one chunk.

    Parameters
    ----------
    post_links : list
        The links to partition; only the length is used.

    Returns
    -------
    list of tuple
        (start, end) index pairs usable as post_links[start:end]. The list is
        empty when `post_links` is empty.
    """

    length = len(post_links)

    if length == 0:

        return []

    # One chunk needs two edges. Short inputs would otherwise get zero or one
    # edge and yield no chunks at all, so force a minimum of two.
    num_edges = max(length // 40, 2)

    post_bins = np.linspace(0, length, num = num_edges, dtype = np.int64)

    # Pair each edge with its successor: (e0, e1), (e1, e2), ...
    return list(zip(post_bins[:-1], post_bins[1:]))

In [554]:
def create_post_info(post_links = post_link_dict['Post_Links'], chunk_list = post_chunks(), start_index = None):
    """Lazily scrape and save the links chunk by chunk.

    Parameters
    ----------
    post_links : list of str
        Relative thread links, forwarded to create_nyangler_df.
    chunk_list : list of tuple
        (start, end) index pairs. The default is computed once, when this
        function is defined, so re-run the definition after the links change.
    start_index : int or None
        Position in `chunk_list` to start from, e.g. to resume an earlier run.

    Yields
    ------
    str
        Path of the pickle file written for each chunk.
    """

    for start_num, end_num in chunk_list[start_index:]:

        # Forward post_links so that a caller-supplied list is actually used.
        yield create_nyangler_df(post_links = post_links, start_num = start_num, end_num = end_num)

In [563]:
# data_list = []

# gen = create_post_info(start_index = 23)

# for postinfo in gen:
#     data_list.append(postinfo)
#     time.sleep(5)

In [566]:
# df_list = []

# for data in data_list:
#     df_list.append(pd.read_pickle(data))

# df = pd.concat(df_list, axis = 0)
# nyangler_df = pd.DataFrame(post_link_dict).merge(pd.DataFrame(forum_link_dict), how = 'left', on = 'Thread_Name').merge(df, how = 'left', on = 'Post_Links')

In [567]:
# file_name = f'NYAnglerPostData{datetime.date.today().strftime("%m-%d-%y")}.csv'

# file_path = os.path.join(data_folder,file_name)

# nyangler_df.to_csv(file_path, index = False)

'/home/desbrium/Metis/FishingSpots/Data/NyAngler/NYAnglerPostData06-22-21.csv'