diff --git a/KnowYourMemesParser.py b/KnowYourMemesParser.py old mode 100755 new mode 100644 index 0dc19b2..650fa7e --- a/KnowYourMemesParser.py +++ b/KnowYourMemesParser.py @@ -1,234 +1,230 @@ -# -*- coding: utf-8 -*- -""" -Created on Tue Sep 6 17:40:12 2016 -@author: dmitrys -""" +# coding: utf-8 -############################################################### -#### Meme parsing just for lulz and science <3 #### -############################################################### +# authors: @DmitrySerg, @FUlyankin +# repository: https://github.com/DmitrySerg/memology +import requests # отправка запросов +import numpy as np # матрицы, вектора и линал +import pandas as pd # таблички и операции с ними +import time # время -import re -from bs4 import BeautifulSoup -import time -import pandas as pd -import numpy as np -from urllib.request import Request, urlopen -import getpass -import sys -sys.path.append('/Users/dmitrys/anaconda2/lib/python2.7/site-packages') -username = getpass.getuser() - -#### Send me a letter -import smtplib -from email.mime.multipart import MIMEMultipart -from email.mime.text import MIMEText -#from email.mime.base import MIMEBase -#from email import encoders -import requests -import sys +from tqdm import tqdm # мониторинг прогресса +from fake_useragent import UserAgent # генерация правдоподобных юзер-агентов +from bs4 import BeautifulSoup # очень красивый суп для обработки html +import argparse # чтение аргументов из коммандной строки -#### GOING TO TOR -import socks +import socks # подключение к тору import socket -socks.set_default_proxy(socks.SOCKS5, "localhost", 9150) -socket.socket = socks.socksocket -#print(urlopen('http://icanhazip.com').read()) - - -from fake_useragent import UserAgent - -def generateUserAgent(): - return UserAgent().chrome - -def html_stripper(text): - return re.sub('<[^<]+?>', '', str(text)) - - -number_of_pages = 369 -page = 1 -main_url = 'http://knowyourmeme.com/' -columns = ['name', 'added', 'views', 'comments', 'status', 'year', 'tags', 'about', 'origin', 'spread'] -FINAL = pd.DataFrame(columns=columns) -START = time.time() - -def getMemeUrls(page): - req = Request('http://knowyourmeme.com/memes/all/page/{}'.format(page), headers={'User-Agent': generateUserAgent()}) - webpage = urlopen(req).read() - soup = BeautifulSoup(webpage, "lxml") - meme_urls = soup.findAll('a', attrs={'class':'photo'}) - print('Getting all memes from page {}'.format(page)) - return meme_urls - - +import sys -def getAllFromPage(meme_urls): - global FINAL - count = 0 - start = time.time() - current_shape = FINAL.shape - for meme in meme_urls: - count += 1 +def getPageLinks(page_number): + """ + Возвращает список ссылок на мемы, полученный с текущей страницы - to_append = {x:np.NaN for x in columns} - #time.sleep(1) - try: - meme_url = re.split('href="|" target="|">