# Importing the required packages

In [1]:
import requests
from bs4 import BeautifulSoup
import urllib.request
from contextlib import closing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from selenium import webdriver
from itertools import chain

### url which has to be used for scraping

In [112]:
#url = 'https://mvgee.com/home'
url = 'https://gogoanime.in'

### the function to retrieve the text content of the webpage

In [2]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None
    """
    try:
        with closing(requests.get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except requests.RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns true if the response seems to be HTML, false otherwise
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [114]:
#page is a requests object which holds the downloaded html page
page = simple_get(url)
#print(page)

In [115]:
#creating the BeautifulSoup object
soup = BeautifulSoup(page, 'html.parser')

### checking the ongoing tv series.

In [116]:
# retrieving the ongoing series name and link in the right side menu...

#reading the content of the division tag where the links of ongoing series are available
ongoingseries = soup.find_all('div', class_="overview")

#reading all the links in a list object from the division bs4.object.Tag ongoingseries
links= list(chain.from_iterable([n.find_all('a') for n in ongoingseries]))

#creating a dataframe to store all the ongoing series with name and url link
ongoingseries_df = pd.DataFrame(data=None, columns=['TVseriesName', 'Link'],index=None)

#using for loop to extract all the urls and tv series name and storing it 1 by 1 into the dataframe
for link in links:
    ongoingseries_df = ongoingseries_df.append({'TVseriesName': link.get_text(), 'Link': url+link.get('href')}, ignore_index=True)

#printing the first five values in the dataframe to check if it stored properly or  not
ongoingseries_df.head()

Unnamed: 0,TVseriesName,Link
0,3D Kanojo: Real Girl,https://gogoanime.in/category/3d-kanojo-real-girl
1,Aikatsu Friends!,https://gogoanime.in/category/aikatsu-friends
2,Aishen Qiaokeli-ing... 2nd Season,https://gogoanime.in/category/aishen-qiaokeli-...
3,Akkun to Kanojo,https://gogoanime.in/category/akkun-to-kanojo
4,Alice or Alice: Siscon Niisan to Futago no Imouto,https://gogoanime.in/category/alice-or-alice-s...


### Checking the recently added tv series

In [117]:
#reading the content of the division tag where the links of ongoing series are available
recentlyAddedSeries = soup.find_all('div', class_="added_series_body final")

#reading all the links in a list object from the division bs4.object.Tag ongoingseries
links= list(chain.from_iterable([n.find_all('a') for n in recentlyAddedSeries]))

#creating a dataframe to store all the ongoing series with name and url link
recentlyAddedSeries_df = pd.DataFrame(data=None, columns=['TVseriesName', 'Link'],index=None)

#using for loop to extract all the urls and tv series name and storing it 1 by 1 into the dataframe
for link in links:
    #print(link)
    recentlyAddedSeries_df = recentlyAddedSeries_df.append({'TVseriesName': link.get_text(), 'Link': url+link.get('href')}, ignore_index=True)

#printing the first five values in the dataframe to check if it stored properly or  not
recentlyAddedSeries_df.head()

Unnamed: 0,TVseriesName,Link
0,Uchuu Senkan Yamato 2202: Ai no Senshi-tachi,https://gogoanime.in/category/uchuu-senkan-yam...
1,Full Metal Panic! Invisible Victory (Dub),https://gogoanime.in/category/full-metal-panic...
2,Douluo Dalu - Soul Land,https://gogoanime.in/category/douluo-dalu-soul...
3,Yaoguai Mingdan Season 1,https://gogoanime.in/category/yaoguai-mingdan-...
4,Yaoguai Mingdan Season 2,https://gogoanime.in/category/yaoguai-mingdan-...


### now extracting all the anime tv series from link - https://gogoanime.in/anime-list.html
Note - to go to this url, click on anime list on top of the main page.

In [3]:
url = 'https://gogoanime.in/anime-list.html'

In [4]:
#page is a requests object which holds the downloaded html page
page = simple_get(url)
#print(page)

In [5]:
#creating the BeautifulSoup object
soup = BeautifulSoup(page, 'html.parser')

In [6]:
print(soup.prettify())

<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en-US" xmlns="http://www.w3.org/1999/xhtml">
 <head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# article: http://ogp.me/ns/article#">
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <link href="http://gmpg.org/xfn/11" rel="profile"/>
  <link href="/favicon.png" rel="shortcut icon"/>
  <meta content="shHYF9VOwfpFOdBpleH5jZ2CBuy6nVjX4PrkxMSRqHQ" name="google-site-verification">
   <title>
    List All Anime at Gogoanime | Anime List
   </title>
   <meta content="noodp, noydir" name="robots"/>
   <meta content="List All Anime  at Gogoanime | Anime List" name="description"/>
   <meta content="List All Anime  at Gogoanime | Anime List" name="keywords"/>
   <meta content="/images/logo.png" itemprop="image">
    <meta content="Gogoanime" property="og:site_name">
     <meta content="en_US" property="og:locale"/>
     <meta content="website

In [52]:
#reading the content of the division tag where the links of ongoing series are available
animelist = soup.find_all('div', class_="anime_list_body")
#print(animelist)

#reading all the linklist in a list object from the division bs4.object.Tag animelist
linklist= list(chain.from_iterable([n.find_all('li') for n in animelist]))

trialsoup = BeautifulSoup(linklist[0].get('title'), "html.parser") 

# Genres of the anime tv series
print(trialsoup.find_all('p',class_='type')[0].get_text())
#finding link of the genres.
print(trialsoup.find_all('p',class_='type')[0].find_all('a')[0].get('href'))
# Released year of the anime tv series
print(trialsoup.find_all('p',class_='type')[1].get_text())

# Status of the anime tv series
print(trialsoup.find_all('p',class_='type')[2].get_text())
# Plot Summary of the anime tv series
print(trialsoup.find('p',class_='sumer').get_text())



'''
#creating a dataframe to store all the ongoing series with name and url link
recentlyAddedSeries_df = pd.DataFrame(data=None, columns=['TVseriesName', 'Link'],index=None)

#using for loop to extract all the urls and tv series name and storing it 1 by 1 into the dataframe
for link in links:
    #print(link)
    recentlyAddedSeries_df = recentlyAddedSeries_df.append({'TVseriesName': link.get_text(), 'Link': url+link.get('href')}, ignore_index=True)

#printing the first five values in the dataframe to check if it stored properly or  not
ongoingseries_df.head() '''

Genre: Drama, School, Shounen Ai, Slice of Life
http://gogoanime.in/genre/drama
Released: 2006
Status:  Completed
Plot Summary:  Life goes on for the ladies in the Yamayurikai. On the one-year anniversary of Sachiko and Yumi becoming sisters, Yumi receives a difficult task from Sachiko: to finally acknowledge someone as her younger sister.
 


"\n#creating a dataframe to store all the ongoing series with name and url link\nrecentlyAddedSeries_df = pd.DataFrame(data=None, columns=['TVseriesName', 'Link'],index=None)\n\n#using for loop to extract all the urls and tv series name and storing it 1 by 1 into the dataframe\nfor link in links:\n    #print(link)\n    recentlyAddedSeries_df = recentlyAddedSeries_df.append({'TVseriesName': link.get_text(), 'Link': url+link.get('href')}, ignore_index=True)\n\n#printing the first five values in the dataframe to check if it stored properly or  not\nongoingseries_df.head() "

In [None]:
'anime_list_body'

In [89]:
links= []
for n in ongoingseries:
    print(type(n))
    links = n.find_all('a')
print(links[0])

<class 'bs4.element.Tag'>
<a href="/category/3d-kanojo-real-girl" title=" 3D Kanojo: Real Girl"> 3D Kanojo: Real Girl</a>


In [8]:
driver = webdriver.Chrome('C:\Program Files\chromedriver_win32\chromedriver.exe')
driver.get(url)
time.sleep(5)
driver.find_element_by_class_name("recaptcha-checkbox-border").click()
time.sleep(5)
#elem1 = driver.find_element_by_link_text(url)
#elem1.click()
#time.sleep(5)
#driver.quit()

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"class name","selector":"recaptcha-checkbox-border"}
  (Session info: chrome=66.0.3359.181)
  (Driver info: chromedriver=2.38.552522 (437e6fbedfa8762dec75e2c5b3ddb86763dc9dcb),platform=Windows NT 6.1.7601 SP1 x86_64)
