In [1]:
from google.oauth2 import service_account
from googleapiclient.discovery import build

# Load service account credentials
SERVICE_ACCOUNT_FILE = "path/to/your-service-account.json"
SCOPES = ["https://www.googleapis.com/auth/webmasters.readonly"]

credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES
)

# Build the Search Console API client
service = build("searchconsole", "v1", credentials=credentials)


In [26]:
from google_auth_oauthlib.flow import InstalledAppFlow
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
import pandas as pd
from datetime import datetime, timedelta
import os
import pickle
import time

class SearchConsoleAPI:
    def __init__(self):
        self.SCOPES = ['https://www.googleapis.com/auth/webmasters.readonly']
        self.credentials = None
        self.service = None
        self.ROW_LIMIT = 25000

    def authenticate(self, client_secrets_file):
        """Authenticate using OAuth 2.0"""
        creds = None
        token_file = 'token.pickle'
        
        if os.path.exists(token_file):
            with open(token_file, 'rb') as token:
                creds = pickle.load(token)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    client_secrets_file, self.SCOPES)
                creds = flow.run_local_server(port=0)
            
            with open(token_file, 'wb') as token:
                pickle.dump(creds, token)

        self.credentials = creds
        self.service = build('searchconsole', 'v1', credentials=creds)
        return True

    def get_site_list(self):
        """Get list of available sites"""
        try:
            sites = self.service.sites().list().execute()
            return sites.get('siteEntry', [])
        except Exception as e:
            print(f"Error getting sites: {str(e)}")
            return []

    def fetch_data_chunk(self, site_url, start_date, end_date, dimensions, start_row=0):
        """Fetch a chunk of data"""
        request = {
            'startDate': start_date,
            'endDate': end_date,
            'dimensions': dimensions,
            'rowLimit': self.ROW_LIMIT,
            'startRow': start_row
        }

        try:
            response = self.service.searchanalytics().query(
                siteUrl=site_url,
                body=request
            ).execute()
            return response
        except Exception as e:
            print(f"Error fetching chunk starting at row {start_row}: {str(e)}")
            return None

    def fetch_all_data(self, site_url, start_date, end_date, dimensions, 
                      max_retries=3, delay_between_chunks=1):
        """Fetch all data with pagination and retry logic"""
        all_rows = []
        start_row = 0
        total_rows_fetched = 0
        
        while True:
            retry_count = 0
            chunk_data = None
            
            while retry_count < max_retries and chunk_data is None:
                if retry_count > 0:
                    print(f"Retrying chunk (attempt {retry_count + 1}/{max_retries})...")
                    time.sleep(delay_between_chunks * 2)
                
                chunk_data = self.fetch_data_chunk(
                    site_url, start_date, end_date, dimensions, start_row)
                retry_count += 1

            if chunk_data is None:
                print(f"Failed to fetch chunk after {max_retries} attempts")
                break

            rows = chunk_data.get('rows', [])
            if not rows:
                break

            all_rows.extend(rows)
            total_rows_fetched += len(rows)
            
            print(f"Fetched {len(rows)} rows (Total: {total_rows_fetched})")
            
            if len(rows) < self.ROW_LIMIT:
                break
                
            start_row += self.ROW_LIMIT
            time.sleep(delay_between_chunks)

        return all_rows

def process_data(rows, dimensions):
    """Process the raw data into a DataFrame"""
    processed_rows = []
    
    for row in rows:
        row_data = {}
        
        # Add dimensions
        for i, dimension in enumerate(dimensions):
            row_data[dimension] = row['keys'][i]
            
        # Add metrics
        row_data.update({
            'clicks': row['clicks'],
            'impressions': row['impressions'],
            'ctr': row['ctr'],
            'position': row['position']
        })
        
        processed_rows.append(row_data)
    
    return pd.DataFrame(processed_rows)

def main():
    # Initialize API client
    api = SearchConsoleAPI()
    
    # Your OAuth client secrets file path
    client_secrets_file = 'client_secret_81929404806-p1kl4usbb9llq1j4pbvg35a08toljnn3.apps.googleusercontent.com.json'  # Update this path
    
    # Authenticate
    print("Authenticating...")
    if not api.authenticate(client_secrets_file):
        print("Authentication failed")
        return

    # Get available sites
    sites = api.get_site_list()
    if not sites:
        print("No sites available")
        return

    print("\nAvailable sites:")
    for site in sites:
        print(f"- {site['siteUrl']}")

    # Use the correct site URL format
    site_url = 'sc-domain:scholistico.com'  # Using the domain property format
    
    # Set date range (past 90 days)
    end_date = datetime.now().strftime('%Y-%m-%d')
    start_date = (datetime.now() - timedelta(days=90)).strftime('%Y-%m-%d')
    
    dimensions = ['date', 'query', 'page', 'device', 'country']

    print(f"\nFetching all data for {site_url}")
    print(f"Date range: {start_date} to {end_date}")
    print(f"Dimensions: {', '.join(dimensions)}")

    # Fetch all data
    rows = api.fetch_all_data(
        site_url=site_url,
        start_date=start_date,
        end_date=end_date,
        dimensions=dimensions,
        delay_between_chunks=1
    )

    if not rows:
        print("No data received")
        return

    print(f"\nTotal rows fetched: {len(rows)}")

    # Process data
    print("\nProcessing data...")
    df = process_data(rows, dimensions)

    # Save data
    output_file = f'search_console_data_{start_date}_to_{end_date}.csv'
    print(f"\nSaving data to {output_file}...")
    df.to_csv(output_file, index=False)

    print("\nData Summary:")
    print(f"Total rows: {len(df)}")
    print("\nSample of the data:")
    print(df.head())
    print(f"\nData successfully saved to {output_file}")

if __name__ == "__main__":
    main()

Authenticating...

Available sites:
- sc-domain:scholistico.com

Fetching all data for sc-domain:scholistico.com
Date range: 2024-10-11 to 2025-01-09
Dimensions: date, query, page, device, country
Fetched 25000 rows (Total: 25000)
Fetched 25000 rows (Total: 50000)
Fetched 25000 rows (Total: 75000)
Fetched 25000 rows (Total: 100000)
Fetched 25000 rows (Total: 125000)
Fetched 25000 rows (Total: 150000)
Fetched 25000 rows (Total: 175000)
Fetched 25000 rows (Total: 200000)
Fetched 25000 rows (Total: 225000)
Fetched 25000 rows (Total: 250000)
Fetched 25000 rows (Total: 275000)
Fetched 25000 rows (Total: 300000)
Fetched 25000 rows (Total: 325000)
Fetched 25000 rows (Total: 350000)
Fetched 25000 rows (Total: 375000)
Fetched 25000 rows (Total: 400000)
Fetched 25000 rows (Total: 425000)
Fetched 25000 rows (Total: 450000)
Fetched 25000 rows (Total: 475000)
Fetched 25000 rows (Total: 500000)
Fetched 25000 rows (Total: 525000)
Fetched 25000 rows (Total: 550000)
Fetched 25000 rows (Total: 575000)
F

In [63]:
import pandas as pd


df=pd.read_csv('search_console_data_2024-10-11_to_2025-01-09.csv')

In [64]:
list_of_blogs=df.loc[df.page.str.contains('/\d+') &  ~(df.page.str.contains('product|blog|groups|course|produkt|cursos|topic|category|shop|page|ref|Corsi|kurser|fr.scholistico|fi.scholistico|dk.scholistico|no.scholistico'))].page.unique()

  list_of_blogs=df.loc[df.page.str.contains('/\d+') &  ~(df.page.str.contains('product|blog|groups|course|produkt|cursos|topic|category|shop|page|ref|Corsi|kurser|fr.scholistico|fi.scholistico|dk.scholistico|no.scholistico'))].page.unique()


In [65]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd

# Function to check if 'DALL-E prompt:' exists in comments
def check_dalle_prompt(url):
    try:
        # Send a GET request to the blog URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes (4xx or 5xx)
        
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all comments in the HTML
        comments = soup.find_all(string=lambda text: isinstance(text, Comment))
        
        # Check if any of the comments contain 'DALL-E prompt:'
        for comment in comments:
            if 'DALL-E prompt:' in comment:
                return "AI"
        
        # If 'DALL-E prompt:' is not found in comments, return "manual"
        return "manual"
    
    except requests.RequestException as e:
        # If there's an error in fetching the page (e.g., network error, invalid URL)
        print(f"Error fetching {url}: {e}")
        return "error"

# List of blog URLs to check

# Prepare a list to store results
results = []

# Iterate through each blog URL and check for DALL-E prompt in comments
for blog in list_of_blogs:
    blog_type = check_dalle_prompt(blog)
    results.append({"blog_link": blog, "type": blog_type})

# Convert results to a pandas DataFrame
df_1 = pd.DataFrame(results)

# Display the DataFrame
print(df_1)

# Optionally, save the result to a CSV file
df_1.to_csv('blog_check_results.csv', index=False)


                                            blog_link    type
0   https://scholistico.com/10-best-art-therapy-ac...  manual
1   https://scholistico.com/8-best-art-therapy-exe...  manual
2   https://it.scholistico.com/10-esercizi-di-arte...  manual
3   https://scholistico.com/10-art-therapy-exercis...  manual
4   https://it.scholistico.com/7-migliori-esercizi...  manual
..                                                ...     ...
94  https://es.scholistico.com/7-maneras-de-crear-...      AI
95  https://es.scholistico.com/7-ejercicios-de-art...      AI
96  https://scholistico.com/8-natural-approaches-t...      AI
97  https://de.scholistico.com/7-kunsttherapieuebu...      AI
98  https://de.scholistico.com/7-kunsttherapieuebu...      AI

[99 rows x 2 columns]


In [67]:
df=pd.merge(df,df_1,left_on='page',right_on='blog_link',how='left')

In [70]:
df.dropna(subset=['blog_link']).to_csv('final_plotly_data.csv',index=False)

In [18]:
import json

def print_service_account_email(key_file_path):
    with open(key_file_path, 'r') as f:
        key_data = json.load(f)
        print(f"Service Account Email: {key_data['client_email']}")

# Use this to get your service account email
key_file_path = 'client_secret_81929404806-p1kl4usbb9llq1j4pbvg35a08toljnn3.apps.googleusercontent.com.json'
print_service_account_email(key_file_path)

KeyError: 'client_email'