In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from google.colab import files
import warnings

warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made')

def fetch_and_save_table_to_excel(url, output_file):
    response = requests.get(url, verify=False)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')

    headers = [header.text.strip() for header in table.find_all('th')]
    rows = []

    for row in table.find_all('tr'):
        columns = [col.text.strip() for col in row.find_all('td')]
        if len(columns) > 0:
            rows.append(columns)

    try:
        df = pd.DataFrame(rows, columns=headers)
    except ValueError as e:
        print(f"Error creating DataFrame: {e}")
        max_cols = max(len(row) for row in rows)
        df = pd.DataFrame(rows, columns=headers[:max_cols])

    df.to_excel(output_file, index=False)
    files.download(output_file)

    print(f"Data has been saved to {output_file} and is ready for download.")

url = 'https://cfr.annauniv.edu/research/academics/journals-list.php'
output_file = 'table_data.xlsx'
fetch_and_save_table_to_excel(url, output_file)


Error creating DataFrame: 8 columns passed, passed data had 6 columns


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Data has been saved to table_data.xlsx and is ready for download.


In [2]:
import pandas as pd
from google.colab import files

uploaded = files.upload()

for file_name in uploaded.keys():
    df = pd.read_excel(file_name)

print("First few rows of the dataset:")
print(df.head())

df.columns = ['S.No', 'Full Journal Title', 'Print-ISSN', 'E-ISSN', 'Publisher', 'Country']

print("\nBasic Information about the dataset:")
print("\nSummary of missing values per column:")
print(df.isnull().sum())

print("\nUnique value counts per column:")
print(df.nunique())

print("\nDataset Statistics:")
print(df.describe())

print("\nTop 5 most frequent Publishers:")
print(df['Publisher'].value_counts().head(5))

print("\nTop 5 Countries represented in the dataset:")
print(df['Country'].value_counts().head(5))


Saving table_data.xlsx to table_data (1).xlsx
First few rows of the dataset:
   Common for all registered Ph.D. scholars irrespective of the time of their registration  \
0                                                  1                                         
1                                                  2                                         
2                                                  3                                         
3                                                  4                                         
4                                                  5                                         

  FAKE JOURNALS-IMPORTANT GUIDELINE (  Click Here )      Sl.No  \
0                       ACADEMIC EMERGENCY MEDICINE  1069-6563   
1                                 ACADEMIC MEDICINE  1040-2446   
2                                ACADEMIC RADIOLOGY  1076-6332   
3                      ACADEMY OF MANAGEMENT ANNALS  1941-6520   
4                     ACADEM