In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np
import time
import warnings
import re
import urllib.parse
from urllib.parse import urljoin
from PIL import Image
from io import BytesIO
import os


#-----------------------------------------------------
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
#-----------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
#----------------------------------------------------
import joblib

warnings.filterwarnings("ignore")


In [5]:
# Load your cleaned dataset
df = pd.read_csv('wikipedia_content.csv')

# Load the outher dataset 
df2= pd.read_csv('wikipedia_photos.csv')

In [7]:
df

Unnamed: 0,title,url,cleaned_content,number_of_words_content
0,Cosmology,https://en.wikipedia.org/?curid=1864889,Cosmology (from Ancient Greek κόσμος (cosmos) ...,1369
1,Buddhist cosmology,https://en.wikipedia.org/?curid=5264082,Buddhist cosmology is the description of the s...,5001
2,Physical cosmology,https://en.wikipedia.org/?curid=5378,Physical cosmology is a branch of cosmology co...,3856
3,Religious cosmology,https://en.wikipedia.org/?curid=977209,Religious cosmology is an explanation of the o...,1703
4,Biblical cosmology,https://en.wikipedia.org/?curid=307968,Biblical cosmology is the account of the unive...,3380
...,...,...,...,...
495,Bel (mythology),https://en.wikipedia.org/?curid=93805,Bêl (/ˈbeɪl/; from Akkadian: bēlu) is a title ...,320
496,Traditional African religions,https://en.wikipedia.org/?curid=4655918,The beliefs and practices of African people ar...,2819
497,Asgard,https://en.wikipedia.org/?curid=1460,"In Nordic mythology, Asgard (Old Norse: Ásgarð...",1657
498,Vatican City,https://en.wikipedia.org/?curid=32408,This is an accepted version of this page Vatic...,7631


In [9]:
df2

Unnamed: 0,title,photo_url,photo_description
0,Cosmology,https:/static/images/icons/wikipedia.png,Image related to Cosmology
1,Cosmology,https:/static/images/mobile/copyright/wikipedi...,Image related to Cosmology
2,Cosmology,https:/static/images/mobile/copyright/wikipedi...,Image related to Cosmology
3,Cosmology,https://upload.wikimedia.org/wikipedia/commons...,Image related to Cosmology
4,Cosmology,https://upload.wikimedia.org/wikipedia/commons...,Image related to Cosmology
...,...,...,...
13838,Nag Hammadi library,https://upload.wikimedia.org/wikipedia/commons...,Image related to Nag Hammadi library
13839,Nag Hammadi library,https://upload.wikimedia.org/wikipedia/en/thum...,Image related to Nag Hammadi library
13840,Nag Hammadi library,https://login.wikimedia.org/wiki/Special:Centr...,Image related to Nag Hammadi library
13841,Nag Hammadi library,https:/static/images/footer/wikimedia-button.svg,Image related to Nag Hammadi library


In [11]:
# Handle missing values in the 'Content' column
df['cleaned_content'].fillna('', inplace=True)

# Clean the 'Content' column
def clean_text(text):
    if not isinstance(text, str):  # Ensure the text is a string
        return ''
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove non-alphanumeric characters
    return text

df['cleaned_content'] = df['cleaned_content'].apply(clean_text)

# Define celestial body types and keywords
celestial_types = {
    'Planet': ['planet', 'jupiter', 'earth', 'mars', 'venus', 'saturn', 'uranus', 'neptune', 'dwarf planet'],
    'Star': ['star', 'sun', 'nova', 'supernova', 'neutron star', 'red giant', 'pulsar', 'white dwarf'],
    'Moon': ['moon', 'satellite', 'luna', 'natural satellite'],
    'Asteroid': ['asteroid', 'comet', 'meteoroid', 'meteor', 'meteorite'],
    'Galaxy': ['galaxy', 'milky way', 'andromeda', 'spiral galaxy', 'elliptical galaxy', 'irregular galaxy'],
    'Nebula': ['nebula', 'emission nebula', 'reflection nebula', 'planetary nebula', 'dark nebula'],
    'Black Hole': ['black hole', 'event horizon', 'singularity'],
    'Constellation': ['constellation', 'zodiac'],
    'Exoplanet': ['exoplanet', 'extrasolar planet'],
    'Cosmic Structure': ['dark matter', 'dark energy', 'cosmos', 'universe', 'kuiper belt', 'oort cloud'],
    'Spacecraft': ['telescope', 'spacecraft', 'probe', 'rover', 'satellite'],
    'Astronomy Tools': ['astronomy', 'astrophysics', 'space exploration', 'observatory'],
    'Eclipse': ['eclipse', 'solar eclipse', 'lunar eclipse'],
    'Cosmology': ['cosmology', 'big bang', 'gravitational wave'],
}

# Classify celestial body types
def classify_celestial_body(title, content):
    title = title.lower()
    content = content.lower()
    
    for celestial_type, keywords in celestial_types.items():
        if any(keyword in title for keyword in keywords) or any(keyword in content for keyword in keywords):
            return celestial_type
    return 'Unknown'

df['Type'] = df.apply(lambda row: classify_celestial_body(row['title'], row['cleaned_content']), axis=1)

# Remove rows with 'Unknown' Type if needed
df = df[df['Type'] != 'Unknown']

# Save the cleaned dataset to a CSV file
df.to_csv('data_celestial_bodies2.csv', index=False)

In [13]:
df

Unnamed: 0,title,url,cleaned_content,number_of_words_content,Type
0,Cosmology,https://en.wikipedia.org/?curid=1864889,Cosmology from Ancient Greek cosmos the unive...,1369,Planet
1,Buddhist cosmology,https://en.wikipedia.org/?curid=5264082,Buddhist cosmology is the description of the s...,5001,Planet
2,Physical cosmology,https://en.wikipedia.org/?curid=5378,Physical cosmology is a branch of cosmology co...,3856,Planet
3,Religious cosmology,https://en.wikipedia.org/?curid=977209,Religious cosmology is an explanation of the o...,1703,Planet
4,Biblical cosmology,https://en.wikipedia.org/?curid=307968,Biblical cosmology is the account of the unive...,3380,Planet
...,...,...,...,...,...
494,Solar deity,https://en.wikipedia.org/?curid=27560,A solar deity or sun deity is a deity who repr...,4843,Planet
495,Bel (mythology),https://en.wikipedia.org/?curid=93805,Bl bel from Akkadian blu is a title signifying...,320,Planet
496,Traditional African religions,https://en.wikipedia.org/?curid=4655918,The beliefs and practices of African people ar...,2819,Star
497,Asgard,https://en.wikipedia.org/?curid=1460,In Nordic mythology Asgard Old Norse sgarr enc...,1657,Planet


In [17]:
# Function to check if a URL is valid using urllib
def is_valid_url(url):
    try:
        result = urllib.parse.urlparse(url)
        # Check if the URL has a valid scheme (http, https,...etc.) and netloc (domain)
        return all([result.scheme, result.netloc])
    except Exception as e:
        return False

# Apply the URL validation function to the 'URL' column
df['Is_Valid_URL'] = df['url'].apply(is_valid_url)

# Filter out invalid URLs
df = df[df['Is_Valid_URL'] == True]


In [19]:
df

Unnamed: 0,title,url,cleaned_content,number_of_words_content,Type,Is_Valid_URL
0,Cosmology,https://en.wikipedia.org/?curid=1864889,Cosmology from Ancient Greek cosmos the unive...,1369,Planet,True
1,Buddhist cosmology,https://en.wikipedia.org/?curid=5264082,Buddhist cosmology is the description of the s...,5001,Planet,True
2,Physical cosmology,https://en.wikipedia.org/?curid=5378,Physical cosmology is a branch of cosmology co...,3856,Planet,True
3,Religious cosmology,https://en.wikipedia.org/?curid=977209,Religious cosmology is an explanation of the o...,1703,Planet,True
4,Biblical cosmology,https://en.wikipedia.org/?curid=307968,Biblical cosmology is the account of the unive...,3380,Planet,True
...,...,...,...,...,...,...
494,Solar deity,https://en.wikipedia.org/?curid=27560,A solar deity or sun deity is a deity who repr...,4843,Planet,True
495,Bel (mythology),https://en.wikipedia.org/?curid=93805,Bl bel from Akkadian blu is a title signifying...,320,Planet,True
496,Traditional African religions,https://en.wikipedia.org/?curid=4655918,The beliefs and practices of African people ar...,2819,Star,True
497,Asgard,https://en.wikipedia.org/?curid=1460,In Nordic mythology Asgard Old Norse sgarr enc...,1657,Planet,True


In [21]:
encoder = LabelEncoder()
df['Type_Encoded'] = encoder.fit_transform(df['Type'])

In [25]:
df

Unnamed: 0,title,url,cleaned_content,number_of_words_content,Type,Is_Valid_URL,Type_Encoded
0,Cosmology,https://en.wikipedia.org/?curid=1864889,Cosmology from Ancient Greek cosmos the unive...,1369,Planet,True,6
1,Buddhist cosmology,https://en.wikipedia.org/?curid=5264082,Buddhist cosmology is the description of the s...,5001,Planet,True,6
2,Physical cosmology,https://en.wikipedia.org/?curid=5378,Physical cosmology is a branch of cosmology co...,3856,Planet,True,6
3,Religious cosmology,https://en.wikipedia.org/?curid=977209,Religious cosmology is an explanation of the o...,1703,Planet,True,6
4,Biblical cosmology,https://en.wikipedia.org/?curid=307968,Biblical cosmology is the account of the unive...,3380,Planet,True,6
...,...,...,...,...,...,...,...
494,Solar deity,https://en.wikipedia.org/?curid=27560,A solar deity or sun deity is a deity who repr...,4843,Planet,True,6
495,Bel (mythology),https://en.wikipedia.org/?curid=93805,Bl bel from Akkadian blu is a title signifying...,320,Planet,True,6
496,Traditional African religions,https://en.wikipedia.org/?curid=4655918,The beliefs and practices of African people ar...,2819,Star,True,8
497,Asgard,https://en.wikipedia.org/?curid=1460,In Nordic mythology Asgard Old Norse sgarr enc...,1657,Planet,True,6


In [33]:
# Rename and reorder the columns
df = df.rename(columns={
    'title': 'Title',
    'url': 'URL',
    'cleaned_content': 'Cleaned_Content',
    'number_of_words_content': 'Word_Count',
    'Type': 'Type',
    'Is_Valid_URL': 'Is_Valid_URL',
    'Type_Encoded': 'Type_Encoded'
})[["Title", "URL", "Cleaned_Content", "Type", "Word_Count", "Type_Encoded", "Is_Valid_URL"]]

# Display the updated DataFrame
print(df.head())


                 Title                                      URL  \
0            Cosmology  https://en.wikipedia.org/?curid=1864889   
1   Buddhist cosmology  https://en.wikipedia.org/?curid=5264082   
2   Physical cosmology     https://en.wikipedia.org/?curid=5378   
3  Religious cosmology   https://en.wikipedia.org/?curid=977209   
4   Biblical cosmology   https://en.wikipedia.org/?curid=307968   

                                     Cleaned_Content    Type  Word_Count  \
0  Cosmology from Ancient Greek  cosmos the unive...  Planet        1369   
1  Buddhist cosmology is the description of the s...  Planet        5001   
2  Physical cosmology is a branch of cosmology co...  Planet        3856   
3  Religious cosmology is an explanation of the o...  Planet        1703   
4  Biblical cosmology is the account of the unive...  Planet        3380   

   Type_Encoded  Is_Valid_URL  
0             6          True  
1             6          True  
2             6          True  
3           

In [31]:
print(df.columns)


Index(['title', 'url', 'cleaned_content', 'number_of_words_content', 'Type',
       'Is_Valid_URL', 'Type_Encoded'],
      dtype='object')


In [35]:
df.to_csv('data_celestial_bodies2.csv', index=False)

In [37]:
# Load your cleaned dataset
df3 = pd.read_csv('data_celestial_bodies2.csv')

In [39]:
df3

Unnamed: 0,Title,URL,Cleaned_Content,Type,Word_Count,Type_Encoded,Is_Valid_URL
0,Cosmology,https://en.wikipedia.org/?curid=1864889,Cosmology from Ancient Greek cosmos the unive...,Planet,1369,6,True
1,Buddhist cosmology,https://en.wikipedia.org/?curid=5264082,Buddhist cosmology is the description of the s...,Planet,5001,6,True
2,Physical cosmology,https://en.wikipedia.org/?curid=5378,Physical cosmology is a branch of cosmology co...,Planet,3856,6,True
3,Religious cosmology,https://en.wikipedia.org/?curid=977209,Religious cosmology is an explanation of the o...,Planet,1703,6,True
4,Biblical cosmology,https://en.wikipedia.org/?curid=307968,Biblical cosmology is the account of the unive...,Planet,3380,6,True
...,...,...,...,...,...,...,...
482,Solar deity,https://en.wikipedia.org/?curid=27560,A solar deity or sun deity is a deity who repr...,Planet,4843,6,True
483,Bel (mythology),https://en.wikipedia.org/?curid=93805,Bl bel from Akkadian blu is a title signifying...,Planet,320,6,True
484,Traditional African religions,https://en.wikipedia.org/?curid=4655918,The beliefs and practices of African people ar...,Star,2819,8,True
485,Asgard,https://en.wikipedia.org/?curid=1460,In Nordic mythology Asgard Old Norse sgarr enc...,Planet,1657,6,True
